diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 315a389339a..b0f367e1f87 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -18,6 +18,8 @@ ENV DEFAULT_CONDA_ENV=rapids FROM ${PYTHON_PACKAGE_MANAGER}-base +ARG TARGETARCH + ARG CUDA ENV CUDAARCHS="RAPIDS" ENV CUDA_VERSION="${CUDA_VERSION:-${CUDA}}" @@ -29,8 +31,36 @@ ENV PYTHONSAFEPATH="1" ENV PYTHONUNBUFFERED="1" ENV PYTHONDONTWRITEBYTECODE="1" -ENV SCCACHE_REGION="us-east-2" -ENV SCCACHE_BUCKET="rapids-sccache-devs" -ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" ENV HISTFILE="/home/coder/.cache/._bash_history" ENV LIBCUDF_KERNEL_CACHE_PATH="/home/coder/cudf/cpp/build/${PYTHON_PACKAGE_MANAGER}/cuda-${CUDA_VERSION}/latest/jitify_cache" + +### +# sccache configuration +### +ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" +ENV SCCACHE_REGION="us-east-2" +ENV SCCACHE_BUCKET="rapids-sccache-devs" +# 2hr (1 minute longer than sccache-dist request timeout) +ENV SCCACHE_IDLE_TIMEOUT=7200 + +### +# sccache-dist configuration +### +# Enable sccache-dist by default +ENV DEVCONTAINER_UTILS_ENABLE_SCCACHE_DIST=1 +# Compile locally if max retries exceeded +ENV SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=true +# Retry transient errors 4 times (for a total of 5 attempts) +ENV SCCACHE_DIST_MAX_RETRIES=4 +ENV SCCACHE_DIST_CONNECT_TIMEOUT=30 +ENV SCCACHE_DIST_CONNECTION_POOL=false +# 1hr 59min (to accommodate debug builds) +ENV SCCACHE_DIST_REQUEST_TIMEOUT=7140 +ENV SCCACHE_DIST_KEEPALIVE_ENABLED=true +ENV SCCACHE_DIST_KEEPALIVE_INTERVAL=20 +ENV SCCACHE_DIST_KEEPALIVE_TIMEOUT=600 +ENV SCCACHE_DIST_URL="https://${TARGETARCH}.linux.sccache.rapids.nvidia.com" + +# Build as much in parallel as possible +ENV INFER_NUM_DEVICE_ARCHITECTURES=1 +ENV MAX_DEVICE_OBJ_TO_COMPILE_IN_PARALLEL=20 diff --git a/.devcontainer/cuda12.9-conda/devcontainer.json b/.devcontainer/cuda12.9-conda/devcontainer.json index 1e00021f0ed..9e5bc0306a3 100644 --- a/.devcontainer/cuda12.9-conda/devcontainer.json +++ b/.devcontainer/cuda12.9-conda/devcontainer.json @@ -5,19 +5,21 @@ "args": { "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:25.08-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.10-cpp-mambaforge" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.08-cuda12.9-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-conda", + "--ulimit", + "nofile=500000" ], "hostRequirements": { "gpu": "optional" }, "features": { - "ghcr.io/rapidsai/devcontainers/features/cuda:25.8": { + "ghcr.io/rapidsai/devcontainers/features/cuda:25.10": { "version": "12.9", "installCompilers": false, "installProfilers": true, @@ -38,7 +40,7 @@ "installnvJPEG": false, "pruneStaticLibs": true }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/cuda", diff --git a/.devcontainer/cuda12.9-pip/devcontainer.json b/.devcontainer/cuda12.9-pip/devcontainer.json index 0debf91e159..ea7d5a19515 100644 --- a/.devcontainer/cuda12.9-pip/devcontainer.json +++ b/.devcontainer/cuda12.9-pip/devcontainer.json @@ -5,19 +5,21 @@ "args": { "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.08-cpp-cuda12.9-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.10-cpp-cuda12.9" } }, "runArgs": [ "--rm", 
"--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.08-cuda12.9-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-pip", + "--ulimit", + "nofile=500000" ], "hostRequirements": { "gpu": "optional" }, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda13.0-conda/devcontainer.json b/.devcontainer/cuda13.0-conda/devcontainer.json new file mode 100644 index 00000000000..f236ef00da3 --- /dev/null +++ b/.devcontainer/cuda13.0-conda/devcontainer.json @@ -0,0 +1,76 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "13.0", + "PYTHON_PACKAGE_MANAGER": "conda", + "BASE": "rapidsai/devcontainers:25.10-cpp-mambaforge" + } + }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda13.0-conda", + "--ulimit", + "nofile=500000" + ], + "hostRequirements": { + "gpu": "optional" + }, + "features": { + "ghcr.io/rapidsai/devcontainers/features/cuda:25.10": { + "version": "13.0", + "installCompilers": false, + "installProfilers": true, + "installDevPackages": false, + "installcuDNN": false, + "installcuTensor": false, + "installNCCL": false, + "installCUDARuntime": false, + "installNVRTC": false, + "installOpenCL": false, + "installcuBLAS": false, + "installcuSPARSE": false, + "installcuFFT": false, + "installcuFile": false, + "installcuRAND": false, + "installcuSOLVER": false, + "installNPP": false, + "installnvJPEG": false, + "pruneStaticLibs": true + }, + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/cuda", + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda13.0-envs}" + ], + "postAttachCommand": [ + "/bin/bash", + "-c", + "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi" + ], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda13.0-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.devcontainer/cuda13.0-pip/devcontainer.json b/.devcontainer/cuda13.0-pip/devcontainer.json new file mode 100644 index 00000000000..c6c0f0c2230 --- /dev/null +++ b/.devcontainer/cuda13.0-pip/devcontainer.json @@ -0,0 +1,53 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "13.0", + "PYTHON_PACKAGE_MANAGER": "pip", + "BASE": "rapidsai/devcontainers:25.10-cpp-cuda13.0" + } + }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda13.0-pip", + "--ulimit", + "nofile=500000" + ], + "hostRequirements": { + "gpu": "optional" + }, + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda13.0-venvs}" + ], + "postAttachCommand": [ + "/bin/bash", + "-c", + "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi" + ], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda13.0-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.github/labeler.yml b/.github/labeler.yml index 63ef619b64e..a59b05cab34 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -1,26 +1,34 @@ # Documentation for config - https://github.com/actions/labeler#common-examples Python: - - 'python/**' - - 'notebooks/**' - +- changed-files: + - any-glob-to-any-file: + - 'python/**' + - 'notebooks/**' cudf.pandas: - - 'python/cudf/cudf/pandas/**' - - 'python/cudf/cudf_pandas_tests/**' - +- changed-files: + - any-glob-to-any-file: + - 'python/cudf/cudf/pandas/**' + - 'python/cudf/cudf_pandas_tests/**' cudf-polars: - - 'python/cudf_polars/**' - +- changed-files: + - any-glob-to-any-file: + - 'python/cudf_polars/**' pylibcudf: - - 'python/pylibcudf/**' - +- changed-files: + - any-glob-to-any-file: + - 'python/pylibcudf/**' libcudf: - - 'cpp/**' - +- changed-files: + - any-glob-to-any-file: + - 'cpp/**' CMake: - - '**/CMakeLists.txt' - - '**/cmake/**' - - '**/*.cmake' - +- changed-files: + - any-glob-to-any-file: + - '**/CMakeLists.txt' + - '**/cmake/**' + - '**/*.cmake' Java: - - 'java/**' +- changed-files: + - any-glob-to-any-file: + - 'java/**' diff --git a/.github/release.yml b/.github/release.yml new file mode 100644 index 00000000000..2c9a85805c4 --- /dev/null +++ b/.github/release.yml @@ -0,0 +1,27 @@ +# GitHub Auto-Generated Release Notes Configuration for RAPIDS +# This file configures how GitHub automatically generates release notes + +changelog: + exclude: + labels: + - ignore-for-release + - dependencies + authors: + - rapids-bot[bot] + - dependabot[bot] + categories: + - title: 🚨 Breaking Changes + labels: + - breaking + - title: 🐛 Bug Fixes + labels: + - bug + - title: 📖 Documentation + labels: + - doc + - title: 🚀 New Features + labels: + - feature request + - title: 🛠️ Improvements + labels: + - improvement diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 6a52c49a128..d9bb501c968 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -46,7 +46,7 @@ jobs: cpp-build: needs: [telemetry-setup] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: python-build: needs: [telemetry-setup, cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: upload-conda: needs: [cpp-build, 
python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -77,12 +77,12 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci-conda:25.08-latest" + container_image: "rapidsai/ci-conda:25.10-latest" date: ${{ inputs.date }} node_type: "gpu-l4-latest-1" script: "ci/build_docs.sh" @@ -90,7 +90,7 @@ jobs: wheel-build-libcudf: needs: [telemetry-setup] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -105,7 +105,7 @@ jobs: wheel-publish-libcudf: needs: wheel-build-libcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -116,7 +116,7 @@ jobs: wheel-build-pylibcudf: needs: [telemetry-setup, wheel-build-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -128,7 +128,7 @@ jobs: wheel-publish-pylibcudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -139,7 +139,7 @@ jobs: wheel-build-cudf: needs: [telemetry-setup, wheel-build-pylibcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -151,7 +151,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -162,7 +162,7 @@ jobs: wheel-build-dask-cudf: needs: [telemetry-setup, wheel-build-cudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -177,7 +177,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -188,7 +188,7 @@ jobs: wheel-build-cudf-polars: needs: [telemetry-setup, wheel-build-pylibcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -203,7 +203,7 @@ jobs: wheel-publish-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index acfefc5e4af..1ce0448d231 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -5,6 +5,9 @@ on: jobs: triage: + permissions: + contents: read + pull-requests: write runs-on: ubuntu-latest steps: - name: Checkout code @@ -13,6 +16,6 @@ jobs: persist-credentials: false sparse-checkout: .github/labeler.yml sparse-checkout-cone-mode: false - - uses: actions/labeler@v4 + - uses: actions/labeler@v5 with: repo-token: "${{ secrets.GITHUB_TOKEN }}" diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index ef06159ab90..53004b6f0ce 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -22,11 +22,14 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: - matrix_filter: '[{"ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LINUX_VER": "ubuntu24.04", "GPU": "l4", "DRIVER": "latest", "DEPENDENCIES": "newest"}]' build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} + node_type: "gpu-l4-latest-1" + container_image: "rapidsai/citestwheel:25.10-latest" script: ci/cudf_pandas_scripts/pandas-tests/run.sh main + file_to_upload: ./main-results.json + artifact-name: main-results.json diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index befc06d7c98..d714bdf7c2e 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -38,12 +38,11 @@ jobs: - devcontainer - unit-tests-cudf-pandas - pandas-tests - - pandas-tests-diff - narwhals-tests - telemetry-setup - third-party-integration-tests-cudf-pandas secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.10 if: always() with: needs: ${{ toJSON(needs) }} @@ -68,7 +67,7 @@ jobs: changed-files: secrets: 
inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.10 with: files_yaml: | test_cpp: @@ -130,14 +129,14 @@ jobs: checks: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.10 with: enable_check_generated_files: false ignored_pr_jobs: "telemetry-summarize spark-rapids-jni" conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.10 with: build_type: pull-request node_type: "cpu16" @@ -145,7 +144,7 @@ jobs: cpp-linters: secrets: inherit needs: checks - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: pull-request script: "ci/cpp_linters.sh" @@ -153,13 +152,13 @@ jobs: conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.10 with: build_type: pull-request conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request @@ -167,14 +166,14 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.10 with: build_type: pull-request script: ci/build_python.sh conda-python-cudf-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -183,7 +182,7 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -191,39 +190,39 @@ jobs: conda-java-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_java with: build_type: pull-request node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:25.08-latest" + container_image: "rapidsai/ci-conda:25.10-latest" script: "ci/test_java.sh" conda-notebook-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_notebooks with: build_type: pull-request node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:25.08-latest" + container_image: "rapidsai/ci-conda:25.10-latest" script: "ci/test_notebooks.sh" docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: pull-request node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:25.08-latest" + container_image: "rapidsai/ci-conda:25.10-latest" script: "ci/build_docs.sh" wheel-build-libcudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -235,7 +234,7 @@ jobs: wheel-build-pylibcudf: needs: [checks, wheel-build-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: build_type: pull-request script: "ci/build_wheel_pylibcudf.sh" @@ -244,7 +243,7 @@ jobs: wheel-build-cudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" @@ -253,7 +252,7 @@ jobs: wheel-tests-cudf: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -261,7 +260,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -273,7 +272,7 @@ jobs: wheel-tests-cudf-polars: needs: [wheel-build-cudf-polars, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
@@ -281,9 +280,10 @@ jobs: build_type: pull-request script: "ci/test_wheel_cudf_polars.sh" cudf-polars-polars-tests: - needs: wheel-build-cudf-polars + needs: [wheel-build-cudf-polars, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -292,7 +292,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -304,7 +304,7 @@ jobs: wheel-tests-dask-cudf: needs: [wheel-build-dask-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -314,19 +314,25 @@ jobs: devcontainer: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.10 with: - node_type: "cpu32" - arch: '["amd64"]' - cuda: '["12.9"]' + arch: '["amd64", "arm64"]' + cuda: '["13.0"]' + node_type: "cpu8" + rapids-aux-secret-1: GIST_REPO_READ_ORG_GITHUB_TOKEN + env: | + SCCACHE_DIST_MAX_RETRIES=inf + SCCACHE_SERVER_LOG=sccache=debug + SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=false + SCCACHE_DIST_AUTH_TOKEN_VAR=RAPIDS_AUX_SECRET_1 build_command: | - sccache -z; - build-all -DBUILD_BENCHMARKS=ON --verbose; - sccache -s; + sccache --zero-stats; + build-all -j0 -DBUILD_BENCHMARKS=ON --verbose 2>&1 | tee telemetry-artifacts/build.log; + sccache --show-adv-stats | tee telemetry-artifacts/sccache-stats.txt; unit-tests-cudf-pandas: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
@@ -334,9 +340,10 @@ jobs: build_type: pull-request script: ci/cudf_pandas_scripts/run_tests.sh third-party-integration-tests-cudf-pandas: - needs: conda-python-build + needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: build_type: pull-request branch: ${{ inputs.branch }} @@ -344,33 +351,28 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-l4-latest-1" continue-on-error: true - container_image: "rapidsai/ci-conda:cuda12.9.0-ubuntu24.04-py3.12" + # TODO: Switch to ci-conda:25.10-latest when XGBoost has CUDA 13 packages + container_image: "rapidsai/ci-conda:25.10-cuda12.9.1-ubuntu24.04-py3.13" script: | ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml pandas-tests: # run the Pandas unit tests using PR branch needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: - matrix_filter: '[{"ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LINUX_VER": "ubuntu24.04", "GPU": "l4", "DRIVER": "latest", "DEPENDENCIES": "newest"}]' build_type: pull-request + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + node_type: "gpu-l4-latest-1" + container_image: "rapidsai/citestwheel:25.10-latest" script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr - # Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit.
- test_summary_show: "none" - pandas-tests-diff: - # diff the results of running the Pandas unit tests and publish a job summary - needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 - with: - node_type: "cpu4" - build_type: pull-request - script: "ci/cudf_pandas_scripts/pandas-tests/diff.sh" narwhals-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -378,7 +380,7 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} node_type: "gpu-l4-latest-1" - container_image: "rapidsai/ci-conda:25.08-latest" + container_image: "rapidsai/ci-conda:25.10-latest" script: ci/test_narwhals.sh spark-rapids-jni: needs: changed-files diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index 46973456a90..148d83e73d6 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -23,7 +23,7 @@ on: jobs: get-project-id: - uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-25.10 if: github.event.pull_request.state == 'open' secrets: inherit permissions: @@ -34,7 +34,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-25.10 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-25.10 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -79,7 +79,7 @@ jobs: update-release: # This job sets the PR and its linked issues to the release they are targeting - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-25.10 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: [get-project-id, process-branch-name] with: diff --git a/.github/workflows/spark-rapids-jni.yaml b/.github/workflows/spark-rapids-jni.yaml index 832c749874c..f4c168aff1a 100644 --- a/.github/workflows/spark-rapids-jni.yaml +++ b/.github/workflows/spark-rapids-jni.yaml @@ -13,10 +13,11 @@ jobs: with: repository: NVIDIA/spark-rapids-jni submodules: recursive + ref: ${{ github.event.pull_request.base.ref }} - uses: actions/checkout@v4 with: path: thirdparty/cudf - name: "Build spark-rapids-jni" run: | mkdir target - CMAKE_CUDA_ARCHITECTURES=90 LIBCUDF_DEPENDENCY_MODE=latest USE_GDS=on scl enable gcc-toolset-11 build/buildcpp.sh + source build/env.sh && 
CMAKE_CUDA_ARCHITECTURES=90 LIBCUDF_DEPENDENCY_MODE=latest USE_GDS=on ${sclCMD} build/buildcpp.sh diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a0f8176a812..d3a800ccdb3 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -24,7 +24,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -32,7 +32,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -41,7 +41,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -49,11 +49,11 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:25.08-latest" + container_image: "rapidsai/ci-conda:25.10-latest" script: "ci/test_cpp_memcheck.sh" cpp-linters: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -63,7 +63,7 @@ jobs: file_to_upload: iwyu_results.txt conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -73,7 +73,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -82,7 +82,7 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -90,11 +90,11 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:25.08-latest" + container_image: "rapidsai/ci-conda:25.10-latest" script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -102,11 +102,11 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:25.08-latest" + container_image: "rapidsai/ci-conda:25.10-latest" script: "ci/test_notebooks.sh" 
wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -115,7 +115,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -124,7 +124,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -133,19 +133,20 @@ jobs: script: ci/cudf_pandas_scripts/run_tests.sh third-party-integration-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} node_type: "gpu-l4-latest-1" - container_image: "rapidsai/ci-conda:cuda12.9.0-ubuntu24.04-py3.12" + # TODO: Switch to ci-conda:25.10-latest when XGBoost has CUDA 13 packages + container_image: "rapidsai/ci-conda:25.10-cuda12.9.1-ubuntu24.04-py3.13" script: | ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml wheel-tests-cudf-polars: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -154,7 +155,7 @@ jobs: script: "ci/test_wheel_cudf_polars.sh" cudf-polars-polars-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -163,12 +164,12 @@ jobs: script: "ci/test_cudf_polars_polars_tests.sh" narwhals-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} node_type: "gpu-l4-latest-1" - container_image: "rapidsai/ci-conda:25.08-latest" + container_image: "rapidsai/ci-conda:25.10-latest" script: ci/test_narwhals.sh diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 593fcb1086a..48bf37afc40 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.10 with: sender_login: ${{ github.event.sender.login
}} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1483a268ba3..e66e866c4f4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -63,7 +63,7 @@ repos: ^cpp/src/io/parquet/ipc/Schema_generated.h| ^cpp/src/io/parquet/ipc/Message_generated.h| ^cpp/include/cudf_test/cxxopts.hpp| - ^python/cudf/cudf/tests/data/vocab.txt| + ^python/cudf/cudf/tests/data/text/vocab.txt| ^python/cudf/cudf/tests/text/test_text_methods.py ) - repo: local @@ -158,10 +158,10 @@ repos: - id: verify-codeowners args: [--fix, --project-prefix=cudf] - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.19.1 + rev: v1.20.0 hooks: - id: rapids-dependency-file-generator - args: ["--clean"] + args: ["--clean", "--warn-all", "--strict"] - repo: https://github.com/shellcheck-py/shellcheck-py rev: v0.10.0.1 hooks: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ab7a5731b69..35a896559cd 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -121,7 +121,7 @@ Instructions for a minimal build environment without conda are included below. # create the conda environment (assuming in base `cudf` directory) # note: RAPIDS currently doesn't support `channel_priority: strict`; # use `channel_priority: flexible` instead -conda env create --name cudf_dev --file conda/environments/all_cuda-129_arch-x86_64.yaml +conda env create --name cudf_dev --file conda/environments/all_cuda-130_arch-x86_64.yaml # activate the environment conda activate cudf_dev ``` diff --git a/RAPIDS_BRANCH b/RAPIDS_BRANCH new file mode 100644 index 00000000000..9b1c52d9415 --- /dev/null +++ b/RAPIDS_BRANCH @@ -0,0 +1 @@ +branch-25.10 diff --git a/README.md b/README.md index d5c5782882f..538a1b6d344 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,10 @@ Be sure to select the appropriate cuDF package depending on the major version of CUDA available in your environment: ```bash +# CUDA 13 +pip install cudf-cu13 + +# CUDA 12 pip install cudf-cu12 ``` @@ -73,7 +77,11 @@ pip install cudf-cu12 cuDF can be installed with conda (via [miniforge](https://github.com/conda-forge/miniforge)) from the `rapidsai` channel: ```bash -conda install -c rapidsai -c conda-forge cudf=25.08 +# CUDA 13 +conda install -c rapidsai -c conda-forge cudf=25.10 cuda-version=13.0 + +# CUDA 12 +conda install -c rapidsai -c conda-forge cudf=25.10 cuda-version=12.9 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/VERSION b/VERSION index 3af4bda0205..296e35288d1 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -25.08.00 +25.10.00 diff --git a/build.sh b/build.sh index d4443695347..28a3a30738c 100755 --- a/build.sh +++ b/build.sh @@ -73,7 +73,6 @@ BUILD_PER_THREAD_DEFAULT_STREAM=OFF BUILD_REPORT_METRICS=OFF BUILD_REPORT_INCL_CACHE_STATS=OFF BUILD_DISABLE_LARGE_STRINGS=OFF -USE_PROPRIETARY_NVCOMP=ON PYTHON_ARGS_FOR_INSTALL=("-m" "pip" "install" "--no-build-isolation" "--no-deps" "--config-settings" "rapidsai.disable-cuda=true") # Set defaults for vars that may not have been defined externally @@ -153,7 +152,6 @@ function buildLibCudfJniInDocker { -DCMAKE_CUDA_ARCHITECTURES=${CUDF_CMAKE_CUDA_ARCHITECTURES} \ -DCMAKE_INSTALL_PREFIX=/usr/local/rapids \ -DUSE_NVTX=ON \ - -DCUDF_USE_PROPRIETARY_NVCOMP=ON \ -DCUDF_USE_ARROW_STATIC=ON \ -DCUDF_ENABLE_ARROW_S3=OFF \ -DBUILD_TESTS=OFF \ @@ -221,9 +219,6 @@ fi if hasArg --disable_nvtx; then BUILD_NVTX="OFF" fi -if hasArg --opensource_nvcomp; then - USE_PROPRIETARY_NVCOMP="OFF" -fi if hasArg 
--show_depr_warn; then BUILD_DISABLE_DEPRECATION_WARNINGS=OFF fi @@ -292,7 +287,6 @@ if buildAll || hasArg libcudf; then -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX}" \ -DCMAKE_CUDA_ARCHITECTURES="${CUDF_CMAKE_CUDA_ARCHITECTURES}" \ -DUSE_NVTX=${BUILD_NVTX} \ - -DCUDF_USE_PROPRIETARY_NVCOMP=${USE_PROPRIETARY_NVCOMP} \ -DBUILD_TESTS=${BUILD_TESTS} \ -DBUILD_BENCHMARKS=${BUILD_BENCHMARKS} \ -DDISABLE_DEPRECATION_WARNINGS=${BUILD_DISABLE_DEPRECATION_WARNINGS} \ diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index af289fe7229..8b75a01479f 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -24,7 +24,7 @@ echo "pylibcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo ${PYLIBCUDF_WHEELHOUSE} # repair wheels and write to the location that artifact-uploading code expects to find them python -m auditwheel repair \ --exclude libcudf.so \ - --exclude libnvcomp.so \ + --exclude libnvcomp.so.* \ --exclude libkvikio.so \ --exclude librapids_logger.so \ --exclude librmm.so \ diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index 768ee5c8c0b..ea2a818d97f 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -27,17 +27,13 @@ rapids-pip-retry install \ # 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735) export PIP_NO_BUILD_ISOLATION=0 -# TODO(nvcomp): when `nvcomp` supports Python 3.13 and we de-vendor `nvcomp` from `kvikio` -# this should be switched back to using the nvcomp runtime wheel -# https://github.com/rapidsai/build-planning/issues/171 -# export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" -export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_FROM_LIBKVIKIO_WHEEL=ON" +export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=OFF" ./ci/build_wheel.sh "${package_name}" "${package_dir}" # repair wheels and write to the location that artifact-uploading code expects to find them python -m auditwheel repair \ - --exclude libnvcomp.so.4 \ --exclude libkvikio.so \ + --exclude libnvcomp.so.5 \ --exclude librapids_logger.so \ --exclude librmm.so \ -w "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" \ diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh index 3f99f75ceb4..4c09752626f 100755 --- a/ci/build_wheel_pylibcudf.sh +++ b/ci/build_wheel_pylibcudf.sh @@ -22,7 +22,7 @@ echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo ${LIBCUDF_WHEELHOUSE}/lib # repair wheels and write to the location that artifact-uploading code expects to find them python -m auditwheel repair \ --exclude libcudf.so \ - --exclude libnvcomp.so \ + --exclude libnvcomp.so.* \ --exclude libkvikio.so \ --exclude librapids_logger.so \ --exclude librmm.so \ diff --git a/ci/cudf_pandas_scripts/pandas-tests/diff.sh b/ci/cudf_pandas_scripts/pandas-tests/diff.sh deleted file mode 100755 index aec1acd1539..00000000000 --- a/ci/cudf_pandas_scripts/pandas-tests/diff.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env bash -# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -# Download the summarized results of running the Pandas tests on both the main -# branch and the PR branch: - -# Hard-coded needs to match the version deduced by rapids-upload-artifacts-dir -GH_JOB_NAME="pandas-tests-diff / build" -RAPIDS_FULL_VERSION=$(<./VERSION) -rapids-logger "Github job name: ${GH_JOB_NAME}" -rapids-logger "Rapids version: ${RAPIDS_FULL_VERSION}" - -PY_VER="313" -PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-${RAPIDS_FULL_VERSION}-results.json - -rapids-logger "Fetching latest available results from nightly" -aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/cudf/" --query "sort_by(Contents[?ends_with(Key, '_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json')], &LastModified)[::].[Key]" --output text | tee s3_output.txt -COMPARE_ENV=$(tail -n 1 s3_output.txt) -rapids-logger "Latest available results from nightly: ${COMPARE_ENV}" - -aws s3 cp "s3://rapids-downloads/${COMPARE_ENV}" main-results.json -aws s3 cp "$PR_ARTIFACT" pr-results.json - -# Compute the diff and prepare job summary: -python -m pip install pandas tabulate -python ci/cudf_pandas_scripts/pandas-tests/job-summary.py main-results.json pr-results.json | tee summary.txt >> "$GITHUB_STEP_SUMMARY" - -COMMENT=$(head -1 summary.txt | grep -oP '\d+/\d+ \(\d+\.\d+%\).*?(a decrease by|an increase by) \d+\.\d+%') -echo "$COMMENT" -jq --arg COMMENT "$COMMENT" --arg GH_JOB_NAME "$GH_JOB_NAME" -n \ - '{"context": "Pandas tests", - "description": $COMMENT, - "state":"success", - "job_name": $GH_JOB_NAME}' \ - > gh-status.json diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py index af3e28f440f..60668280d7e 100644 --- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py +++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py @@ -12,41 +12,85 @@ def get_total_and_passed(results): total_failed = 0 total_errored = 0 total_passed = 0 + total_skipped = 0 + total_xfailed_by_cudf_pandas = 0 + total_skipped_by_cudf_pandas = 0 for module_name, row in results.items(): total_failed += row.get("failed", 0) total_errored += row.get("errored", 0) total_passed += row.get("passed", 0) - total_tests = total_failed + total_errored + total_passed - return total_tests, total_passed + total_skipped += row.get("skipped", 0) + total_xfailed_by_cudf_pandas += row.get("xfailed_by_cudf_pandas", 0) + total_skipped_by_cudf_pandas += row.get("skipped_by_cudf_pandas", 0) + total_tests = total_failed + total_errored + total_passed + total_skipped + return ( + total_tests, + total_passed, + total_xfailed_by_cudf_pandas, + total_skipped_by_cudf_pandas, + total_skipped, + ) main_json = sys.argv[1] pr_json = sys.argv[2] +branch_version = sys.argv[3] # read the results of summarize-test-results.py --summary with open(main_json) as f: main_results = json.load(f) -main_total, main_passed = get_total_and_passed(main_results) +( + main_total, + main_passed, + main_xfailed_by_cudf_pandas, + main_skipped_by_cudf_pandas, + main_skipped, +) = get_total_and_passed(main_results) with open(pr_json) as f: pr_results = json.load(f) -pr_total, pr_passed = get_total_and_passed(pr_results) +( + pr_total, + pr_passed, + pr_xfailed_by_cudf_pandas, + pr_skipped_by_cudf_pandas, + pr_skipped, +) = get_total_and_passed(pr_results) passing_percentage = pr_passed / pr_total * 100 -pass_rate_change = abs(pr_passed - main_passed) / main_passed * 100 -rate_change_type = "a decrease" if pr_passed < main_passed else "an increase" - -comment 
= ( - "Merging this PR would result in " - f"{pr_passed}/{pr_total} ({passing_percentage:.2f}%) " - "Pandas tests passing, " - f"{rate_change_type} by " - f"{pass_rate_change:.2f}%. " - f"Trunk stats: {main_passed}/{main_total}." + + +metrics_df = pd.DataFrame( + { + "This PR": [ + pr_total, + pr_passed, + pr_skipped_by_cudf_pandas, + pr_xfailed_by_cudf_pandas, + pr_skipped + - (pr_skipped_by_cudf_pandas + pr_xfailed_by_cudf_pandas), + ], + f"branch-{branch_version}": [ + main_total, + main_passed, + main_skipped_by_cudf_pandas, + main_xfailed_by_cudf_pandas, + main_skipped + - (main_skipped_by_cudf_pandas + main_xfailed_by_cudf_pandas), + ], + }, + index=[ + "Total tests", + "Passed tests", + "cudf.Pandas Skipped", + "cudf.Pandas xFailed", + "pandas skipped", + ], ) def emoji_passed(x): + """Format number with emoji: positive -> ✅, negative -> ❌""" if x > 0: return f"{x}✅" elif x < 0: @@ -56,6 +100,7 @@ def emoji_passed(x): def emoji_failed(x): + """Format number with emoji: positive -> ❌, negative -> ✅ (inverse of emoji_passed)""" if x > 0: return f"{x}❌" elif x < 0: @@ -67,6 +112,7 @@ def emoji_failed(x): # convert pr_results to a pandas DataFrame and then a markdown table pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index() main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index() +# Calculate CPU and GPU usage percentages for main branch total_usage = main_df["_slow_function_call"] + main_df["_fast_function_call"] main_df["CPU Usage"] = ( (main_df["_slow_function_call"] / total_usage) * 100.0 @@ -75,6 +121,7 @@ def emoji_failed(x): (main_df["_fast_function_call"] / total_usage) * 100.0 ).round(1) +# Calculate CPU and GPU usage percentages for PR total_usage = pr_df["_slow_function_call"] + pr_df["_fast_function_call"] pr_df["CPU Usage"] = ( (pr_df["_slow_function_call"] / total_usage) * 100.0 @@ -83,17 +130,20 @@ def emoji_failed(x): (pr_df["_fast_function_call"] / total_usage) * 100.0 ).round(1) +# Calculate average usages cpu_usage_mean = pr_df["CPU Usage"].mean().round(2) gpu_usage_mean = pr_df["GPU Usage"].mean().round(2) - -gpu_usage_rate_change = abs( +gpu_usage_rate_change = ( pr_df["GPU Usage"].mean() - main_df["GPU Usage"].mean() -) +).round(2) + +# Handle NaN values pr_df["CPU Usage"] = pr_df["CPU Usage"].fillna(0) pr_df["GPU Usage"] = pr_df["GPU Usage"].fillna(0) main_df["CPU Usage"] = main_df["CPU Usage"].fillna(0) main_df["GPU Usage"] = main_df["GPU Usage"].fillna(0) +# Calculate differences between PR and main diff_df = pr_df - main_df diff_df["CPU Usage"] = diff_df["CPU Usage"].round(1).fillna(0) diff_df["GPU Usage"] = diff_df["GPU Usage"].round(1).fillna(0) @@ -102,59 +152,51 @@ def emoji_failed(x): pr_df["CPU Usage"] = pr_df["CPU Usage"].astype(str) + "%" pr_df["GPU Usage"] = pr_df["GPU Usage"].astype(str) + "%" -pr_df = pr_df[ - ["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"] -] -diff_df = diff_df[ - ["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"] -] +# Select relevant columns +pr_df = pr_df[["total", "CPU Usage", "GPU Usage"]] +diff_df = diff_df[["total", "CPU Usage", "GPU Usage"]] + +# Rename diff columns to indicate they are differences diff_df.columns = diff_df.columns + "_diff" -diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed) -diff_df["failed_diff"] = diff_df["failed_diff"].map(emoji_failed) -diff_df["skipped_diff"] = diff_df["skipped_diff"].map(emoji_failed) +# Combine PR results with differences df = pd.concat([pr_df, diff_df], axis=1) df = df.rename_axis("Test 
module") +# Rename columns for better readability df = df.rename( columns={ "total": "Total tests", - "passed": "Passed tests", - "failed": "Failed tests", - "skipped": "Skipped tests", "total_diff": "Total delta", - "passed_diff": "Passed delta", - "failed_diff": "Failed delta", - "skipped_diff": "Skipped delta", "CPU Usage_diff": "CPU Usage delta", "GPU Usage_diff": "GPU Usage delta", } ) + +# Sort by CPU usage delta and total tests df = df.sort_values(by=["CPU Usage delta", "Total tests"], ascending=False) + +# Apply emoji formatting to usage deltas df["CPU Usage delta"] = df["CPU Usage delta"].map(emoji_failed) df["GPU Usage delta"] = df["GPU Usage delta"].map(emoji_passed) + +# Select final columns to display df = df[ [ "Total tests", "CPU Usage delta", "GPU Usage delta", - "Passed tests", - "Failed tests", - "Skipped tests", "CPU Usage", "GPU Usage", "Total delta", - "Passed delta", - "Failed delta", - "Skipped delta", ] ] -print(comment) +# Print summary and results +print(metrics_df.to_markdown()) print() print( - f"Average GPU usage: {gpu_usage_mean}% {'an increase' if gpu_usage_rate_change > 0 else 'a decrease'} by {gpu_usage_rate_change}%" + f"Average GPU usage: {gpu_usage_mean}% ({gpu_usage_rate_change:+.2f}% change from trunk)" ) -print() print(f"Average CPU usage: {cpu_usage_mean}%") print() print("Here are the results of running the Pandas tests against this PR:") diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh index 74d1fc4bdaf..53ddd00a55b 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/run.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh @@ -11,6 +11,9 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e +rapids-logger "Check GPU usage" +nvidia-smi + PANDAS_TESTS_BRANCH=${1} RAPIDS_FULL_VERSION=$(<./VERSION) rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch and rapids-version $RAPIDS_FULL_VERSION" @@ -33,8 +36,8 @@ RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" -bash python/cudf/cudf/pandas/scripts/run-pandas-tests.sh \ - --numprocesses 5 \ +timeout 90m bash python/cudf/cudf/pandas/scripts/run-pandas-tests.sh \ + --numprocesses 6 \ --tb=line \ -vv \ --disable-warnings \ @@ -44,12 +47,35 @@ bash python/cudf/cudf/pandas/scripts/run-pandas-tests.sh \ --dist worksteal \ --report-log="${PANDAS_TESTS_BRANCH}.json" 2>&1 -SUMMARY_FILE_NAME=${PANDAS_TESTS_BRANCH}-${RAPIDS_FULL_VERSION}-results.json +SUMMARY_FILE_NAME=${PANDAS_TESTS_BRANCH}-results.json # summarize the results and save them to artifacts: -python python/cudf/cudf/pandas/scripts/summarize-test-results.py --output json pandas-testing/"${PANDAS_TESTS_BRANCH}.json" > "pandas-testing/${SUMMARY_FILE_NAME}" -RAPIDS_ARTIFACTS_DIR=${RAPIDS_ARTIFACTS_DIR:-"${PWD}/artifacts"} -mkdir -p "${RAPIDS_ARTIFACTS_DIR}" -mv pandas-testing/"${SUMMARY_FILE_NAME}" "${RAPIDS_ARTIFACTS_DIR}"/ -rapids-upload-to-s3 "${RAPIDS_ARTIFACTS_DIR}"/"${SUMMARY_FILE_NAME}" "${RAPIDS_ARTIFACTS_DIR}" +python python/cudf/cudf/pandas/scripts/summarize-test-results.py --output json pandas-testing/"${PANDAS_TESTS_BRANCH}.json" > "./${SUMMARY_FILE_NAME}" + +# Exit early if running tests for main branch +if [[ "${PANDAS_TESTS_BRANCH}" == "main" ]]; then + rapids-logger "Exiting early for main branch testing: ${EXITCODE}" + exit ${EXITCODE} +fi + + +MAIN_RUN_ID=$( + gh run list \ + -w "Pandas Test Job" \ + -b branch-25.10 \ + --repo 'rapidsai/cudf' \ + --status success \ + --limit 7 \ + --json 
'createdAt,databaseId' \ + --jq 'sort_by(.createdAt) | reverse | .[0] | .databaseId' +) +rapids-logger "Fetching latest available results from nightly: ${MAIN_RUN_ID}" +gh run download \ + --repo 'rapidsai/cudf' \ + --name main-results.json \ + $MAIN_RUN_ID + +# Compute the diff and prepare job summary: +python ci/cudf_pandas_scripts/pandas-tests/job-summary.py main-results.json pr-results.json "${RAPIDS_FULL_VERSION}" >> "$GITHUB_STEP_SUMMARY" + rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index e953f7be090..439645a0add 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -116,7 +116,7 @@ python -m pytest -p cudf.pandas \ # More details: https://github.com/rapidsai/cudf/pull/16930#issuecomment-2707873968 python -m pytest -p cudf.pandas \ --ignore=./python/cudf/cudf_pandas_tests/third_party_integration_tests/ \ - --numprocesses=1 \ + --numprocesses=0 \ -k "profiler" \ ./python/cudf/cudf_pandas_tests/ @@ -133,15 +133,19 @@ for version in "${versions[@]}"; do --numprocesses=8 \ --dist=worksteal \ -k "not profiler" \ + -m "not serial" \ --cov-config=./python/cudf/.coveragerc \ --cov=cudf \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-pandas-coverage.xml" \ --cov-report=term \ ./python/cudf/cudf_pandas_tests/ + # NOTE: We don't currently run serial tests (only 1 as of 2025-07-25) + # with multiple versions of pandas. + python -m pytest -p cudf.pandas \ --ignore=./python/cudf/cudf_pandas_tests/third_party_integration_tests/ \ - --numprocesses=1 \ + --numprocesses=0 \ -k "profiler" \ ./python/cudf/cudf_pandas_tests/ done diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 0e3a2d471f5..9dfbf259d2c 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -87,6 +87,7 @@ for FILE in .github/workflows/*.yaml .github/workflows/*.yml; do done sed_runner "s/branch-[0-9]\+\.[0-9]\+/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_cudf_polars.sh sed_runner "s/branch-[0-9]\+\.[0-9]\+/branch-${NEXT_SHORT_TAG}/g" ci/test_cudf_polars_polars_tests.sh +sed_runner "s/branch-[0-9]\+\.[0-9]\+/branch-${NEXT_SHORT_TAG}/g" ci/cudf_pandas_scripts/pandas-tests/run.sh # Java files NEXT_FULL_JAVA_TAG="${NEXT_SHORT_TAG}.${PATCH_PEP440}-SNAPSHOT" diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh index 24de63ef45f..cc202fa9d56 100755 --- a/ci/run_cudf_examples.sh +++ b/ci/run_cudf_examples.sh @@ -9,14 +9,21 @@ trap "EXITCODE=1" ERR # Support customizing the examples' install location cd "${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/examples/libcudf/" || exit +cd basic || exit compute-sanitizer --tool memcheck basic_example +cd .. +cd nested_types || exit compute-sanitizer --tool memcheck deduplication +cd .. +cd strings || exit compute-sanitizer --tool memcheck custom_optimized names.csv compute-sanitizer --tool memcheck custom_prealloc names.csv compute-sanitizer --tool memcheck custom_with_malloc names.csv +cd .. 
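(Aside on the ci/cudf_pandas_scripts/pandas-tests/run.sh change above: the `gh run list ... --jq 'sort_by(.createdAt) | reverse | .[0] | .databaseId'` invocation just picks the database id of the most recently created successful "Pandas Test Job" run on branch-25.10. A minimal Python sketch of that same selection, assuming the JSON array returned by `gh run list --json createdAt,databaseId` has already been fetched; the run data below is made up for illustration.)

    # Hypothetical sample of what `gh run list --json createdAt,databaseId` could return.
    runs = [
        {"createdAt": "2025-08-01T06:00:00Z", "databaseId": 111},
        {"createdAt": "2025-08-03T06:00:00Z", "databaseId": 222},
        {"createdAt": "2025-08-02T06:00:00Z", "databaseId": 333},
    ]
    # Same idea as the jq filter: newest run wins. ISO-8601 UTC timestamps in a
    # uniform format sort correctly as plain strings, so max() on createdAt suffices.
    latest = max(runs, key=lambda r: r["createdAt"])
    main_run_id = latest["databaseId"]
    print(main_run_id)  # 222

(The selected id is then passed to `gh run download` to fetch main-results.json for the job summary.)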
+cd string_transformers || exit compute-sanitizer --tool memcheck compute_checksum_jit info.csv output.csv compute-sanitizer --tool memcheck extract_email_jit info.csv output.csv compute-sanitizer --tool memcheck extract_email_precompiled info.csv output.csv @@ -24,11 +31,14 @@ compute-sanitizer --tool memcheck format_phone_jit info.csv output.csv compute-sanitizer --tool memcheck format_phone_precompiled info.csv output.csv compute-sanitizer --tool memcheck localize_phone_jit info.csv output.csv compute-sanitizer --tool memcheck localize_phone_precompiled info.csv output.csv +cd .. +cd parquet_io || exit compute-sanitizer --tool memcheck parquet_io example.parquet compute-sanitizer --tool memcheck parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD TRUE compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet 4 DEVICE_BUFFER 2 2 +cd .. exit ${EXITCODE} diff --git a/ci/run_cudf_polars_pytests.sh b/ci/run_cudf_polars_pytests.sh index 4fdf7080c03..304573d6a85 100755 --- a/ci/run_cudf_polars_pytests.sh +++ b/ci/run_cudf_polars_pytests.sh @@ -12,7 +12,7 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cudf_polars/ python -m pytest --cache-clear "$@" tests --executor in-memory # Test the default "streaming" executor -python -m pytest --cache-clear "$@" tests +python -m pytest --cache-clear "$@" tests --executor streaming # Test the "streaming" executor with small blocksize python -m pytest --cache-clear "$@" tests --executor streaming --blocksize-mode small diff --git a/ci/test_narwhals.sh b/ci/test_narwhals.sh index 6af3be24b3b..bd5bd7e208d 100755 --- a/ci/test_narwhals.sh +++ b/ci/test_narwhals.sh @@ -27,45 +27,34 @@ rapids-pip-retry install -U -e . 
rapids-logger "Check narwhals versions" python -c "import narwhals; print(narwhals.show_versions())" -# test_horizontal_slice_with_series: xpassing in Narwhals, fixed in cuDF https://github.com/rapidsai/cudf/pull/18558 -# test_rolling_mean_expr_lazy_grouped: xpassing in Narwhals -# test_rolling_std_expr_lazy_grouped: xpassing in Narwhals -# test_rolling_sum_expr_lazy_grouped: xpassing in Narwhals -# test_rolling_var_expr_lazy_grouped: xpassing in Narwhals -TESTS_THAT_NEED_NARWHALS_FIX_FOR_CUDF="not test_rolling_mean_expr_lazy_grouped[cudf-expected_a4-3-1-True] \ -and not test_rolling_mean_expr_lazy_grouped[cudf-expected_a5-4-1-True] \ -and not test_rolling_mean_expr_lazy_grouped[cudf-expected_a6-5-1-True] \ -and not test_rolling_std_expr_lazy_grouped[cudf-expected_a4-3-1-True-1] \ -and not test_rolling_std_expr_lazy_grouped[cudf-expected_a5-4-1-True-1] \ -and not test_rolling_std_expr_lazy_grouped[cudf-expected_a6-5-1-True-0] \ -and not test_rolling_sum_expr_lazy_grouped[cudf-expected_a4-3-1-True] \ -and not test_rolling_sum_expr_lazy_grouped[cudf-expected_a5-4-1-True] \ -and not test_rolling_sum_expr_lazy_grouped[cudf-expected_a6-5-1-True] \ -and not test_rolling_var_expr_lazy_grouped[cudf-expected_a4-3-1-True-1] \ -and not test_rolling_var_expr_lazy_grouped[cudf-expected_a5-4-1-True-1] \ -and not test_rolling_var_expr_lazy_grouped[cudf-expected_a6-5-1-True-0] \ -and not test_horizontal_slice_with_series" +# test_to_numpy[cudf]: Passes as of https://github.com/rapidsai/cudf/pull/19923 +# test_fill_null_strategies_with_limit_as_none[cudf]: Narwhals passes inplace=None instead of a bool +# test_fill_null_series_limit_as_none[cudf]: Narwhals passes inplace=None instead of a bool +TESTS_THAT_NEED_NARWHALS_FIX_FOR_CUDF=" \ +test_to_numpy[cudf] or \ +test_fill_null_strategies_with_limit_as_none[cudf] or \ +test_fill_null_series_limit_as_none[cudf] \ +" rapids-logger "Run narwhals tests for cuDF" PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 python -m pytest \ --cache-clear \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-narwhals.xml" \ -p xdist \ -p env \ -p no:pytest_benchmark \ -p cudf.testing.narwhals_test_plugin \ - -k "$TESTS_THAT_NEED_NARWHALS_FIX_FOR_CUDF" \ + -k "not ( \ + ${TESTS_THAT_NEED_NARWHALS_FIX_FOR_CUDF} \ + )" \ --numprocesses=8 \ --dist=worksteal \ --constructors=cudf -# test_dtypes: With cudf.pandas loaded, to_pandas() preserves Arrow dtypes like list and struct, so pandas -# columns aren't object anymore. The test expects object, causing a mismatch. -# test_nan: Narwhals expect this test to fail, but as of polars 1.30 we raise a RuntimeError, -# not polars ComputeError. So the test is looking for the wrong error and fails. +# test_datetime[polars[lazy]]: Fixed in the next narwhals release >2.0.1 +# test_nan[polars[lazy]]: Passes as of https://github.com/rapidsai/cudf/pull/19742 TESTS_THAT_NEED_NARWHALS_FIX_FOR_CUDF_POLARS=" \ -test_dtypes or \ -test_nan \ +test_datetime[polars[lazy]] or \ +test_nan[polars[lazy]] \ " rapids-logger "Run narwhals tests for cuDF Polars" @@ -92,12 +81,23 @@ rapids-logger "Run narwhals tests for cuDF Pandas" # test_maybe_convert_dtypes_pandas: https://github.com/rapidsai/cudf/issues/14149 # test_log_dtype_pandas: cudf is promoting the type to float64 # test_len_over_2369: It fails during fallback. 
The error is 'DataFrame' object has no attribute 'to_frame' +# test_all_ignore_nulls, test_allh_kleene, and test_anyh_kleene: https://github.com/rapidsai/cudf/issues/19417 +# test_offset_by_date_pandas: https://github.com/rapidsai/cudf/issues/19418 +# test_select_boolean_cols and test_select_boolean_cols_multi_group_by: https://github.com/rapidsai/cudf/issues/19421 +# test_to_datetime_pd_preserves_pyarrow_backend_dtype: https://github.com/rapidsai/cudf/issues/19422 TESTS_THAT_NEED_CUDF_FIX=" \ test_is_finite_expr or \ test_is_finite_series or \ test_maybe_convert_dtypes_pandas or \ test_log_dtype_pandas or \ -test_len_over_2369 \ +test_len_over_2369 or \ +test_all_ignore_nulls or \ +test_allh_kleene or \ +test_anyh_kleene or \ +test_offset_by_date_pandas or \ +test_select_boolean_cols or \ +test_select_boolean_cols_multi_group_by or \ +test_to_datetime_pd_preserves_pyarrow_backend_dtype \ " # test_array_dunder_with_copy: https://github.com/rapidsai/cudf/issues/18248#issuecomment-2719234741 @@ -112,8 +112,13 @@ test_pandas_object_series \ # test_dtypes: With cudf.pandas loaded, to_pandas() preserves Arrow dtypes like list and struct, so pandas # columns aren't object anymore. The test expects object, causing a mismatch. +# test_get_dtype_backend: We now preserve arrow extension dtypes +# (e.g. bool[pyarrow], duration[ns][pyarrow]). +# test_explode_multiple_cols[pandas-l1-more_columns0-expected0] matches pandas now so needs a skip in the test TESTS_THAT_NEED_NARWHALS_FIX_FOR_CUDF_PANDAS=" \ -test_dtypes \ +test_dtypes or \ +test_explode_multiple_cols or \ +(test_get_dtype_backend and pyarrow and (pandas or modin)) \ " PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 NARWHALS_DEFAULT_CONSTRUCTORS=pandas python -m pytest \ diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh index d92654d35d0..6188fc82a54 100755 --- a/ci/test_python_cudf.sh +++ b/ci/test_python_cudf.sh @@ -43,6 +43,7 @@ rapids-logger "pytest cudf" rapids-logger "pytest for cudf benchmarks" ./ci/run_cudf_pytest_benchmarks.sh \ + --benchmark-disable \ --numprocesses=8 \ --dist=worksteal \ --cov-config=.coveragerc \ @@ -52,6 +53,7 @@ rapids-logger "pytest for cudf benchmarks" rapids-logger "pytest for cudf benchmarks using pandas" ./ci/run_cudf_pandas_pytest_benchmarks.sh \ + --benchmark-disable \ --numprocesses=8 \ --dist=worksteal \ --cov-config=.coveragerc \ diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index e28ac0514a7..9cbd237511d 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -12,11 +12,38 @@ CUDF_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-do LIBCUDF_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-github cpp) PYLIBCUDF_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-github python) -rapids-logger "Install cudf, pylibcudf, and test requirements" +rapids-logger "Install pylibcudf and its basic dependencies in a virtual environment" # generate constraints (possibly pinning to oldest support versions of dependencies) rapids-generate-pip-constraints py_test_cudf ./constraints.txt +RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ +mkdir -p "${RAPIDS_TESTS_DIR}" + +# To test pylibcudf without its optional dependencies, we create a virtual environment +python -m venv env +. 
env/bin/activate +rapids-pip-retry install \ + -v \ + --constraint ./constraints.txt \ + --constraint "${PIP_CONSTRAINT}" \ + "$(echo "${LIBCUDF_WHEELHOUSE}"/libcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" \ + "$(echo "${PYLIBCUDF_WHEELHOUSE}"/pylibcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test]" + +rapids-logger "pytest pylibcudf without optional dependencies" +pushd python/pylibcudf/tests +python -m pytest \ + --cache-clear \ + --numprocesses=8 \ + --dist=worksteal \ + . +popd + +deactivate + +rapids-logger "Install cudf, pylibcudf, and test requirements" + # notes: # # * echo to expand wildcard before adding `[test]` requires for pip @@ -29,12 +56,7 @@ rapids-pip-retry install \ --constraint "${PIP_CONSTRAINT}" \ "$(echo "${CUDF_WHEELHOUSE}"/cudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test]" \ "$(echo "${LIBCUDF_WHEELHOUSE}"/libcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" \ - "$(echo "${PYLIBCUDF_WHEELHOUSE}"/pylibcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test]" - -RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} -RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ -mkdir -p "${RAPIDS_TESTS_DIR}" - + "$(echo "${PYLIBCUDF_WHEELHOUSE}"/pylibcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test, pyarrow, numpy]" rapids-logger "pytest pylibcudf" pushd python/pylibcudf/tests diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index 5ec9558e0de..b2683eeadbc 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -35,6 +35,8 @@ rapids-pip-retry install \ rapids-logger "Run cudf_polars tests" +POLARS_VERSIONS=$(python ci/utils/fetch_polars_versions.py --latest-patch-only dependencies.yaml) + # shellcheck disable=SC2317 function set_exitcode() { @@ -44,16 +46,50 @@ EXITCODE=0 trap set_exitcode ERR set +e -./ci/run_cudf_polars_pytests.sh \ - --cov=cudf_polars \ - --cov-fail-under=100 \ - --cov-report=term-missing:skip-covered \ - --cov-config=./pyproject.toml \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-polars.xml" +PASSED=() +FAILED=() + +read -r -a VERSIONS <<< "${POLARS_VERSIONS}" +LATEST_VERSION="${VERSIONS[-1]}" + +for version in "${VERSIONS[@]}"; do + rapids-logger "Installing polars==${version}" + pip install -U "polars==${version}" + + rapids-logger "Running tests for polars==${version}" + + if [ "${version}" == "${LATEST_VERSION}" ]; then + COVERAGE_ARGS=( + --cov=cudf_polars + --cov-fail-under=100 + --cov-report=term-missing:skip-covered + --cov-config=./pyproject.toml + ) + else + COVERAGE_ARGS=(--no-cov) + fi + + ./ci/run_cudf_polars_pytests.sh \ + "${COVERAGE_ARGS[@]}" \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-polars-${version}.xml" + + if [ $? -ne 0 ]; then + EXITCODE=1 + FAILED+=("${version}") + rapids-logger "Tests failed for polars==${version}" + else + PASSED+=("${version}") + rapids-logger "Tests passed for polars==${version}" + fi +done trap ERR set -e +rapids-logger "Polars test summary:" +rapids-logger "PASSED: ${PASSED[*]:-none}" +rapids-logger "FAILED: ${FAILED[*]:-none}" + if [ ${EXITCODE} != 0 ]; then rapids-logger "Testing FAILED: exitcode ${EXITCODE}" else diff --git a/ci/utils/fetch_polars_versions.py b/ci/utils/fetch_polars_versions.py new file mode 100644 index 00000000000..643081c5642 --- /dev/null +++ b/ci/utils/fetch_polars_versions.py @@ -0,0 +1,93 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
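(For orientation before the body of the new ci/utils/fetch_polars_versions.py that follows: its --latest-patch-only mode keeps, for each polars minor series inside the supported specifier range, only the newest patch release, which is what the test_wheel_cudf_polars.sh loop above iterates over. A rough standalone sketch of that filtering; the version list here is illustrative, not real PyPI data.)

    from packaging.version import Version

    # Illustrative candidate versions; the real script pulls these from PyPI
    # and pre-filters them against the specifier from dependencies.yaml.
    versions = [Version(v) for v in ["1.28.0", "1.28.1", "1.29.0", "1.30.0", "1.30.2"]]

    # Keep only the newest patch per (major, minor) pair.
    latest_per_minor = {}
    for v in versions:
        key = (v.major, v.minor)
        if key not in latest_per_minor or v > latest_per_minor[key]:
            latest_per_minor[key] = v

    print(sorted(latest_per_minor.values()))  # [1.28.1, 1.29.0, 1.30.2]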
+ +import argparse +import json +import ssl +import urllib.request + +import certifi +import yaml +from packaging.specifiers import SpecifierSet +from packaging.version import Version + + +def get_polars_specifier(deps_yaml_path): + with open(deps_yaml_path, "r") as f: + deps = yaml.safe_load(f) + + try: + includes = deps["files"]["all"]["includes"] + if "run_cudf_polars" not in includes: + raise KeyError() + except KeyError: + raise RuntimeError("run_cudf_polars not found in dependencies.yaml") + + try: + pkgs = deps["dependencies"]["run_cudf_polars"]["common"] + for entry in pkgs: + for pkg in entry.get("packages", []): + if isinstance(pkg, str) and pkg.startswith("polars"): + spec = pkg.removeprefix("polars").strip() + if spec: + return spec + except KeyError: + pass + + raise RuntimeError("Polars specifier not found in dependencies.yaml") + + +def get_latest_versions_per_minor(versions): + latest = {} + for v in versions: + key = (v.major, v.minor) + if key not in latest or v > latest[key]: + latest[key] = v + return sorted(latest.values()) + + +def get_polars_versions(polars_range, latest_only=False): + url = "https://pypi.org/pypi/polars/json" + # Set a timeout for the request to avoid hanging + timeout = 10 # seconds + + try: + context = ssl.create_default_context(cafile=certifi.where()) + with urllib.request.urlopen( + url, timeout=timeout, context=context + ) as response: + data = json.loads(response.read()) + except Exception as e: + raise RuntimeError(f"Failed to fetch polars metadata from PyPI: {e}") + + all_versions = [Version(v) for v in data["releases"]] + specifier = SpecifierSet(polars_range) + matching = [v for v in all_versions if v in specifier] + + if latest_only: + matching = get_latest_versions_per_minor(matching) + + return [str(v) for v in sorted(matching)] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Filter polars versions by dependencies.yaml." + ) + parser.add_argument( + "deps_yaml", + nargs="?", + default="./dependencies.yaml", + help="Path to dependencies.yaml", + ) + parser.add_argument( + "--latest-patch-only", + action="store_true", + help="Return only the latest patch per minor version", + ) + args = parser.parse_args() + + polars_range = get_polars_specifier(args.deps_yaml) + versions = get_polars_versions( + polars_range, latest_only=args.latest_patch_only + ) + print(" ".join(versions)) diff --git a/cmake/RAPIDS.cmake b/cmake/RAPIDS.cmake index d112951d3c1..ddef819498d 100644 --- a/cmake/RAPIDS.cmake +++ b/cmake/RAPIDS.cmake @@ -18,9 +18,9 @@ cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR) # Allow users to control which version is used -if(NOT rapids-cmake-version OR NOT rapids-cmake-version MATCHES [[^([0-9][0-9])\.([0-9][0-9])$]]) +if(NOT (rapids-cmake-branch OR rapids-cmake-version)) message( - FATAL_ERROR "The CMake variable rapids-cmake-version must be defined in the format MAJOR.MINOR." 
+ FATAL_ERROR "The CMake variable `rapids-cmake-branch` or `rapids-cmake-version` must be defined" ) endif() @@ -33,7 +33,7 @@ endif() # Allow users to control which branch is fetched if(NOT rapids-cmake-branch) # Define a default branch if the user doesn't set one - set(rapids-cmake-branch "branch-${rapids-cmake-version}") + set(rapids-cmake-branch "release/${rapids-cmake-version}") endif() # Allow users to control the exact URL passed to FetchContent diff --git a/cmake/rapids_config.cmake b/cmake/rapids_config.cmake index abe468dce80..b2c54a3f27d 100644 --- a/cmake/rapids_config.cmake +++ b/cmake/rapids_config.cmake @@ -26,5 +26,19 @@ else() ) endif() -set(rapids-cmake-version "${RAPIDS_VERSION_MAJOR_MINOR}") +# Use STRINGS to trim whitespace/newlines +file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/../RAPIDS_BRANCH" _rapids_branch) +if(NOT _rapids_branch) + message( + FATAL_ERROR + "Could not determine branch name to use for checking out rapids-cmake. The file \"${CMAKE_CURRENT_LIST_DIR}/../RAPIDS_BRANCH\" is missing." + ) +endif() + +if(NOT rapids-cmake-version) + set(rapids-cmake-version "${RAPIDS_VERSION_MAJOR_MINOR}") +endif() +if(NOT rapids-cmake-branch) + set(rapids-cmake-branch "${_rapids_branch}") +endif() include("${CMAKE_CURRENT_LIST_DIR}/RAPIDS.cmake") diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 0f437d3d61d..7a603a29ec9 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -12,7 +12,6 @@ dependencies: - c-compiler - cachetools - certifi -- clang-tools=20.1.4 - clang-tools==20.1.4 - clang==20.1.4 - cmake>=3.30.4 @@ -21,29 +20,30 @@ dependencies: - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev -- cuda-python>=12.6.2,<13.0a0 +- cuda-python>=12.9.2,<13.0a0 - cuda-sanitizer-api - cuda-version=12.9 -- cupy>=12.0.0 +- cupy>=13.6.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==25.8.*,>=0.0.0a0 +- dask-cuda==25.10.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 - flatbuffers==24.3.25 - fsspec>=0.6.0 -- gcc_linux-aarch64=13.* +- gcc_linux-aarch64=14.* - hypothesis>=6.131.7 - identify>=2.5.20 - include-what-you-use==0.24.0 - ipython - jupyter_client - libcurand-dev -- libkvikio==25.8.*,>=0.0.0a0 -- libnvcomp-dev==4.2.0.11 +- libkvikio==25.10.*,>=0.0.0a0 +- libnvcomp-dev==5.0.0.6 +- libnvjitlink-dev - librdkafka>=2.8.0,<2.9.0a0 -- librmm==25.8.*,>=0.0.0a0 +- librmm==25.10.*,>=0.0.0a0 - make - mmh3 - moto>=4.0.8 @@ -55,38 +55,37 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.14.0,<0.15.0a0 -- numba>=0.59.1,<0.62.0a0 +- numba-cuda>=0.19.1,<0.20.0a0 +- numba>=0.60.0,<0.62.0a0 - numpy>=1.23,<3.0a0 - numpydoc -- nvidia-ml-py +- nvidia-ml-py>=12 - nvtx>=0.2.1 - openpyxl - packaging - pandas - pandas>=2.0,<2.4.0dev0 - pandoc -- polars>=1.28,<1.32 +- polars>=1.28,<1.33 - pre-commit -- pyarrow>=14.0.0,<20.0.0a0 +- pyarrow>=15.0.0 - pydata-sphinx-theme>=0.15.4 -- pynvjitlink>=0.0.0a0 -- pynvml>=12.0.0,<13.0.0a0 - pytest - pytest-benchmark - pytest-cases>=3.8.2 - pytest-cov -- pytest-rerunfailures +- pytest-httpserver +- pytest-rerunfailures!=16.0.0 - pytest-xdist - python-confluent-kafka>=2.8.0,<2.9.0a0 - python-xxhash - python>=3.10,<3.14 - pytorch>=2.4.0 -- rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.8.*,>=0.0.0a0 +- rapids-build-backend>=0.4.0,<0.5.0.dev0 +- rapids-dask-dependency==25.10.*,>=0.0.0a0 - rapids-logger==0.1.*,>=0.0.0a0 - rich -- rmm==25.8.*,>=0.0.0a0 +- rmm==25.10.*,>=0.0.0a0 - 
s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 06a0e885510..40ed1cbbc6b 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -12,7 +12,6 @@ dependencies: - c-compiler - cachetools - certifi -- clang-tools=20.1.4 - clang-tools==20.1.4 - clang==20.1.4 - cmake>=3.30.4 @@ -21,19 +20,19 @@ dependencies: - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev -- cuda-python>=12.6.2,<13.0a0 +- cuda-python>=12.9.2,<13.0a0 - cuda-sanitizer-api - cuda-version=12.9 -- cupy>=12.0.0 +- cupy>=13.6.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==25.8.*,>=0.0.0a0 +- dask-cuda==25.10.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 - flatbuffers==24.3.25 - fsspec>=0.6.0 -- gcc_linux-64=13.* +- gcc_linux-64=14.* - hypothesis>=6.131.7 - identify>=2.5.20 - include-what-you-use==0.24.0 @@ -41,10 +40,11 @@ dependencies: - jupyter_client - libcufile-dev - libcurand-dev -- libkvikio==25.8.*,>=0.0.0a0 -- libnvcomp-dev==4.2.0.11 +- libkvikio==25.10.*,>=0.0.0a0 +- libnvcomp-dev==5.0.0.6 +- libnvjitlink-dev - librdkafka>=2.8.0,<2.9.0a0 -- librmm==25.8.*,>=0.0.0a0 +- librmm==25.10.*,>=0.0.0a0 - make - mmh3 - moto>=4.0.8 @@ -56,38 +56,37 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.14.0,<0.15.0a0 -- numba>=0.59.1,<0.62.0a0 +- numba-cuda>=0.19.1,<0.20.0a0 +- numba>=0.60.0,<0.62.0a0 - numpy>=1.23,<3.0a0 - numpydoc -- nvidia-ml-py +- nvidia-ml-py>=12 - nvtx>=0.2.1 - openpyxl - packaging - pandas - pandas>=2.0,<2.4.0dev0 - pandoc -- polars>=1.28,<1.32 +- polars>=1.28,<1.33 - pre-commit -- pyarrow>=14.0.0,<20.0.0a0 +- pyarrow>=15.0.0 - pydata-sphinx-theme>=0.15.4 -- pynvjitlink>=0.0.0a0 -- pynvml>=12.0.0,<13.0.0a0 - pytest - pytest-benchmark - pytest-cases>=3.8.2 - pytest-cov -- pytest-rerunfailures +- pytest-httpserver +- pytest-rerunfailures!=16.0.0 - pytest-xdist - python-confluent-kafka>=2.8.0,<2.9.0a0 - python-xxhash - python>=3.10,<3.14 - pytorch>=2.4.0 -- rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.8.*,>=0.0.0a0 +- rapids-build-backend>=0.4.0,<0.5.0.dev0 +- rapids-dask-dependency==25.10.*,>=0.0.0a0 - rapids-logger==0.1.*,>=0.0.0a0 - rich -- rmm==25.8.*,>=0.0.0a0 +- rmm==25.10.*,>=0.0.0a0 - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy diff --git a/conda/environments/all_cuda-130_arch-aarch64.yaml b/conda/environments/all_cuda-130_arch-aarch64.yaml new file mode 100644 index 00000000000..e58e93e1aa6 --- /dev/null +++ b/conda/environments/all_cuda-130_arch-aarch64.yaml @@ -0,0 +1,102 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- conda-forge +dependencies: +- aiobotocore>=2.2.0 +- boto3>=1.21.21 +- botocore>=1.24.21 +- breathe>=4.35.0 +- c-compiler +- cachetools +- certifi +- clang-tools==20.1.4 +- clang==20.1.4 +- cmake>=3.30.4 +- cramjam +- cuda-cudart-dev +- cuda-nvcc +- cuda-nvrtc-dev +- cuda-nvtx-dev +- cuda-python>=13.0.1,<14.0a0 +- cuda-sanitizer-api +- cuda-version=13.0 +- cupy>=13.6.0 +- cxx-compiler +- cython>=3.0.3 +- dask-cuda==25.10.*,>=0.0.0a0 +- dlpack>=0.8,<1.0 +- doxygen=1.9.1 +- fastavro>=0.22.9 +- flatbuffers==24.3.25 +- fsspec>=0.6.0 +- gcc_linux-aarch64=14.* +- hypothesis>=6.131.7 +- identify>=2.5.20 +- include-what-you-use==0.24.0 +- ipython +- jupyter_client +- libcurand-dev +- libkvikio==25.10.*,>=0.0.0a0 +- libnvcomp-dev==5.0.0.6 +- libnvjitlink-dev +- librdkafka>=2.8.0,<2.9.0a0 +- librmm==25.10.*,>=0.0.0a0 +- make +- mmh3 +- moto>=4.0.8 +- msgpack-python +- myst-nb +- nanoarrow +- nbconvert +- nbformat +- nbsphinx +- ninja +- notebook +- numba-cuda>=0.19.1,<0.20.0a0 +- numba>=0.60.0,<0.62.0a0 +- numpy>=1.23,<3.0a0 +- numpydoc +- nvidia-ml-py>=12 +- nvtx>=0.2.1 +- openpyxl +- packaging +- pandas +- pandas>=2.0,<2.4.0dev0 +- pandoc +- polars>=1.28,<1.33 +- pre-commit +- pyarrow>=15.0.0 +- pydata-sphinx-theme>=0.15.4 +- pytest +- pytest-benchmark +- pytest-cases>=3.8.2 +- pytest-cov +- pytest-httpserver +- pytest-rerunfailures!=16.0.0 +- pytest-xdist +- python-confluent-kafka>=2.8.0,<2.9.0a0 +- python-xxhash +- python>=3.10,<3.14 +- rapids-build-backend>=0.4.0,<0.5.0.dev0 +- rapids-dask-dependency==25.10.*,>=0.0.0a0 +- rapids-logger==0.1.*,>=0.0.0a0 +- rich +- rmm==25.10.*,>=0.0.0a0 +- s3fs>=2022.3.0 +- scikit-build-core>=0.10.0 +- scipy +- sphinx-autobuild +- sphinx-copybutton +- sphinx-markdown-tables +- sphinx-remove-toctrees +- sphinx>=8.1.0 +- sphinxcontrib-websupport +- streamz +- sysroot_linux-aarch64==2.28 +- typing_extensions>=4.0.0 +- zlib>=1.2.13 +- zstandard +name: all_cuda-130_arch-aarch64 diff --git a/conda/environments/all_cuda-130_arch-x86_64.yaml b/conda/environments/all_cuda-130_arch-x86_64.yaml new file mode 100644 index 00000000000..0d4dc2b2c2d --- /dev/null +++ b/conda/environments/all_cuda-130_arch-x86_64.yaml @@ -0,0 +1,103 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- conda-forge +dependencies: +- aiobotocore>=2.2.0 +- boto3>=1.21.21 +- botocore>=1.24.21 +- breathe>=4.35.0 +- c-compiler +- cachetools +- certifi +- clang-tools==20.1.4 +- clang==20.1.4 +- cmake>=3.30.4 +- cramjam +- cuda-cudart-dev +- cuda-nvcc +- cuda-nvrtc-dev +- cuda-nvtx-dev +- cuda-python>=13.0.1,<14.0a0 +- cuda-sanitizer-api +- cuda-version=13.0 +- cupy>=13.6.0 +- cxx-compiler +- cython>=3.0.3 +- dask-cuda==25.10.*,>=0.0.0a0 +- dlpack>=0.8,<1.0 +- doxygen=1.9.1 +- fastavro>=0.22.9 +- flatbuffers==24.3.25 +- fsspec>=0.6.0 +- gcc_linux-64=14.* +- hypothesis>=6.131.7 +- identify>=2.5.20 +- include-what-you-use==0.24.0 +- ipython +- jupyter_client +- libcufile-dev +- libcurand-dev +- libkvikio==25.10.*,>=0.0.0a0 +- libnvcomp-dev==5.0.0.6 +- libnvjitlink-dev +- librdkafka>=2.8.0,<2.9.0a0 +- librmm==25.10.*,>=0.0.0a0 +- make +- mmh3 +- moto>=4.0.8 +- msgpack-python +- myst-nb +- nanoarrow +- nbconvert +- nbformat +- nbsphinx +- ninja +- notebook +- numba-cuda>=0.19.1,<0.20.0a0 +- numba>=0.60.0,<0.62.0a0 +- numpy>=1.23,<3.0a0 +- numpydoc +- nvidia-ml-py>=12 +- nvtx>=0.2.1 +- openpyxl +- packaging +- pandas +- pandas>=2.0,<2.4.0dev0 +- pandoc +- polars>=1.28,<1.33 +- pre-commit +- pyarrow>=15.0.0 +- pydata-sphinx-theme>=0.15.4 +- pytest +- pytest-benchmark +- pytest-cases>=3.8.2 +- pytest-cov +- pytest-httpserver +- pytest-rerunfailures!=16.0.0 +- pytest-xdist +- python-confluent-kafka>=2.8.0,<2.9.0a0 +- python-xxhash +- python>=3.10,<3.14 +- rapids-build-backend>=0.4.0,<0.5.0.dev0 +- rapids-dask-dependency==25.10.*,>=0.0.0a0 +- rapids-logger==0.1.*,>=0.0.0a0 +- rich +- rmm==25.10.*,>=0.0.0a0 +- s3fs>=2022.3.0 +- scikit-build-core>=0.10.0 +- scipy +- sphinx-autobuild +- sphinx-copybutton +- sphinx-markdown-tables +- sphinx-remove-toctrees +- sphinx>=8.1.0 +- sphinxcontrib-websupport +- streamz +- sysroot_linux-64==2.28 +- typing_extensions>=4.0.0 +- zlib>=1.2.13 +- zstandard +name: all_cuda-130_arch-x86_64 diff --git a/conda/recipes/cudf-polars/recipe.yaml b/conda/recipes/cudf-polars/recipe.yaml index 46ab07ab81c..b5fd97b718b 100644 --- a/conda/recipes/cudf-polars/recipe.yaml +++ b/conda/recipes/cudf-polars/recipe.yaml @@ -43,14 +43,15 @@ requirements: host: - python =${{ py_version }} - pip - - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - rapids-build-backend >=0.4.0,<0.5.0.dev0 - setuptools - cuda-version =${{ cuda_version }} run: - - nvidia-ml-py + # 'nvidia-ml-py' provides the 'pynvml' module + - nvidia-ml-py>=12 - python - pylibcudf =${{ version }} - - polars >=1.28,<1.32 + - polars >=1.28,<1.33 - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - if: python == "3.10" then: typing_extensions diff --git a/conda/recipes/cudf/conda_build_config.yaml b/conda/recipes/cudf/conda_build_config.yaml index c35ea54a784..5c68a5fefcb 100644 --- a/conda/recipes/cudf/conda_build_config.yaml +++ b/conda/recipes/cudf/conda_build_config.yaml @@ -1,8 +1,8 @@ c_compiler_version: - - 13 + - 14 cxx_compiler_version: - - 13 + - 14 c_stdlib: - sysroot diff --git a/conda/recipes/cudf/recipe.yaml b/conda/recipes/cudf/recipe.yaml index 98e82c95bce..98013e84597 100644 --- a/conda/recipes/cudf/recipe.yaml +++ b/conda/recipes/cudf/recipe.yaml @@ -52,10 +52,10 @@ requirements: - python =${{ py_version }} - pip - cython >=3.0.3 - - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - rapids-build-backend >=0.4.0,<0.5.0.dev0 - scikit-build-core >=0.10.0 - dlpack >=0.8,<1.0 - - numba-cuda >=0.14.0,<0.15.0a0 + - numba-cuda >=0.19.1,<0.20.0a0 - libcudf =${{ 
version }} - pylibcudf =${{ version }} - rmm =${{ minor_version }} @@ -69,23 +69,19 @@ requirements: - python - typing_extensions >=4.0.0 - pandas >=2.0,<2.4.0dev0 - - cupy >=12.0.0 - - numba-cuda >=0.14.0,<0.15.0a0 - - numba >=0.59.1,<0.62.0a0 + - cupy >=13.6.0 + - numba-cuda >=0.19.1,<0.20.0a0 + - numba >=0.60.0,<0.62.0a0 - numpy >=1.23,<3.0a0 - - pyarrow>=14.0.0,<20.0.0a0 + - pyarrow>=15.0.0,<22.0.0a0 - libcudf =${{ version }} - pylibcudf =${{ version }} - ${{ pin_compatible("rmm", upper_bound="x.x") }} - fsspec >=0.6.0 - cuda-cudart - # Needed by Numba for CUDA support - - cuda-nvcc-impl - # TODO: Add nvjitlink here - # xref: https://github.com/rapidsai/cudf/issues/12822 - - cuda-nvrtc - - cuda-python >=12.6.2,<13.0a0 - - pynvjitlink + - if: cuda_major == "12" + then: cuda-python >=12.9.2,<13.0a0 + else: cuda-python >=13.0.1,<14.0a0 - if: linux and x86_64 then: - libcufile diff --git a/conda/recipes/cudf_kafka/conda_build_config.yaml b/conda/recipes/cudf_kafka/conda_build_config.yaml index c35ea54a784..5c68a5fefcb 100644 --- a/conda/recipes/cudf_kafka/conda_build_config.yaml +++ b/conda/recipes/cudf_kafka/conda_build_config.yaml @@ -1,8 +1,8 @@ c_compiler_version: - - 13 + - 14 cxx_compiler_version: - - 13 + - 14 c_stdlib: - sysroot diff --git a/conda/recipes/cudf_kafka/recipe.yaml b/conda/recipes/cudf_kafka/recipe.yaml index 0d2c8cc39cc..100dc270915 100644 --- a/conda/recipes/cudf_kafka/recipe.yaml +++ b/conda/recipes/cudf_kafka/recipe.yaml @@ -55,7 +55,7 @@ requirements: - cuda-version =${{ cuda_version }} - pylibcudf =${{ version }} - libcudf_kafka =${{ version }} - - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - rapids-build-backend >=0.4.0,<0.5.0.dev0 - scikit-build-core >=0.10.0 - cuda-cudart-dev run: diff --git a/conda/recipes/custreamz/recipe.yaml b/conda/recipes/custreamz/recipe.yaml index 36535c4f472..4e8644b046e 100644 --- a/conda/recipes/custreamz/recipe.yaml +++ b/conda/recipes/custreamz/recipe.yaml @@ -28,7 +28,7 @@ requirements: host: - python =${{ py_version }} - pip - - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - rapids-build-backend >=0.4.0,<0.5.0.dev0 - setuptools - python-confluent-kafka >=2.8.0,<2.9.0a0 - cudf_kafka =${{ version }} diff --git a/conda/recipes/dask-cudf/recipe.yaml b/conda/recipes/dask-cudf/recipe.yaml index eaa05196c9d..e5cf5e2db9a 100644 --- a/conda/recipes/dask-cudf/recipe.yaml +++ b/conda/recipes/dask-cudf/recipe.yaml @@ -28,13 +28,14 @@ requirements: host: - python =${{ py_version }} - pip - - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - rapids-build-backend >=0.4.0,<0.5.0.dev0 - setuptools - cuda-version =${{ cuda_version }} run: - python - cudf =${{ version }} - - pynvml >=12.0.0,<13.0.0a0 + # 'nvidia-ml-py' provides the 'pynvml' module + - nvidia-ml-py>=12 - rapids-dask-dependency =${{ minor_version }} - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 5fc7c9eae1b..7acdfdd9698 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -1,8 +1,8 @@ c_compiler_version: - - 13 + - 14 cxx_compiler_version: - - 13 + - 14 cuda_compiler: - cuda-nvcc @@ -26,7 +26,7 @@ flatbuffers_version: - "=24.3.25" nvcomp_version: - - "=4.2.0.11" + - "=5.0.0.6" zlib_version: - ">=1.2.13" diff --git a/conda/recipes/libcudf/recipe.yaml b/conda/recipes/libcudf/recipe.yaml index 814a304ebef..3e97dc84a7d 100644 --- a/conda/recipes/libcudf/recipe.yaml +++ 
b/conda/recipes/libcudf/recipe.yaml @@ -68,6 +68,7 @@ cache: - cuda-nvrtc-dev - cuda-nvtx-dev - libcurand-dev + - libnvjitlink-dev - if: linux and x86_64 then: - libcufile-dev diff --git a/conda/recipes/pylibcudf/conda_build_config.yaml b/conda/recipes/pylibcudf/conda_build_config.yaml index c35ea54a784..5c68a5fefcb 100644 --- a/conda/recipes/pylibcudf/conda_build_config.yaml +++ b/conda/recipes/pylibcudf/conda_build_config.yaml @@ -1,8 +1,8 @@ c_compiler_version: - - 13 + - 14 cxx_compiler_version: - - 13 + - 14 c_stdlib: - sysroot diff --git a/conda/recipes/pylibcudf/recipe.yaml b/conda/recipes/pylibcudf/recipe.yaml index e5fec6983c4..bfaec91b72c 100644 --- a/conda/recipes/pylibcudf/recipe.yaml +++ b/conda/recipes/pylibcudf/recipe.yaml @@ -52,7 +52,7 @@ requirements: - python =${{ py_version }} - pip - cython >=3.0.3 - - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - rapids-build-backend >=0.4.0,<0.5.0.dev0 - scikit-build-core >=0.10.0 - dlpack >=0.8,<1.0 - libcudf =${{ version }} @@ -67,14 +67,17 @@ requirements: - python - typing_extensions >=4.0.0 - pandas >=2.0,<2.4.0dev0 - - numpy >=1.23,<3.0a0 - - pyarrow>=14.0.0,<20.0.0a0 - libcudf =${{ version }} - ${{ pin_compatible("rmm", upper_bound="x.x") }} - fsspec >=0.6.0 - - cuda-python >=12.6.2,<13.0a0 + - if: cuda_major == "12" + then: cuda-python >=12.9.2,<13.0a0 + else: cuda-python >=13.0.1,<14.0a0 - nvtx >=0.2.1 - packaging + run_constraints: + - numpy >=1.23,<3.0a0 + - pyarrow>=15.0.0,<22.0.0a0 ignore_run_exports: from_package: - cuda-cudart-dev diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 28dbc2c4d78..12b08b136c6 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -53,7 +53,6 @@ option(BUILD_SHARED_LIBS "Build cuDF shared libraries" ON) option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON) option(CUDF_BUILD_TESTUTIL "Whether to build the test utilities contained in libcudf" ON) mark_as_advanced(CUDF_BUILD_TESTUTIL) -option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON) option(CUDF_EXPORT_NVCOMP "Export NVCOMP as a dependency" ON) option(CUDF_LARGE_STRINGS_DISABLED "Build with large string support disabled" OFF) mark_as_advanced(CUDF_LARGE_STRINGS_DISABLED) @@ -133,6 +132,11 @@ set(CUDF_CUDA_FLAGS "") set(CUDF_CXX_DEFINITIONS "") set(CUDF_CUDA_DEFINITIONS "") +# For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the +# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with +# gcc>=14. We can remove this once we upgrade to a newer sccache version. 
+set(CMAKE_CXX_SCAN_FOR_MODULES OFF) + # Set logging level set(LIBCUDF_LOGGING_LEVEL "INFO" @@ -288,6 +292,9 @@ include(cmake/thirdparty/get_cccl.cmake) # find rmm include(cmake/thirdparty/get_rmm.cmake) +# find croaring +include(cmake/thirdparty/get_croaring.cmake) + # find flatbuffers include(cmake/thirdparty/get_flatbuffers.cmake) @@ -419,6 +426,7 @@ add_library( src/filling/fill.cu src/filling/repeat.cu src/filling/sequence.cu + src/groupby/common/m2_var_std.cu src/groupby/groupby.cu src/groupby/hash/compute_aggregations.cu src/groupby/hash/compute_aggregations_null.cu @@ -531,6 +539,7 @@ add_library( src/io/parquet/compact_protocol_writer.cpp src/io/parquet/decode_preprocess.cu src/io/parquet/experimental/dictionary_page_filter.cu + src/io/parquet/experimental/deletion_vectors.cu src/io/parquet/experimental/hybrid_scan.cpp src/io/parquet/experimental/hybrid_scan_chunking.cu src/io/parquet/experimental/hybrid_scan_helpers.cpp @@ -573,11 +582,14 @@ add_library( src/io/utilities/type_inference.cu src/io/utilities/trie.cu src/jit/cache.cpp + src/jit/helpers.cpp src/jit/parser.cpp + src/jit/row_ir.cpp src/jit/util.cpp src/join/conditional_join.cu src/join/cross_join.cu src/join/distinct_hash_join.cu + src/join/filtered_join.cu src/join/hash_join.cu src/join/join.cu src/join/join_utils.cu @@ -625,14 +637,16 @@ add_library( src/reductions/any.cu src/reductions/bitwise.cu src/reductions/collect_ops.cu + src/reductions/count.cpp src/reductions/histogram.cu src/reductions/max.cu src/reductions/mean.cu src/reductions/min.cu src/reductions/minmax.cu src/reductions/nth_element.cu - src/reductions/nunique.cu + src/reductions/nunique.cpp src/reductions/product.cu + src/reductions/quantile.cu src/reductions/reductions.cpp src/reductions/scan/rank_scan.cu src/reductions/scan/ewm.cu @@ -656,6 +670,7 @@ add_library( src/reductions/std.cu src/reductions/sum.cu src/reductions/sum_of_squares.cu + src/reductions/sum_with_overflow.cu src/reductions/var.cu src/replace/clamp.cu src/replace/nans.cu @@ -675,16 +690,22 @@ add_library( src/rolling/range_window_bounds.cpp src/rolling/rolling.cpp src/round/round.cu + src/row_operator/primitive_row_operators.cu + src/row_operator/row_operators.cu src/runtime/context.cpp src/scalar/scalar.cpp src/scalar/scalar_factories.cpp src/search/contains_column.cu src/search/contains_scalar.cu src/search/contains_table.cu + src/search/contains_table_impl.cu + src/search/contains_table_impl_nested.cu + src/search/contains_table_impl_primitive.cu src/search/search_ordered.cu src/sort/is_sorted.cu src/sort/rank.cu src/sort/segmented_sort.cu + src/sort/segmented_top_k.cu src/sort/sort.cu src/sort/sort_column.cu src/sort/sort_radix.cu @@ -723,6 +744,7 @@ add_library( src/strings/convert/convert_ipv4.cu src/strings/convert/convert_urls.cu src/strings/convert/convert_lists.cu + src/strings/convert/int_cast.cu src/strings/copying/concatenate.cu src/strings/copying/copying.cu src/strings/copying/copy_range.cu @@ -790,10 +812,6 @@ add_library( src/text/stemmer.cu src/text/bpe/byte_pair_encoding.cu src/text/bpe/load_merge_pairs.cu - src/text/subword/data_normalizer.cu - src/text/subword/load_hash_file.cu - src/text/subword/subword_tokenize.cu - src/text/subword/wordpiece_tokenizer.cu src/text/tokenize.cu src/text/vocabulary_tokenize.cu src/text/wordpiece_tokenize.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 12ebdf8ef2b..0f6dde253ac 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -151,7 +151,7 @@ ConfigureNVBench( # 
################################################################################################## # * contiguous_split benchmark ------------------------------------------------------------------- -ConfigureBench(CONTIGUOUS_SPLIT_NVBENCH contiguous_split/contiguous_split.cpp) +ConfigureNVBench(CONTIGUOUS_SPLIT_NVBENCH contiguous_split/contiguous_split.cpp) # ################################################################################################## # * lists scatter benchmark ----------------------------------------------------------------------- @@ -173,6 +173,10 @@ ConfigureNVBench(NDSH_Q06_NVBENCH ndsh/q06.cpp ndsh/utilities.cpp) ConfigureNVBench(NDSH_Q09_NVBENCH ndsh/q09.cpp ndsh/utilities.cpp) ConfigureNVBench(NDSH_Q10_NVBENCH ndsh/q10.cpp ndsh/utilities.cpp) +# ################################################################################################## +# * filter benchmark ------------------------------------------------------------------- +ConfigureNVBench(FILTER_NVBENCH filter/minmax_filter.cpp) + # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- ConfigureNVBench( @@ -205,8 +209,17 @@ ConfigureNVBench(SEARCH_NVBENCH search/contains_scalar.cpp search/contains_table # ################################################################################################## # * sort benchmark -------------------------------------------------------------------------------- ConfigureNVBench( - SORT_NVBENCH sort/rank.cpp sort/rank_lists.cpp sort/rank_structs.cpp sort/segmented_sort.cpp - sort/sort.cpp sort/sort_lists.cpp sort/sort_strings.cpp sort/sort_structs.cpp sort/top_k.cpp + SORT_NVBENCH + sort/rank.cpp + sort/rank_lists.cpp + sort/rank_structs.cpp + sort/segmented_top_k.cpp + sort/segmented_sort.cpp + sort/sort.cpp + sort/sort_lists.cpp + sort/sort_strings.cpp + sort/sort_structs.cpp + sort/top_k.cpp ) # ################################################################################################## @@ -217,7 +230,7 @@ ConfigureNVBench(STRUCT_CREATION_NVBENCH structs/create_structs.cpp) # ################################################################################################## # * quantiles benchmark # -------------------------------------------------------------------------------- -ConfigureBench(QUANTILES_BENCH quantiles/quantiles.cpp) +ConfigureNVBench(QUANTILES_NVBENCH quantiles/quantiles.cpp) # ################################################################################################## # * tdigest benchmark @@ -250,23 +263,26 @@ ConfigureNVBench(REPLACE_NVBENCH replace/nulls.cpp) # ################################################################################################## # * filling benchmark ----------------------------------------------------------------------------- -ConfigureBench(FILL_BENCH filling/repeat.cpp) +ConfigureNVBench(FILL_NVBENCH filling/repeat.cpp) # ################################################################################################## # * groupby benchmark ----------------------------------------------------------------------------- -ConfigureBench( - GROUPBY_BENCH groupby/group_sum.cpp groupby/group_nth.cpp groupby/group_shift.cpp - groupby/group_struct_values.cpp groupby/group_no_requests.cpp groupby/group_scan.cpp -) - ConfigureNVBench( GROUPBY_NVBENCH + groupby/group_complex_keys.cpp groupby/group_histogram.cpp + groupby/group_m2_var_std.cpp groupby/group_max.cpp 
groupby/group_max_multithreaded.cpp + groupby/group_no_requests.cpp + groupby/group_nth.cpp groupby/group_nunique.cpp groupby/group_rank.cpp + groupby/group_scan.cpp + groupby/group_shift.cpp groupby/group_struct_keys.cpp + groupby/group_struct_values.cpp + groupby/group_sum.cpp ) # ################################################################################################## @@ -313,6 +329,19 @@ ConfigureNVBench( PARQUET_EXPERIMENTAL_READER_NVBENCH io/parquet/experimental/parquet_dictionary_page_filter.cpp ) +# ################################################################################################## +# * parquet deletion vector benchmark +# ---------------------------------------------------------------------- +ConfigureNVBench( + PARQUET_DELETION_VECTORS_NVBENCH io/parquet/experimental/parquet_deletion_vectors.cpp +) +target_compile_definitions( + PARQUET_DELETION_VECTORS_NVBENCH + PRIVATE DISABLENEON=1 ROARING_DISABLE_X64=1 ROARING_DISABLE_AVX=1 + CROARING_COMPILER_SUPPORTS_AVX512=0 +) +target_link_libraries(PARQUET_DELETION_VECTORS_NVBENCH PRIVATE roaring) + # ################################################################################################## # * parquet multithread reader benchmark # ---------------------------------------------------------------------- @@ -352,7 +381,9 @@ ConfigureNVBench( # ################################################################################################## # * transform benchmark # --------------------------------------------------------------------------------- -ConfigureNVBench(TRANSFORM_NVBENCH transform/polynomials.cpp transform/transform.cpp) +ConfigureNVBench( + TRANSFORM_NVBENCH transform/encode.cpp transform/polynomials.cpp transform/transform.cpp +) # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- @@ -409,6 +440,11 @@ ConfigureNVBench( string/url_decode.cu ) +# ################################################################################################## +# * strings experimental benchmark --------------------------------------------------- +ConfigureNVBench(STRINGS_EXPERIMENTAL_NVBENCH string/experimental/stringview_compare.cu) +target_link_libraries(STRINGS_EXPERIMENTAL_NVBENCH PRIVATE nanoarrow) + # ################################################################################################## # * json benchmark ------------------------------------------------------------------- ConfigureNVBench(JSON_NVBENCH json/json.cu) diff --git a/cpp/benchmarks/ast/polynomials.cpp b/cpp/benchmarks/ast/polynomials.cpp index 1897752fa98..11e496e65e5 100644 --- a/cpp/benchmarks/ast/polynomials.cpp +++ b/cpp/benchmarks/ast/polynomials.cpp @@ -30,12 +30,28 @@ #include +namespace { + +enum class engine_type : uint8_t { AST = 0, JIT = 1 }; + +engine_type engine_from_string(std::string_view str) +{ + if (str == "ast") { + return engine_type::AST; + } else if (str == "jit") { + return engine_type::JIT; + } else { + CUDF_FAIL("unrecognized engine enum: " + std::string(str)); + } +} + template -static void BM_ast_polynomials(nvbench::state& state) +void BM_ast_polynomials(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); auto const order = static_cast(state.get_int64("order")); auto const null_probability = state.get_float64("null_probability"); + auto const engine = engine_from_string(state.get_string("engine")); CUDF_EXPECTS(order > 0, "Polynomial 
order must be greater than 0"); @@ -80,7 +96,18 @@ static void BM_ast_polynomials(nvbench::state& state) state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { cudf::scoped_range range{"benchmark_iteration"}; - cudf::compute_column(*table, tree.back(), launch.get_stream().get_stream()); + + switch (engine) { + case engine_type::AST: { + cudf::compute_column(*table, tree.back(), launch.get_stream().get_stream()); + break; + } + case engine_type::JIT: { + cudf::compute_column_jit(*table, tree.back(), launch.get_stream().get_stream()); + break; + } + default: CUDF_FAIL("Invalid engine type"); + } }); } @@ -90,7 +117,10 @@ static void BM_ast_polynomials(nvbench::state& state) .set_name(#name) \ .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}) \ .add_int64_axis("order", {1, 2, 4, 8, 16, 32}) \ - .add_float64_axis("null_probability", {0.01}) + .add_float64_axis("null_probability", {0.01}) \ + .add_string_axis("engine", {"ast", "jit"}) + +} // namespace AST_POLYNOMIAL_BENCHMARK_DEFINE(ast_polynomials_float32, float); diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp index 06ec7543213..973985149f7 100644 --- a/cpp/benchmarks/ast/transform.cpp +++ b/cpp/benchmarks/ast/transform.cpp @@ -49,11 +49,25 @@ enum class TreeType { // child column reference }; +enum class engine_type : uint8_t { AST = 0, JIT = 1 }; + +static engine_type engine_from_string(std::string_view str) +{ + if (str == "ast") { + return engine_type::AST; + } else if (str == "jit") { + return engine_type::JIT; + } else { + CUDF_FAIL("unrecognized engine enum: " + std::string(str)); + } +} + template static void BM_ast_transform(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); auto const tree_levels = static_cast(state.get_int64("tree_levels")); + auto const engine = engine_from_string(state.get_string("engine")); // Create table data auto const num_columns = reuse_columns ? 
1 : tree_levels + 1; @@ -94,8 +108,19 @@ static void BM_ast_transform(nvbench::state& state) state.add_global_memory_reads(static_cast(num_rows) * (tree_levels + 1)); state.add_global_memory_writes(num_rows); - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch&) { cudf::compute_column(table, root_expression); }); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { + switch (engine) { + case engine_type::AST: { + cudf::compute_column(table, root_expression); + break; + } + case engine_type::JIT: { + cudf::compute_column_jit(table, root_expression); + break; + } + default: CUDF_FAIL("Invalid engine type"); + } + }); } template @@ -105,6 +130,7 @@ static void BM_string_compare_ast_transform(nvbench::state& state) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const tree_levels = static_cast(state.get_int64("tree_levels")); auto const hit_rate = static_cast(state.get_int64("hit_rate")); + auto const engine = engine_from_string(state.get_string("engine")); CUDF_EXPECTS(tree_levels > 0, "benchmarks require 1 or more comparisons"); @@ -155,8 +181,19 @@ static void BM_string_compare_ast_transform(nvbench::state& state) auto const& expression = tree.back(); - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch&) { cudf::compute_column(table, expression); }); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { + switch (engine) { + case engine_type::AST: { + cudf::compute_column(table, expression); + break; + } + case engine_type::JIT: { + cudf::compute_column_jit(table, expression); + break; + } + default: CUDF_FAIL("Invalid engine type"); + } + }); } #define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \ @@ -167,7 +204,8 @@ static void BM_string_compare_ast_transform(nvbench::state& state) NVBENCH_BENCH(name) \ .set_name(#name) \ .add_int64_axis("tree_levels", {1, 5, 10}) \ - .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}) + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}) \ + .add_string_axis("engine", {"ast", "jit"}) AST_TRANSFORM_BENCHMARK_DEFINE( ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false); @@ -193,7 +231,8 @@ AST_TRANSFORM_BENCHMARK_DEFINE( .add_int64_axis("string_width", {32, 64, 128, 256}) \ .add_int64_axis("num_rows", {32768, 262144, 2097152}) \ .add_int64_axis("tree_levels", {1, 2, 3, 4}) \ - .add_int64_axis("hit_rate", {50, 100}) + .add_int64_axis("hit_rate", {50, 100}) \ + .add_string_axis("engine", {"ast", "jit"}) AST_STRING_COMPARE_TRANSFORM_BENCHMARK_DEFINE(ast_string_equal_logical_and, cudf::ast::ast_operator::EQUAL, diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index 55c5133fa03..03fe04fe5d6 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -549,7 +549,7 @@ struct create_rand_col_fn { thrust::minstd_rand& engine, cudf::size_type num_rows) { - if (profile.get_cardinality() == 0 || profile.get_cardinality() >= num_rows) { + if (profile.get_cardinality() >= num_rows) { return create_distinct_rows_column(profile, engine, num_rows); } return create_random_column(profile, engine, num_rows); @@ -640,14 +640,7 @@ std::unique_ptr create_distinct_rows_column(data_profile const& pr auto valid_dist = random_value_fn( distribution_params{1. 
- profile.get_null_probability().value_or(0)}); - cudf::data_type const dtype = [&]() { - if constexpr (cudf::is_fixed_point()) - return cudf::data_type{cudf::type_to_id(), 0}; - else - return cudf::data_type{cudf::type_to_id()}; - }(); - - auto init = cudf::make_default_constructed_scalar(dtype); + auto init = cudf::make_fixed_width_scalar(T{}); auto col = cudf::sequence(num_rows, *init); rmm::device_uvector null_mask(0, cudf::get_default_stream()); @@ -698,9 +691,8 @@ template <> std::unique_ptr create_distinct_rows_column( data_profile const& profile, thrust::minstd_rand& engine, cudf::size_type num_rows) { - auto col = create_random_column(profile, engine, num_rows); - auto int_col = cudf::sequence( - num_rows, *cudf::make_default_constructed_scalar(cudf::data_type{cudf::type_id::INT32})); + auto col = create_random_column(profile, engine, num_rows); + auto int_col = cudf::sequence(num_rows, *cudf::make_fixed_width_scalar(0)); auto int2strcol = cudf::strings::from_integers(int_col->view()); auto concat_col = cudf::strings::concatenate(cudf::table_view({col->view(), int2strcol->view()})); return std::move(cudf::sample(cudf::table_view({concat_col->view()}), num_rows)->release()[0]); @@ -793,8 +785,7 @@ std::unique_ptr create_distinct_rows_column( auto const dist_params = profile.get_distribution_params(); auto col = create_random_column(profile, engine, num_rows); std::vector> children; - children.push_back(cudf::sequence( - num_rows, *cudf::make_default_constructed_scalar(cudf::data_type{cudf::type_id::INT32}))); + children.push_back(cudf::sequence(num_rows, *cudf::make_fixed_width_scalar(0))); for (int lvl = dist_params.max_depth; lvl > 1; --lvl) { std::vector> parents; parents.push_back( @@ -891,12 +882,11 @@ std::unique_ptr create_distinct_rows_column( { auto const dist_params = profile.get_distribution_params(); auto col = create_random_column(profile, engine, num_rows); - auto child_column = cudf::sequence( - num_rows, *cudf::make_default_constructed_scalar(cudf::data_type{cudf::type_id::INT32})); + auto zero = cudf::make_fixed_width_scalar(0); + auto child_column = cudf::sequence(num_rows, *zero); for (int lvl = dist_params.max_depth; lvl > 0; --lvl) { - auto offsets_column = cudf::sequence( - num_rows + 1, *cudf::make_default_constructed_scalar(cudf::data_type{cudf::type_id::INT32})); - auto list_column = cudf::make_lists_column( + auto offsets_column = cudf::sequence(num_rows + 1, *zero); + auto list_column = cudf::make_lists_column( num_rows, std::move(offsets_column), std::move(child_column), 0, rmm::device_buffer{}); std::swap(child_column, list_column); } @@ -1021,7 +1011,8 @@ std::unique_ptr create_sequence_table(std::vector co auto columns = std::vector>(dtype_ids.size()); std::transform(dtype_ids.begin(), dtype_ids.end(), columns.begin(), [&](auto dtype) mutable { auto init = cudf::make_default_constructed_scalar(cudf::data_type{dtype}); - auto col = cudf::sequence(num_rows.count, *init); + init->set_valid_async(true); + auto col = cudf::sequence(num_rows.count, *init); auto [mask, count] = create_random_null_mask(num_rows.count, null_probability, seed_dist(seed_engine)); col->set_null_mask(std::move(mask), count); @@ -1063,12 +1054,10 @@ std::unique_ptr create_string_column(cudf::size_type num_rows, auto const num_matches = (static_cast(num_rows) * hit_rate) / 100; // Create a randomized gather-map to build a column out of the strings in data. 
- data_profile gather_profile = - data_profile_builder().cardinality(0).null_probability(0.0).distribution( - cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1); + data_profile gather_profile = data_profile_builder().cardinality(0).no_validity().distribution( + cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1); auto gather_table = create_random_table({cudf::type_id::INT32}, row_count{num_rows}, gather_profile); - gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); // Create scatter map by placing 0-index values throughout the gather-map auto scatter_data = cudf::sequence(num_matches, diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp index 57834fd11d2..a89fb2429bf 100644 --- a/cpp/benchmarks/common/generate_input.hpp +++ b/cpp/benchmarks/common/generate_input.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -238,8 +238,11 @@ class data_profile { double bool_probability_true = 0.5; std::optional null_probability = 0.01; - cudf::size_type cardinality = 2000; - cudf::size_type avg_run_length = 4; + cudf::size_type cardinality = + 2000; /// Upper bound on the number of unique values generated if `0 <= cardinality < n`, where + /// `n` is the total number of values to be generated. If `cardinality >= n`, n` unique + /// values of the requested data type are generated. + cudf::size_type avg_run_length = 4; public: template + #include #include #include #include -#include #include #include #include @@ -156,7 +157,7 @@ std::unique_ptr generate_orders_independent(double scale_factor, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); cudf::size_type const o_num_rows = scale_factor * 1'500'000; // Generate the `o_orderkey` column @@ -280,7 +281,7 @@ std::unique_ptr generate_lineitem_partial(cudf::table_view const& o rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const o_num_rows = orders_independent.num_rows(); // Generate the `lineitem` table. 
For each row in the `orders` table, // we have a random number (between 1 and 7) of rows in the `lineitem` table @@ -450,7 +451,7 @@ std::unique_ptr generate_orders_dependent(cudf::table_view const& l rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const l_linestatus_mask = lineitem_partial.column(0); auto const l_orderkey = lineitem_partial.column(1); auto const l_extendedprice = lineitem_partial.column(6); @@ -543,7 +544,7 @@ std::unique_ptr generate_partsupp(double scale_factor, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Define the number of rows in the `part` and `partsupp` tables cudf::size_type const p_num_rows = scale_factor * 200'000; cudf::size_type const ps_num_rows = scale_factor * 800'000; @@ -591,7 +592,7 @@ std::unique_ptr generate_part(double scale_factor, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); cudf::size_type const num_rows = scale_factor * 200'000; // Generate the `p_partkey` column @@ -717,7 +718,7 @@ generate_orders_lineitem_part(double scale_factor, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Generate a table with the independent columns of the `orders` table auto orders_independent = generate_orders_independent(scale_factor, stream, mr); @@ -784,7 +785,7 @@ std::unique_ptr generate_supplier(double scale_factor, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Calculate the number of rows based on the scale factor cudf::size_type const num_rows = scale_factor * 10'000; @@ -845,7 +846,7 @@ std::unique_ptr generate_customer(double scale_factor, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Calculate the number of rows based on the scale factor cudf::size_type const num_rows = scale_factor * 150'000; @@ -912,7 +913,7 @@ std::unique_ptr generate_customer(double scale_factor, std::unique_ptr generate_nation(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Define the number of rows constexpr cudf::size_type num_rows = 25; @@ -952,7 +953,7 @@ std::unique_ptr generate_nation(rmm::cuda_stream_view stream, std::unique_ptr generate_region(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Define the number of rows constexpr cudf::size_type num_rows = 5; diff --git a/cpp/benchmarks/common/ndsh_data_generator/random_column_generator.cu b/cpp/benchmarks/common/ndsh_data_generator/random_column_generator.cu index 4246bd1a83b..aac50aeecb2 100644 --- a/cpp/benchmarks/common/ndsh_data_generator/random_column_generator.cu +++ b/cpp/benchmarks/common/ndsh_data_generator/random_column_generator.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,11 +16,12 @@ #include "random_column_generator.hpp" +#include + #include #include #include -#include #include #include @@ -91,7 +92,7 @@ std::unique_ptr generate_random_string_column(cudf::size_type lowe rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto offsets_begin = cudf::detail::make_counting_transform_iterator( 0, random_number_generator(lower, upper)); auto [offsets_column, computed_bytes] = cudf::strings::detail::make_offsets_child_column( @@ -119,7 +120,7 @@ std::unique_ptr generate_random_numeric_column(T lower, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto col = cudf::make_numeric_column( cudf::data_type{cudf::type_to_id()}, num_rows, cudf::mask_state::UNALLOCATED, stream, mr); cudf::size_type begin = 0; @@ -165,7 +166,7 @@ std::unique_ptr generate_primary_key_column(cudf::scalar const& st rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); return cudf::sequence(num_rows, start, stream, mr); } @@ -174,7 +175,7 @@ std::unique_ptr generate_repeat_string_column(std::string const& v rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const scalar = cudf::string_scalar(value); return cudf::make_column_from_scalar(scalar, num_rows, stream, mr); } @@ -185,7 +186,7 @@ std::unique_ptr generate_random_string_column_from_set( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Build a gather map of random strings to choose from // The size of the string sets always fits within 16-bit integers auto const indices = @@ -211,7 +212,7 @@ std::unique_ptr generate_repeat_sequence_column(T seq_length, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto pkey = generate_primary_key_column(cudf::numeric_scalar(0), num_rows, stream, mr); auto repeat_seq_zero_indexed = cudf::binary_operation(pkey->view(), diff --git a/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp index 7095c227649..4185bcf60e5 100644 --- a/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp +++ b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp @@ -18,13 +18,14 @@ #include "random_column_generator.hpp" +#include + #include #include #include #include #include #include -#include #include #include #include @@ -55,7 +56,7 @@ std::unique_ptr add_calendrical_days(cudf::column_view const& time rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const days_duration_type = cudf::cast(days, cudf::data_type{cudf::type_id::DURATION_DAYS}); auto const data_type = cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}; return cudf::binary_operation( @@ -80,7 +81,7 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); constexpr auto oob_policy = cudf::out_of_bounds_policy::NULLIFY; auto const left_selected = left_input.select(left_on); auto const right_selected = right_input.select(right_on); @@ -116,7 +117,7 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + 
CUDF_BENCHMARK_RANGE(); // Expression: (90000 + ((p_partkey/10) modulo 20001) + 100 * (p_partkey modulo 1000)) / 100 auto table = cudf::table_view({p_partkey}); auto p_partkey_col_ref = cudf::ast::column_reference(0); @@ -160,7 +161,7 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Expression: (l_partkey + (i * (s/4 + (int)(l_partkey - 1)/s))) % s + 1 // Generate the `s` col @@ -232,7 +233,7 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Expression: ps_suppkey = (ps_partkey + (i * (s/4 + (int)(ps_partkey - 1)/s))) % s + 1 // Generate the `s` col @@ -299,7 +300,7 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const sum_agg = cudf::make_sum_aggregation(); auto const l_num_rows_scalar = cudf::reduce(o_rep_freqs, *sum_agg, cudf::data_type{cudf::type_id::INT32}, stream, mr); @@ -322,7 +323,7 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const one = cudf::numeric_scalar(1); auto const one_minus_discount = cudf::binary_operation( one, discount, cudf::binary_operator::SUB, cudf::data_type{cudf::type_id::FLOAT64}, stream, mr); @@ -352,7 +353,7 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu [[nodiscard]] std::unique_ptr generate_address_column( cudf::size_type num_rows, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); return generate_random_string_column(10, 40, num_rows, stream, mr); } @@ -367,7 +368,7 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const part_a = cudf::strings::from_integers( generate_random_numeric_column(10, 34, num_rows, stream, mr)->view()); auto const part_b = cudf::strings::from_integers( diff --git a/cpp/benchmarks/common/nvtx_ranges.hpp b/cpp/benchmarks/common/nvtx_ranges.hpp new file mode 100644 index 00000000000..edcb0ab358c --- /dev/null +++ b/cpp/benchmarks/common/nvtx_ranges.hpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cudf::benchmark { +/** + * @brief Tag type for the NVTX domain + */ +struct benchmark_domain { + static constexpr char const* name{"benchmarks"}; ///< Name of the domain +}; + +} // namespace cudf::benchmark + +/** + * @brief Convenience macro for generating an NVTX range in the `benchmarks` domain + * from the lifetime of a function. 
+ * + * Uses the name of the immediately enclosing function returned by `__func__` to + * name the range. + * + * Example: + * ``` + * void some_function(){ + * CUDF_BENCHMARK_RANGE(); + * ... + * } + * ``` + */ +#define CUDF_BENCHMARK_RANGE() NVTX3_FUNC_RANGE_IN(cudf::benchmark::benchmark_domain) diff --git a/cpp/benchmarks/filling/repeat.cpp b/cpp/benchmarks/filling/repeat.cpp index 0abef46acac..92559b50bec 100644 --- a/cpp/benchmarks/filling/repeat.cpp +++ b/cpp/benchmarks/filling/repeat.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,57 +15,49 @@ */ #include -#include -#include +#include #include -class Repeat : public cudf::benchmark {}; +#include -template -void BM_repeat(benchmark::State& state) +namespace { +template +void nvbench_repeat(nvbench::state& state, nvbench::type_list) { - auto const n_rows = static_cast(state.range(0)); - auto const n_cols = static_cast(state.range(1)); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const num_cols = static_cast(state.get_int64("num_cols")); + auto const nulls = state.get_int64("nulls"); auto const input_table = - create_sequence_table(cycle_dtypes({cudf::type_to_id()}, n_cols), - row_count{n_rows}, - nulls ? std::optional{1.0} : std::nullopt); + create_sequence_table(cycle_dtypes({cudf::type_to_id()}, num_cols), + row_count{num_rows}, + nulls ? std::optional{0.1} : std::nullopt); // Create table view - auto input = cudf::table_view(*input_table); + auto const input = input_table->view(); // repeat counts - using sizeT = cudf::size_type; data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( - cudf::type_to_id(), distribution_id::UNIFORM, 0, 3); - auto repeat_count = create_random_column(cudf::type_to_id(), row_count{n_rows}, profile); + cudf::type_to_id(), distribution_id::UNIFORM, 0, 3); + auto counts = + create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); - // warm up - auto output = cudf::repeat(input, *repeat_count); + auto output = cudf::repeat(input, counts->view()); - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf::repeat(input, *repeat_count); - } + state.add_global_memory_reads(input_table->alloc_size()); + state.add_global_memory_writes(output->alloc_size()); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - auto data_bytes = - (input.num_columns() * input.num_rows() + output->num_columns() * output->num_rows()) * - sizeof(TypeParam); - auto null_bytes = - nulls ? 
input.num_columns() * cudf::bitmask_allocation_size_bytes(input.num_rows()) + - output->num_columns() * cudf::bitmask_allocation_size_bytes(output->num_rows()) - : 0; - state.SetBytesProcessed(state.iterations() * (data_bytes + null_bytes)); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = cudf::repeat(input, counts->view()); }); } +} // namespace -#define REPEAT_BENCHMARK_DEFINE(name, type, nulls) \ - BENCHMARK_DEFINE_F(Repeat, name)(::benchmark::State & state) { BM_repeat(state); } \ - BENCHMARK_REGISTER_F(Repeat, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 10, 1 << 26}, {1, 8}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); +using Types = nvbench::type_list; -REPEAT_BENCHMARK_DEFINE(double_nulls, double, true); -REPEAT_BENCHMARK_DEFINE(double_no_nulls, double, false); +NVBENCH_BENCH_TYPES(nvbench_repeat, NVBENCH_TYPE_AXES(Types)) + .set_name("repeat") + .set_type_axes_names({"DataType"}) + .add_int64_power_of_two_axis("num_rows", {10, 14, 18, 22, 26}) + .add_int64_axis("num_cols", {1, 2, 4, 8}) + .add_int64_axis("nulls", {0, 1}); diff --git a/cpp/benchmarks/filter/minmax_filter.cpp b/cpp/benchmarks/filter/minmax_filter.cpp new file mode 100644 index 00000000000..1960922f5be --- /dev/null +++ b/cpp/benchmarks/filter/minmax_filter.cpp @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +namespace { + +template +struct benchmark_data; + +template +struct benchmark_data { + static T dist_min() { return 0; } + + static T dist_max() { return 1; } + + static T filter_min() { return 0.05; } + + static T filter_max() { return 0.07; } +}; + +template +struct benchmark_data { + static T dist_min() { return -128; } + + static T dist_max() { return 127; } + + static T filter_min() { return -64; } + + static T filter_max() { return 64; } +}; + +enum class engine_type : uint8_t { AST = 0, JIT = 1 }; + +engine_type engine_from_string(std::string_view str) +{ + if (str == "ast") { + return engine_type::AST; + } else if (str == "jit") { + return engine_type::JIT; + } else { + CUDF_FAIL("unrecognized engine enum: " + std::string(str)); + } +} + +bool boolean_from_string(std::string_view str) +{ + if (str == "true") { + return true; + } else if (str == "false") { + return false; + } else { + CUDF_FAIL("unrecognized boolean value: " + std::string(str)); + } +} + +template +void BM_filter_min_max(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const engine_name = state.get_string("engine"); + auto const nullable = boolean_from_string(state.get_string("nullable")); + auto const engine = engine_from_string(engine_name); + + auto profile = data_profile{}; + profile.set_distribution_params(cudf::type_to_id(), + distribution_id::NORMAL, + benchmark_data::dist_min(), + benchmark_data::dist_max()); + profile.set_null_probability(nullable ? std::optional{0.3} : std::nullopt); + + auto const column = + create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + + std::string type_name = cudf::type_to_name(cudf::data_type{cudf::type_to_id()}); + + auto udf = std::format( + R"***( + __device__ void transform(bool * out, {0} c0, {0} min, {0} max) {{ + *out = (c0 >= min && c0 <= max); + }} + )***", + type_name); + + auto tree = cudf::ast::tree{}; + auto min_scalar = cudf::numeric_scalar{benchmark_data::filter_min()}; + auto max_scalar = cudf::numeric_scalar{benchmark_data::filter_max()}; + auto min_scalar_column = cudf::make_column_from_scalar(min_scalar, 1); + auto max_scalar_column = cudf::make_column_from_scalar(max_scalar, 1); + + { + auto& column_ref = tree.push(cudf::ast::column_reference{0}); + auto& min_literal = tree.push(cudf::ast::literal{min_scalar}); + auto& max_literal = tree.push(cudf::ast::literal{max_scalar}); + auto& filter_min = tree.push( + cudf::ast::operation{cudf::ast::ast_operator::GREATER_EQUAL, column_ref, min_literal}); + auto& filter_max = + tree.push(cudf::ast::operation{cudf::ast::ast_operator::LESS_EQUAL, column_ref, max_literal}); + tree.push(cudf::ast::operation{cudf::ast::ast_operator::LOGICAL_AND, filter_min, filter_max}); + } + + std::vector predicate_columns; + predicate_columns.push_back(column->view()); + predicate_columns.push_back(min_scalar_column->view()); + predicate_columns.push_back(max_scalar_column->view()); + + // Use the number of bytes read from global memory + state.add_global_memory_reads(static_cast(num_rows)); + state.add_global_memory_writes(num_rows); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto stream = launch.get_stream().get_stream(); + auto mr = cudf::get_current_device_resource_ref(); + + switch (engine) { + case engine_type::AST: { + auto input_table = cudf::table_view{{column->view()}}; + auto const 
filter_boolean = cudf::compute_column(input_table, tree.back(), stream, mr); + auto const result = + cudf::apply_boolean_mask(input_table, filter_boolean->view(), stream, mr); + } break; + case engine_type::JIT: { + auto result = cudf::filter(predicate_columns, + udf, + std::vector{predicate_columns[0]}, + false, + std::nullopt, + cudf::null_aware::NO, + stream, + mr); + } break; + default: CUDF_UNREACHABLE("Unrecognised engine type requested"); + } + }); +} + +#define FILTER_BENCHMARK_DEFINE(name, key_type) \ + static void name(::nvbench::state& st) { ::BM_filter_min_max(st); } \ + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_string_axis("engine", {"ast", "jit"}) \ + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}) \ + .add_string_axis("nullable", {"true", "false"}) +} // namespace + +FILTER_BENCHMARK_DEFINE(filter_min_max_int32, int32_t); +FILTER_BENCHMARK_DEFINE(filter_min_max_int64, int64_t); +FILTER_BENCHMARK_DEFINE(filter_min_max_float32, float); +FILTER_BENCHMARK_DEFINE(filter_min_max_float64, double); diff --git a/cpp/benchmarks/fixture/nvbench_fixture.hpp b/cpp/benchmarks/fixture/nvbench_fixture.hpp index 63f09285a26..c67a281ff96 100644 --- a/cpp/benchmarks/fixture/nvbench_fixture.hpp +++ b/cpp/benchmarks/fixture/nvbench_fixture.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include #include @@ -97,6 +98,8 @@ struct nvbench_base_fixture { nvbench_base_fixture(int argc, char const* const* argv) { + cudf::initialize(cudf::init_flags::ALL); + for (int i = 1; i < argc - 1; ++i) { std::string arg = argv[i]; if (arg == detail::rmm_mode_param) { diff --git a/cpp/benchmarks/groupby/group_complex_keys.cpp b/cpp/benchmarks/groupby/group_complex_keys.cpp new file mode 100644 index 00000000000..8717f0f50d8 --- /dev/null +++ b/cpp/benchmarks/groupby/group_complex_keys.cpp @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include + +#include +#include +#include + +#include + +#include + +namespace { + +auto generate_int_keys(cudf::size_type num_cols, + cudf::size_type num_rows, + cudf::size_type value_key_ratio, + double null_probability) +{ + auto const create_column = [&] { + auto builder = + data_profile_builder() + .cardinality(num_rows / value_key_ratio) + .distribution(cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + if (null_probability > 0) { + builder.null_probability(null_probability); + } else { + builder.no_validity(); + } + return create_random_column( + cudf::type_to_id(), row_count{num_rows}, data_profile{builder}); + }; + std::vector> cols; + cols.reserve(num_cols); + for (cudf::size_type i = 0; i < num_cols; ++i) { + cols.emplace_back(create_column()); + } + return std::make_unique(std::move(cols)); +} + +auto generate_mixed_types_keys(cudf::size_type num_cols, + cudf::size_type num_rows, + cudf::size_type value_key_ratio, + double null_probability) +{ + constexpr auto max_str_length = 50; + constexpr auto max_list_size = 10; + constexpr auto nested_depth = 2; + + auto builder = data_profile_builder() + .cardinality(num_rows / value_key_ratio) + .distribution(cudf::type_id::INT32, distribution_id::UNIFORM, 0, num_rows) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length) + .distribution(cudf::type_id::INT64, distribution_id::UNIFORM, 0, num_rows) + .distribution(cudf::type_id::LIST, distribution_id::UNIFORM, 0, max_list_size) + .list_depth(nested_depth) + .struct_depth(nested_depth); + if (null_probability > 0) { + builder.null_probability(null_probability); + } else { + builder.no_validity(); + } + + return create_random_table(cycle_dtypes({cudf::type_id::INT32, + cudf::type_id::STRING, + cudf::type_id::INT64, + cudf::type_id::LIST, + cudf::type_id::STRUCT}, + num_cols), + row_count{num_rows}, + data_profile{builder}); +} + +auto generate_vals(cudf::size_type num_rows, double null_probability) +{ + using Type = int64_t; + auto builder = data_profile_builder().cardinality(0).distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + if (null_probability > 0) { + builder.null_probability(null_probability); + } else { + builder.no_validity(); + } + return create_random_column(cudf::type_to_id(), row_count{num_rows}, data_profile{builder}); +} + +template +void run_benchmark_complex_keys(nvbench::state& state) +{ + auto const n_cols = static_cast(state.get_int64("num_cols")); + auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const value_key_ratio = static_cast(state.get_int64("value_key_ratio")); + auto const null_probability = state.get_float64("null_probability"); + + auto const keys_table = [&] { + if constexpr (is_int_keys) { + return generate_int_keys(n_cols, n_rows, value_key_ratio, null_probability); + } else { + return generate_mixed_types_keys(n_cols, n_rows, value_key_ratio, null_probability); + } + }(); + auto const vals = generate_vals(n_rows, null_probability); + + cudf::groupby::groupby gb_obj(keys_table->view()); + + std::vector requests; + requests.emplace_back(cudf::groupby::aggregation_request()); + requests[0].values = vals->view(); + requests[0].aggregations.push_back(cudf::make_min_aggregation()); + + auto const mem_stats_logger = cudf::memory_stats_logger(); + auto const stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { + [[maybe_unused]] 
auto const result = gb_obj.aggregate(requests, stream); + }); + + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); +} + +} // namespace + +void bench_groupby_int_keys(nvbench::state& state) { run_benchmark_complex_keys(state); } +void bench_groupby_mixed_types_keys(nvbench::state& state) +{ + run_benchmark_complex_keys(state); +} + +#define RUN_BENCH(bench_func) \ + NVBENCH_BENCH(bench_func) \ + .set_name(#bench_func) \ + .add_int64_power_of_two_axis("num_rows", {12, 18, 24}) \ + .add_int64_axis("value_key_ratio", {20, 200}) \ + .add_float64_axis("null_probability", {0, 0.5}) + +RUN_BENCH(bench_groupby_int_keys).add_int64_axis("num_cols", {1, 2, 4, 8, 16}); + +// Not enough memory for more mixed types columns. +RUN_BENCH(bench_groupby_mixed_types_keys).add_int64_axis("num_cols", {1, 2, 3, 4, 5}); diff --git a/cpp/benchmarks/groupby/group_m2_var_std.cpp b/cpp/benchmarks/groupby/group_m2_var_std.cpp new file mode 100644 index 00000000000..fbe6ed3cf78 --- /dev/null +++ b/cpp/benchmarks/groupby/group_m2_var_std.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include + +namespace { + +template +void run_benchmark(nvbench::state& state, + cudf::size_type num_rows, + cudf::size_type value_key_ratio, + double null_probability) +{ + auto const keys = [&] { + data_profile const profile = + data_profile_builder() + .cardinality(num_rows / value_key_ratio) + .no_validity() + .distribution(cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + return create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + }(); + + auto const values = [&] { + auto builder = data_profile_builder().cardinality(0).distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + if (null_probability > 0) { + builder.null_probability(null_probability); + } else { + builder.no_validity(); + } + return create_random_column( + cudf::type_to_id(), row_count{num_rows}, data_profile{builder}); + }(); + + // Vector of 1 request + std::vector requests(1); + requests.back().values = values->view(); + if constexpr (Agg == cudf::aggregation::Kind::M2) { + requests.back().aggregations.push_back(cudf::make_m2_aggregation()); + } else if constexpr (Agg == cudf::aggregation::Kind::VARIANCE) { + requests.back().aggregations.push_back( + cudf::make_variance_aggregation()); + } else if constexpr (Agg == cudf::aggregation::Kind::STD) { + requests.back().aggregations.push_back(cudf::make_std_aggregation()); + } else { + throw std::runtime_error("Unsupported aggregation kind."); + } + + auto const mem_stats_logger = cudf::memory_stats_logger(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { + auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys->view()})); + [[maybe_unused]] auto const result = 
gb_obj.aggregate(requests); + }); + + auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(num_rows) / elapsed_time, "rows/s"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); +} + +} // namespace + +template +void bench_groupby_m2_var_std(nvbench::state& state, + nvbench::type_list>) +{ + auto const value_key_ratio = static_cast(state.get_int64("value_key_ratio")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const null_probability = state.get_float64("null_probability"); + + run_benchmark(state, num_rows, value_key_ratio, null_probability); +} + +using Types = nvbench::type_list; +using AggKinds = nvbench::enum_type_list; + +NVBENCH_BENCH_TYPES(bench_groupby_m2_var_std, NVBENCH_TYPE_AXES(Types, AggKinds)) + .set_name("groupby_m2_var_std") + .add_int64_axis("value_key_ratio", {20, 100}) + .add_int64_axis("num_rows", {100'000, 10'000'000, 100'000'000}) + .add_float64_axis("null_probability", {0, 0.5}); diff --git a/cpp/benchmarks/groupby/group_no_requests.cpp b/cpp/benchmarks/groupby/group_no_requests.cpp index 34618acec75..bee307bff66 100644 --- a/cpp/benchmarks/groupby/group_no_requests.cpp +++ b/cpp/benchmarks/groupby/group_no_requests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,78 +15,61 @@ */ #include -#include -#include -#include #include #include #include #include -class Groupby : public cudf::benchmark {}; +#include -void BM_basic_no_requests(benchmark::State& state) +static void bench_groupby_no_requests(nvbench::state& state) { - cudf::size_type const column_size{(cudf::size_type)state.range(0)}; + auto const num_rows = static_cast(state.get_int64("num_rows")); data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); auto keys_table = - create_random_table({cudf::type_to_id()}, row_count{column_size}, profile); + create_random_table({cudf::type_to_id()}, row_count{num_rows}, profile); std::vector requests; - for (auto _ : state) { - cuda_event_timer timer(state, true); + state.add_global_memory_reads(keys_table->alloc_size()); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { cudf::groupby::groupby gb_obj(*keys_table); auto result = gb_obj.aggregate(requests); - } -} - -BENCHMARK_DEFINE_F(Groupby, BasicNoRequest)(::benchmark::State& state) -{ - BM_basic_no_requests(state); + }); } -BENCHMARK_REGISTER_F(Groupby, BasicNoRequest) - ->UseManualTime() - ->Unit(benchmark::kMillisecond) - ->Arg(10000) - ->Arg(1000000) - ->Arg(10000000) - ->Arg(100000000); +NVBENCH_BENCH(bench_groupby_no_requests) + .set_name("no_requests") + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}); -void BM_pre_sorted_no_requests(benchmark::State& state) +static void bench_groupby_pre_sorted_no_requests(nvbench::state& state) { - cudf::size_type const column_size{(cudf::size_type)state.range(0)}; + auto const num_rows = static_cast(state.get_int64("num_rows")); data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 
100); auto keys_table = - create_random_table({cudf::type_to_id()}, row_count{column_size}, profile); + create_random_table({cudf::type_to_id()}, row_count{num_rows}, profile); auto sort_order = cudf::sorted_order(*keys_table); auto sorted_keys = cudf::gather(*keys_table, *sort_order); // No need to sort values using sort_order because they were generated randomly std::vector requests; + state.add_global_memory_reads(keys_table->alloc_size()); - for (auto _ : state) { - cuda_event_timer timer(state, true); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { cudf::groupby::groupby gb_obj(*sorted_keys, cudf::null_policy::EXCLUDE, cudf::sorted::YES); auto result = gb_obj.aggregate(requests); - } -} - -BENCHMARK_DEFINE_F(Groupby, PreSortedNoRequests)(::benchmark::State& state) -{ - BM_pre_sorted_no_requests(state); + }); } -BENCHMARK_REGISTER_F(Groupby, PreSortedNoRequests) - ->UseManualTime() - ->Unit(benchmark::kMillisecond) - ->Arg(1000000) - ->Arg(10000000) - ->Arg(100000000); +NVBENCH_BENCH(bench_groupby_pre_sorted_no_requests) + .set_name("pre_sorted_no_requests") + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}); diff --git a/cpp/benchmarks/groupby/group_nth.cpp b/cpp/benchmarks/groupby/group_nth.cpp index f2c24433858..4a86d9ad1fd 100644 --- a/cpp/benchmarks/groupby/group_nth.cpp +++ b/cpp/benchmarks/groupby/group_nth.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,27 +15,24 @@ */ #include -#include -#include -#include #include #include #include #include -class Groupby : public cudf::benchmark {}; +#include -void BM_pre_sorted_nth(benchmark::State& state) +static void bench_groupby_nth(nvbench::state& state) { // const cudf::size_type num_columns{(cudf::size_type)state.range(0)}; - cudf::size_type const column_size{(cudf::size_type)state.range(0)}; + auto const num_rows = static_cast(state.get_int64("num_rows")); data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); auto keys_table = - create_random_table({cudf::type_to_id()}, row_count{column_size}, profile); - auto vals = create_random_column(cudf::type_to_id(), row_count{column_size}, profile); + create_random_table({cudf::type_to_id()}, row_count{num_rows}, profile); + auto vals = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); auto sort_order = cudf::sorted_order(*keys_table); auto sorted_keys = cudf::gather(*keys_table, *sort_order); @@ -49,17 +46,15 @@ void BM_pre_sorted_nth(benchmark::State& state) requests[0].aggregations.push_back( cudf::make_nth_element_aggregation(-1)); - for (auto _ : state) { - cuda_event_timer timer(state, true); - auto result = gb_obj.aggregate(requests); - } -} + state.add_global_memory_reads(vals->alloc_size()); + auto groups = gb_obj.get_groups(); + state.add_global_memory_writes(groups.keys->alloc_size()); -BENCHMARK_DEFINE_F(Groupby, PreSortedNth)(::benchmark::State& state) { BM_pre_sorted_nth(state); } + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = gb_obj.aggregate(requests); }); +} 
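// --------------------------------------------------------------------------
// A minimal, self-contained sketch of the nvbench pattern these groupby
// benchmark conversions follow (axis lookup, input generation via the shared
// data_profile helpers, global-memory annotations, explicit default stream,
// and a synchronous exec). `bench_example_sum`, its axis values, and the
// chosen distribution bounds are illustrative assumptions only, not an
// existing benchmark in this change.
#include <benchmarks/common/generate_input.hpp>

#include <cudf/aggregation.hpp>
#include <cudf/groupby.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <nvbench/nvbench.cuh>

#include <vector>

static void bench_example_sum(nvbench::state& state)
{
  // Axis values come from the NVBENCH_BENCH registration below.
  auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));

  // Deterministic random inputs built with the shared benchmark helpers.
  data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
    cudf::type_to_id<int64_t>(), distribution_id::UNIFORM, 0, 100);
  auto keys = create_random_column(cudf::type_to_id<int64_t>(), row_count{num_rows}, profile);
  auto vals = create_random_column(cudf::type_to_id<int64_t>(), row_count{num_rows}, profile);

  cudf::groupby::groupby gb_obj(cudf::table_view({keys->view()}));
  std::vector<cudf::groupby::aggregation_request> requests(1);
  requests[0].values = vals->view();
  requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());

  // Report bytes read so nvbench can derive throughput, and time the work on
  // libcudf's default stream with a synchronizing executor.
  state.add_global_memory_reads<nvbench::int8_t>(vals->alloc_size());
  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
    [[maybe_unused]] auto const result = gb_obj.aggregate(requests);
  });
}

NVBENCH_BENCH(bench_example_sum)
  .set_name("example_sum")
  .add_int64_axis("num_rows", {1'000'000, 10'000'000});
// --------------------------------------------------------------------------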
-BENCHMARK_REGISTER_F(Groupby, PreSortedNth) - ->UseManualTime() - ->Unit(benchmark::kMillisecond) - ->Arg(1000000) /* 1M */ - ->Arg(10000000) /* 10M */ - ->Arg(100000000); /* 100M */ +NVBENCH_BENCH(bench_groupby_nth) + .set_name("nth") + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}); diff --git a/cpp/benchmarks/groupby/group_scan.cpp b/cpp/benchmarks/groupby/group_scan.cpp index 2ae5b6fc2b8..2c470073138 100644 --- a/cpp/benchmarks/groupby/group_scan.cpp +++ b/cpp/benchmarks/groupby/group_scan.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,9 +15,6 @@ */ #include -#include -#include -#include #include #include @@ -25,16 +22,16 @@ #include #include -class Groupby : public cudf::benchmark {}; +#include -void BM_basic_sum_scan(benchmark::State& state) +static void bench_groupby_sum_scan(nvbench::state& state) { - cudf::size_type const column_size{(cudf::size_type)state.range(0)}; + auto const num_rows = static_cast(state.get_int64("num_rows")); data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); - auto keys = create_random_column(cudf::type_to_id(), row_count{column_size}, profile); - auto vals = create_random_column(cudf::type_to_id(), row_count{column_size}, profile); + auto keys = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + auto vals = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); cudf::groupby::groupby gb_obj(cudf::table_view({keys->view(), keys->view(), keys->view()})); @@ -43,32 +40,25 @@ void BM_basic_sum_scan(benchmark::State& state) requests[0].values = vals->view(); requests[0].aggregations.push_back(cudf::make_sum_aggregation()); - for (auto _ : state) { - cuda_event_timer timer(state, true); - - auto result = gb_obj.scan(requests); - } + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = gb_obj.scan(requests); }); } -BENCHMARK_DEFINE_F(Groupby, BasicSumScan)(::benchmark::State& state) { BM_basic_sum_scan(state); } - -BENCHMARK_REGISTER_F(Groupby, BasicSumScan) - ->UseManualTime() - ->Unit(benchmark::kMillisecond) - ->Arg(1000000) - ->Arg(10000000) - ->Arg(100000000); +NVBENCH_BENCH(bench_groupby_sum_scan) + .set_name("sum_scan") + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}); -void BM_pre_sorted_sum_scan(benchmark::State& state) +static void bench_groupby_pre_sorted_sum_scan(nvbench::state& state) { - cudf::size_type const column_size{(cudf::size_type)state.range(0)}; + auto const num_rows = static_cast(state.get_int64("num_rows")); data_profile profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); auto keys_table = - create_random_table({cudf::type_to_id()}, row_count{column_size}, profile); + create_random_table({cudf::type_to_id()}, row_count{num_rows}, profile); profile.set_null_probability(0.1); - auto vals = create_random_column(cudf::type_to_id(), row_count{column_size}, profile); + auto vals = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); auto sort_order = cudf::sorted_order(*keys_table); auto sorted_keys = 
cudf::gather(*keys_table, *sort_order); @@ -81,21 +71,11 @@ void BM_pre_sorted_sum_scan(benchmark::State& state) requests[0].values = vals->view(); requests[0].aggregations.push_back(cudf::make_sum_aggregation()); - for (auto _ : state) { - cuda_event_timer timer(state, true); - - auto result = gb_obj.scan(requests); - } -} - -BENCHMARK_DEFINE_F(Groupby, PreSortedSumScan)(::benchmark::State& state) -{ - BM_pre_sorted_sum_scan(state); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = gb_obj.scan(requests); }); } -BENCHMARK_REGISTER_F(Groupby, PreSortedSumScan) - ->UseManualTime() - ->Unit(benchmark::kMillisecond) - ->Arg(1000000) - ->Arg(10000000) - ->Arg(100000000); +NVBENCH_BENCH(bench_groupby_pre_sorted_sum_scan) + .set_name("pre_sorted_sum_scan") + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}); diff --git a/cpp/benchmarks/groupby/group_shift.cpp b/cpp/benchmarks/groupby/group_shift.cpp index eda2b3dd158..ea058412e06 100644 --- a/cpp/benchmarks/groupby/group_shift.cpp +++ b/cpp/benchmarks/groupby/group_shift.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,20 +15,17 @@ */ #include -#include -#include -#include #include #include #include #include -class Groupby : public cudf::benchmark {}; +#include -void BM_group_shift(benchmark::State& state) +static void bench_groupby_shift(nvbench::state& state) { - cudf::size_type const column_size{(cudf::size_type)state.range(0)}; + auto const num_rows = static_cast(state.get_int64("num_rows")); int const num_groups = 100; data_profile const profile = @@ -36,30 +33,26 @@ void BM_group_shift(benchmark::State& state) cudf::type_to_id(), distribution_id::UNIFORM, 0, num_groups); auto keys_table = - create_random_table({cudf::type_to_id()}, row_count{column_size}, profile); + create_random_table({cudf::type_to_id()}, row_count{num_rows}, profile); auto vals_table = - create_random_table({cudf::type_to_id()}, row_count{column_size}, profile); + create_random_table({cudf::type_to_id()}, row_count{num_rows}, profile); - cudf::groupby::groupby gb_obj(*keys_table); + cudf::groupby::groupby gb_obj(keys_table->view()); std::vector offsets{ - static_cast(column_size / float(num_groups) * 0.5)}; // forward shift half way + static_cast(num_rows / float(num_groups) * 0.5)}; // forward shift half way // null fill value auto fill_value = cudf::make_default_constructed_scalar(cudf::data_type(cudf::type_id::INT64)); - // non null fill value - // auto fill_value = cudf::make_fixed_width_scalar(static_cast(42)); - for (auto _ : state) { - cuda_event_timer timer(state, true); + state.add_global_memory_reads(keys_table->alloc_size()); + state.add_global_memory_writes(vals_table->alloc_size()); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = gb_obj.shift(*vals_table, offsets, {*fill_value}); - } + }); } -BENCHMARK_DEFINE_F(Groupby, Shift)(::benchmark::State& state) { BM_group_shift(state); } - -BENCHMARK_REGISTER_F(Groupby, Shift) - ->Arg(1000000) - ->Arg(10000000) - ->Arg(100000000) - ->UseManualTime() - ->Unit(benchmark::kMillisecond); +NVBENCH_BENCH(bench_groupby_shift) + 
.set_name("shift") + .add_int64_axis("num_rows", {1'000'000, 10'000'000, 100'000'000}); diff --git a/cpp/benchmarks/groupby/group_struct_values.cpp b/cpp/benchmarks/groupby/group_struct_values.cpp index 024fd3708fd..b0afbeeaa3f 100644 --- a/cpp/benchmarks/groupby/group_struct_values.cpp +++ b/cpp/benchmarks/groupby/group_struct_values.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,14 +15,14 @@ */ #include -#include -#include #include #include #include #include +#include + static constexpr cudf::size_type num_struct_members = 8; static constexpr cudf::size_type max_int = 100; static constexpr cudf::size_type max_str_length = 32; @@ -44,51 +44,57 @@ static auto create_data_table(cudf::size_type n_rows) } // Max aggregation/scan technically has the same performance as min. -template -void BM_groupby_min_struct(benchmark::State& state) +static void bench_groupby_min_struct(nvbench::state& state) { - auto const n_rows = static_cast(state.range(0)); - auto data_cols = create_data_table(n_rows)->release(); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto data_cols = create_data_table(num_rows)->release(); auto const keys_view = data_cols.front()->view(); auto const values = cudf::make_structs_column(keys_view.size(), std::move(data_cols), 0, rmm::device_buffer()); - using RequestType = std::conditional_t, - cudf::groupby::aggregation_request, - cudf::groupby::scan_request>; - auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys_view})); - auto requests = std::vector(); - requests.emplace_back(RequestType()); + auto requests = std::vector(); + requests.emplace_back(cudf::groupby::aggregation_request()); + requests.front().aggregations.push_back(cudf::make_min_aggregation()); requests.front().values = values->view(); - requests.front().aggregations.push_back(cudf::make_min_aggregation()); - - for (auto _ : state) { - [[maybe_unused]] auto const timer = cuda_event_timer(state, true); - if constexpr (std::is_same_v) { - [[maybe_unused]] auto const result = gb_obj.aggregate(requests); - } else { - [[maybe_unused]] auto const result = gb_obj.scan(requests); - } - } + + state.add_global_memory_reads(values->alloc_size()); + state.add_global_memory_writes(values->alloc_size()); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = gb_obj.aggregate(requests); }); } -class Groupby : public cudf::benchmark {}; +NVBENCH_BENCH(bench_groupby_min_struct) + .set_name("min_struct") + .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}); + +static void bench_groupby_min_struct_scan(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto data_cols = create_data_table(num_rows)->release(); + + auto const keys_view = data_cols.front()->view(); + auto const values = + cudf::make_structs_column(keys_view.size(), std::move(data_cols), 0, rmm::device_buffer()); -#define MIN_RANGE 10'000 -#define MAX_RANGE 10'000'000 + auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys_view})); + auto requests = std::vector(); + requests.emplace_back(cudf::groupby::scan_request()); + requests.front().aggregations.push_back( + cudf::make_min_aggregation()); + requests.front().values = 
values->view(); + + state.add_global_memory_reads(values->alloc_size()); + state.add_global_memory_writes(values->alloc_size()); -#define REGISTER_BENCHMARK(name, op_type) \ - BENCHMARK_DEFINE_F(Groupby, name)(::benchmark::State & state) \ - { \ - BM_groupby_min_struct(state); \ - } \ - BENCHMARK_REGISTER_F(Groupby, name) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond) \ - ->RangeMultiplier(4) \ - ->Ranges({{MIN_RANGE, MAX_RANGE}}); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = gb_obj.scan(requests); }); +} -REGISTER_BENCHMARK(Aggregation, cudf::groupby_aggregation) -REGISTER_BENCHMARK(Scan, cudf::groupby_scan_aggregation) +NVBENCH_BENCH(bench_groupby_min_struct_scan) + .set_name("min_struct_scan") + .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}); diff --git a/cpp/benchmarks/groupby/group_sum.cpp b/cpp/benchmarks/groupby/group_sum.cpp index b3fd881ccbc..d4fdb612323 100644 --- a/cpp/benchmarks/groupby/group_sum.cpp +++ b/cpp/benchmarks/groupby/group_sum.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,25 +15,22 @@ */ #include -#include -#include -#include #include #include #include #include -class Groupby : public cudf::benchmark {}; +#include -void BM_basic_sum(benchmark::State& state) +static void bench_groupby_basic_sum(nvbench::state& state) { - cudf::size_type const column_size{(cudf::size_type)state.range(0)}; + auto const num_rows = static_cast(state.get_int64("num_rows")); data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); - auto keys = create_random_column(cudf::type_to_id(), row_count{column_size}, profile); - auto vals = create_random_column(cudf::type_to_id(), row_count{column_size}, profile); + auto keys = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + auto vals = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); cudf::groupby::groupby gb_obj(cudf::table_view({keys->view(), keys->view(), keys->view()})); @@ -42,33 +39,29 @@ void BM_basic_sum(benchmark::State& state) requests[0].values = vals->view(); requests[0].aggregations.push_back(cudf::make_sum_aggregation()); - for (auto _ : state) { - cuda_event_timer timer(state, true); + state.add_global_memory_reads(vals->alloc_size()); + auto groups = gb_obj.get_groups(); + state.add_global_memory_writes(groups.keys->alloc_size()); - auto result = gb_obj.aggregate(requests); - } + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); } -BENCHMARK_DEFINE_F(Groupby, Basic)(::benchmark::State& state) { BM_basic_sum(state); } +NVBENCH_BENCH(bench_groupby_basic_sum) + .set_name("sum") + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}); -BENCHMARK_REGISTER_F(Groupby, Basic) - ->UseManualTime() - ->Unit(benchmark::kMillisecond) - ->Arg(10000) - ->Arg(1000000) - ->Arg(10000000) - ->Arg(100000000); - -void BM_pre_sorted_sum(benchmark::State& state) +static void bench_groupby_pre_sorted_sum(nvbench::state& state) { - 
cudf::size_type const column_size{(cudf::size_type)state.range(0)}; + auto const num_rows = static_cast(state.get_int64("num_rows")); data_profile profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); auto keys_table = - create_random_table({cudf::type_to_id()}, row_count{column_size}, profile); + create_random_table({cudf::type_to_id()}, row_count{num_rows}, profile); profile.set_null_probability(0.1); - auto vals = create_random_column(cudf::type_to_id(), row_count{column_size}, profile); + auto vals = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); auto sort_order = cudf::sorted_order(*keys_table); auto sorted_keys = cudf::gather(*keys_table, *sort_order); @@ -81,18 +74,15 @@ void BM_pre_sorted_sum(benchmark::State& state) requests[0].values = vals->view(); requests[0].aggregations.push_back(cudf::make_sum_aggregation()); - for (auto _ : state) { - cuda_event_timer timer(state, true); + state.add_global_memory_reads(vals->alloc_size()); + auto groups = gb_obj.get_groups(); + state.add_global_memory_writes(groups.keys->alloc_size()); - auto result = gb_obj.aggregate(requests); - } + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); } -BENCHMARK_DEFINE_F(Groupby, PreSorted)(::benchmark::State& state) { BM_pre_sorted_sum(state); } - -BENCHMARK_REGISTER_F(Groupby, PreSorted) - ->UseManualTime() - ->Unit(benchmark::kMillisecond) - ->Arg(1000000) - ->Arg(10000000) - ->Arg(100000000); +NVBENCH_BENCH(bench_groupby_pre_sorted_sum) + .set_name("pre_sorted_sum") + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}); diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp index cafd3cc5c39..6a6b48bebac 100644 --- a/cpp/benchmarks/io/orc/orc_reader_input.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -87,14 +87,14 @@ void orc_read_common(cudf::size_type num_rows_to_read, } // namespace -template -void BM_orc_read_data(nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +void BM_orc_read_data(nvbench::state& state, nvbench::type_list>) { auto const d_type = get_type_or_group(static_cast(DataType)); cudf::size_type const cardinality = state.get_int64("cardinality"); cudf::size_type const run_length = state.get_int64("run_length"); - cuio_source_sink_pair source_sink(IOType); + auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); + cuio_source_sink_pair source_sink(source_type); auto const num_rows_written = [&]() { auto const tbl = create_random_table( @@ -112,16 +112,18 @@ void BM_orc_read_data(nvbench::state& state, orc_read_common(num_rows_written, source_sink, state); } -template +template void orc_read_io_compression(nvbench::state& state) { - auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL_SIGNED), - static_cast(data_type::FLOAT), - static_cast(data_type::DECIMAL), - static_cast(data_type::TIMESTAMP), - static_cast(data_type::STRING), - static_cast(data_type::LIST), - static_cast(data_type::STRUCT)}); + auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); + auto const compression = retrieve_compression_type_enum(state.get_string("compression_type")); + auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL_SIGNED), + static_cast(data_type::FLOAT), + static_cast(data_type::DECIMAL), + static_cast(data_type::TIMESTAMP), + static_cast(data_type::STRING), + static_cast(data_type::LIST), + static_cast(data_type::STRUCT)}); auto const [cardinality, run_length] = [&]() -> std::pair { if constexpr (chunked_read) { @@ -131,7 +133,7 @@ void orc_read_io_compression(nvbench::state& state) static_cast(state.get_int64("run_length"))}; } }(); - cuio_source_sink_pair source_sink(IOType); + cuio_source_sink_pair source_sink(source_type); auto const num_rows_written = [&]() { auto const tbl = create_random_table( @@ -142,7 +144,7 @@ void orc_read_io_compression(nvbench::state& state) cudf::io::orc_writer_options opts = cudf::io::orc_writer_options::builder(source_sink.make_sink_info(), view) - .compression(Compression); + .compression(compression); cudf::io::write_orc(opts); return view.num_rows(); }(); @@ -150,20 +152,14 @@ void orc_read_io_compression(nvbench::state& state) orc_read_common(num_rows_written, source_sink, state); } -template -void BM_orc_read_io_compression( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +void BM_orc_read_io_compression(nvbench::state& state) { - return orc_read_io_compression(state); + return orc_read_io_compression(state); } -template -void BM_orc_chunked_read_io_compression(nvbench::state& state, - nvbench::type_list>) +void BM_orc_chunked_read_io_compression(nvbench::state& state) { - // Only run benchmark using HOST_BUFFER IO. 
- return orc_read_io_compression(state); + return orc_read_io_compression(state); } using d_type_list = nvbench::enum_type_list; -using io_list = - nvbench::enum_type_list; - -using compression_list = - nvbench::enum_type_list; - -NVBENCH_BENCH_TYPES(BM_orc_read_data, - NVBENCH_TYPE_AXES(d_type_list, nvbench::enum_type_list)) +NVBENCH_BENCH_TYPES(BM_orc_read_data, NVBENCH_TYPE_AXES(d_type_list)) .set_name("orc_read_decode") - .set_type_axes_names({"data_type", "io"}) + .set_type_axes_names({"data_type"}) + .add_string_axis("io_type", {"DEVICE_BUFFER"}) .set_min_samples(4) .add_int64_axis("cardinality", {0, 1000}) .add_int64_axis("run_length", {1, 32}); -NVBENCH_BENCH_TYPES(BM_orc_read_io_compression, NVBENCH_TYPE_AXES(io_list, compression_list)) +NVBENCH_BENCH(BM_orc_read_io_compression) .set_name("orc_read_io_compression") - .set_type_axes_names({"io", "compression"}) + .add_string_axis("io_type", {"FILEPATH", "HOST_BUFFER", "DEVICE_BUFFER"}) + .add_string_axis("compression_type", {"SNAPPY", "ZSTD", "ZLIB", "NONE"}) .set_min_samples(4) .add_int64_axis("cardinality", {0, 1000}) .add_int64_axis("run_length", {1, 32}); // Should have the same parameters as `BM_orc_read_io_compression` for comparison. -NVBENCH_BENCH_TYPES(BM_orc_chunked_read_io_compression, NVBENCH_TYPE_AXES(compression_list)) +NVBENCH_BENCH(BM_orc_chunked_read_io_compression) .set_name("orc_chunked_read_io_compression") - .set_type_axes_names({"compression"}) + .add_string_axis("io_type", {"DEVICE_BUFFER"}) + .add_string_axis("compression_type", {"SNAPPY", "ZSTD", "ZLIB", "NONE"}) .set_min_samples(4) // The input has approximately 520MB and 127K rows. // The limits below are given in MBs. diff --git a/cpp/benchmarks/io/orc/orc_writer.cpp b/cpp/benchmarks/io/orc/orc_writer.cpp index b795f3e3164..2021ed9e48d 100644 --- a/cpp/benchmarks/io/orc/orc_writer.cpp +++ b/cpp/benchmarks/io/orc/orc_writer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -82,10 +82,7 @@ void BM_orc_write_encode(nvbench::state& state, nvbench::type_list -void BM_orc_write_io_compression( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +void BM_orc_write_io_compression(nvbench::state& state) { auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL_SIGNED), static_cast(data_type::FLOAT), @@ -97,8 +94,8 @@ void BM_orc_write_io_compression( cudf::size_type const cardinality = state.get_int64("cardinality"); cudf::size_type const run_length = state.get_int64("run_length"); - auto const compression = Compression; - auto const sink_type = IO; + auto const sink_type = retrieve_io_type_enum(state.get_string("io_type")); + auto const compression = retrieve_compression_type_enum(state.get_string("compression_type")); auto const tbl = create_random_table(cycle_dtypes(d_type, num_cols), @@ -131,10 +128,9 @@ void BM_orc_write_io_compression( state.add_buffer_size(encoded_file_size, "encoded_file_size", "encoded_file_size"); } -template -void BM_orc_write_statistics( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +void BM_orc_write_statistics(nvbench::state& state, + nvbench::type_list>) { auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL_SIGNED), static_cast(data_type::FLOAT), @@ -143,7 +139,7 @@ void BM_orc_write_statistics( static_cast(data_type::STRING), static_cast(data_type::LIST)}); - auto const compression = Compression; + auto const compression = retrieve_compression_type_enum(state.get_string("compression_type")); auto const stats_freq = Statistics; auto const tbl = create_random_table(d_type, table_size_bytes{data_size}); @@ -183,11 +179,6 @@ using d_type_list = nvbench::enum_type_list; -using io_list = nvbench::enum_type_list; - -using compression_list = - nvbench::enum_type_list; - using stats_list = nvbench::enum_type_list; @@ -199,14 +190,16 @@ NVBENCH_BENCH_TYPES(BM_orc_write_encode, NVBENCH_TYPE_AXES(d_type_list)) .add_int64_axis("cardinality", {0, 1000}) .add_int64_axis("run_length", {1, 32}); -NVBENCH_BENCH_TYPES(BM_orc_write_io_compression, NVBENCH_TYPE_AXES(io_list, compression_list)) +NVBENCH_BENCH(BM_orc_write_io_compression) .set_name("orc_write_io_compression") - .set_type_axes_names({"io", "compression"}) + .add_string_axis("io_type", {"FILEPATH", "HOST_BUFFER", "VOID"}) + .add_string_axis("compression_type", {"SNAPPY", "ZSTD", "ZLIB", "NONE"}) .set_min_samples(4) .add_int64_axis("cardinality", {0, 1000}) .add_int64_axis("run_length", {1, 32}); -NVBENCH_BENCH_TYPES(BM_orc_write_statistics, NVBENCH_TYPE_AXES(stats_list, compression_list)) +NVBENCH_BENCH_TYPES(BM_orc_write_statistics, NVBENCH_TYPE_AXES(stats_list)) .set_name("orc_write_statistics") - .set_type_axes_names({"statistics", "compression"}) + .set_type_axes_names({"statistics"}) + .add_string_axis("compression_type", {"SNAPPY", "ZSTD", "ZLIB", "NONE"}) .set_min_samples(4); diff --git a/cpp/benchmarks/io/parquet/experimental/parquet_deletion_vectors.cpp b/cpp/benchmarks/io/parquet/experimental/parquet_deletion_vectors.cpp new file mode 100644 index 00000000000..d4ef7cf8309 --- /dev/null +++ b/cpp/benchmarks/io/parquet/experimental/parquet_deletion_vectors.cpp @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2022-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include + +namespace { +/** + * @brief Serializes a roaring64 bitmap to a vector of cuda::std::byte + * + * @param deletion_vector Pointer to the roaring64 bitmap to serialize + * + * @return Host vector of bytes containing the serialized roaring64 bitmap + */ +auto serialize_roaring_bitmap(roaring64_bitmap_t const* roaring_bitmap) +{ + auto const num_bytes = roaring64_bitmap_portable_size_in_bytes(roaring_bitmap); + CUDF_EXPECTS(num_bytes > 0, "Roaring64 bitmap is empty"); + auto serialized_bitmap = thrust::host_vector(num_bytes); + std::ignore = roaring64_bitmap_portable_serialize( + roaring_bitmap, reinterpret_cast(serialized_bitmap.data())); + return serialized_bitmap; +} + +/** + * @brief Builds a host vector of expected row indices from the specified row group offsets and + * row counts + * + * @param row_group_offsets Row group offsets + * @param row_group_num_rows Number of rows in each row group + * @param num_rows Total number of table rows + * + * @return Host vector of expected row indices + */ +auto build_row_indices(cudf::host_span row_group_offsets, + cudf::host_span row_group_num_rows, + cudf::size_type num_rows) +{ + auto const num_row_groups = static_cast(row_group_num_rows.size()); + + // Row group span offsets + auto row_group_span_offsets = thrust::host_vector(num_row_groups + 1); + row_group_span_offsets[0] = 0; + thrust::inclusive_scan( + row_group_num_rows.begin(), row_group_num_rows.end(), row_group_span_offsets.begin() + 1); + + // Expected row indices data + auto expected_row_indices = thrust::host_vector(num_rows); + std::fill(expected_row_indices.begin(), expected_row_indices.end(), 1); + + // Scatter row group row offsets to expected row indices + thrust::scatter(row_group_offsets.begin(), + row_group_offsets.end(), + row_group_span_offsets.begin(), + expected_row_indices.begin()); + + // Inclusive scan to compute the rest of the expected row indices + std::for_each( + thrust::counting_iterator(0), thrust::counting_iterator(num_row_groups), [&](auto i) { + auto start_row_index = row_group_span_offsets[i]; + auto end_row_index = row_group_span_offsets[i + 1]; + thrust::inclusive_scan(expected_row_indices.begin() + start_row_index, + expected_row_indices.begin() + end_row_index, + expected_row_indices.begin() + start_row_index); + }); + + return expected_row_indices; +} + +/** + * @brief Builds a roaring64 deletion vector and a (host) row mask vector based on the specified + * probability of a row being deleted + * + * @param row_group_offsets Row group row offsets + * @param row_group_num_rows Number of rows in each row group + * @param num_rows Number of rows in the table + * @param deletion_probability The probability of a row being deleted + * + * @return Serialized roaring64 bitmap buffer + */ +auto build_deletion_vector(cudf::host_span row_group_offsets, + cudf::host_span row_group_num_rows, + cudf::size_type num_rows, + float deletion_probability) +{ + std::mt19937 engine{0xbaLL}; + std::bernoulli_distribution 
dist(deletion_probability); + + auto row_indices = build_row_indices(row_group_offsets, row_group_num_rows, num_rows); + + CUDF_EXPECTS(std::cmp_equal(row_indices.size(), num_rows), + "Row indices vector must have the same number of rows as the table"); + + auto input_row_mask = thrust::host_vector(num_rows); + std::generate(input_row_mask.begin(), input_row_mask.end(), [&]() { return dist(engine); }); + + auto deletion_vector = roaring64_bitmap_create(); + + // Context for the roaring64 bitmap for faster (bulk) add operations + auto roaring64_context = + roaring64_bulk_context_t{.high_bytes = {0, 0, 0, 0, 0, 0}, .leaf = nullptr}; + + std::for_each(thrust::counting_iterator(0), + thrust::counting_iterator(num_rows), + [&](auto row_idx) { + // Insert provided host row index if the row is deleted in the row mask + if (not input_row_mask[row_idx]) { + roaring64_bitmap_add_bulk( + deletion_vector, &roaring64_context, row_indices[row_idx]); + } + }); + + return serialize_roaring_bitmap(deletion_vector); +} + +auto setup_table_and_deletion_vector(nvbench::state& state) +{ + auto const num_columns = static_cast(state.get_int64("num_cols")); + auto const rows_per_row_group = + static_cast(state.get_int64("rows_per_row_group")); + auto const num_row_groups = static_cast(state.get_int64("num_row_groups")); + auto const deletion_probability = static_cast(state.get_float64("deletion_probability")); + auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); + auto const num_rows = rows_per_row_group * num_row_groups; + + cuio_source_sink_pair source_sink(source_type); + + // Create a table and write it to parquet sink + { + auto const d_types = std::vector{ + cudf::type_id::FLOAT64, + cudf::type_id::DURATION_MICROSECONDS, + cudf::type_id::TIMESTAMP_MILLISECONDS, + cudf::type_id::STRING, + }; + + auto const table = create_random_table(cycle_dtypes(d_types, num_columns), + row_count{num_rows}, + data_profile_builder().null_probability(0.10), + 0xbad); + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), table->view()) + .row_group_size_rows(rows_per_row_group) + .compression(cudf::io::compression_type::NONE); + cudf::io::write_parquet(write_opts); + } + + // Row offsets for each row group - arbitrary, only used to build the index column + auto row_group_offsets = thrust::host_vector(num_row_groups); + row_group_offsets[0] = static_cast(std::llround(2e9)); + std::for_each( + thrust::counting_iterator(1), + thrust::counting_iterator(num_row_groups), + [&](auto i) { row_group_offsets[i] = std::llround(row_group_offsets[i - 1] + 0.5e9); }); + + // Row group splits + auto row_group_splits = thrust::host_vector(num_row_groups - 1); + { + std::mt19937 engine{0xf00d}; + std::uniform_int_distribution dist{1, num_rows}; + std::generate(row_group_splits.begin(), row_group_splits.end(), [&]() { return dist(engine); }); + std::sort(row_group_splits.begin(), row_group_splits.end()); + } + + // Number of rows in each row group + auto row_group_num_rows = thrust::host_vector{}; + { + row_group_num_rows.reserve(num_row_groups); + auto previous_split = cudf::size_type{0}; + std::transform(row_group_splits.begin(), + row_group_splits.end(), + std::back_inserter(row_group_num_rows), + [&](auto current_split) { + auto current_split_size = current_split - previous_split; + previous_split = current_split; + return current_split_size; + }); + row_group_num_rows.push_back(num_rows - row_group_splits.back()); + } + + auto deletion_vector = + 
build_deletion_vector(row_group_offsets, row_group_num_rows, num_rows, deletion_probability); + + return std::tuple{std::move(source_sink), + std::move(row_group_offsets), + std::move(row_group_num_rows), + std::move(deletion_vector)}; +} + +} // namespace + +void BM_parquet_deletion_vectors(nvbench::state& state) +{ + auto const num_row_groups = static_cast(state.get_int64("num_row_groups")); + auto const rows_per_row_group = + static_cast(state.get_int64("rows_per_row_group")); + auto const num_rows = rows_per_row_group * num_row_groups; + + auto [source_sink, row_group_offsets, row_group_num_rows, deletion_vector] = + setup_table_and_deletion_vector(state); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); + + auto mem_stats_logger = cudf::memory_stats_logger(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + try_drop_l3_cache(); + + timer.start(); + auto const result = cudf::io::parquet::experimental::read_parquet_and_apply_deletion_vector( + read_opts, deletion_vector, row_group_offsets, row_group_num_rows); + timer.stop(); + }); + + auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(num_rows) / time, "rows_per_second"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); + state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); +} + +NVBENCH_BENCH(BM_parquet_deletion_vectors) + .set_name("parquet_deletion_vectors") + .set_min_samples(4) + .add_int64_power_of_two_axis("num_row_groups", nvbench::range(4, 14, 2)) + .add_int64_axis("rows_per_row_group", {5'000, 10'000}) + .add_string_axis("io_type", {"DEVICE_BUFFER"}) + .add_float64_axis("deletion_probability", {0.15, 0.5, 0.75}) + .add_int64_axis("num_cols", {4}); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index d1699daff04..5bcae5cab23 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -420,7 +420,7 @@ NVBENCH_BENCH_TYPES(BM_parquet_read_data, NVBENCH_TYPE_AXES(d_type_list)) NVBENCH_BENCH(BM_parquet_read_io_compression) .set_name("parquet_read_io_compression") .add_string_axis("io_type", {"FILEPATH", "HOST_BUFFER", "DEVICE_BUFFER"}) - .add_string_axis("compression_type", {"SNAPPY", "NONE"}) + .add_string_axis("compression_type", {"SNAPPY", "ZSTD", "NONE"}) .set_min_samples(4) .add_int64_axis("cardinality", {0, 1000}) .add_int64_axis("run_length", {1, 32}) @@ -488,5 +488,6 @@ NVBENCH_BENCH(BM_parquet_read_long_strings) .add_string_axis("io_type", {"DEVICE_BUFFER"}) .set_min_samples(4) .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("data_size", {512 << 20}) .add_int64_power_of_two_axis("avg_string_length", nvbench::range(4, 16, 2)); // 16, 64, ... -> 64k diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index 84e4b8b93c0..a81ae82cae4 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -82,10 +82,7 @@ void BM_parq_write_encode(nvbench::state& state, nvbench::type_list -void BM_parq_write_io_compression( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +void BM_parq_write_io_compression(nvbench::state& state) { auto const data_types = get_type_or_group({static_cast(data_type::INTEGRAL), static_cast(data_type::FLOAT), @@ -99,8 +96,8 @@ void BM_parq_write_io_compression( cudf::size_type const cardinality = state.get_int64("cardinality"); cudf::size_type const run_length = state.get_int64("run_length"); - auto const compression = Compression; - auto const sink_type = IO; + auto const sink_type = retrieve_io_type_enum(state.get_string("io_type")); + auto const compression = retrieve_compression_type_enum(state.get_string("compression_type")); auto const tbl = create_random_table(cycle_dtypes(data_types, num_cols), @@ -133,13 +130,12 @@ void BM_parq_write_io_compression( state.add_buffer_size(encoded_file_size, "encoded_file_size", "encoded_file_size"); } -template -void BM_parq_write_varying_options( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +void BM_parq_write_varying_options(nvbench::state& state, + nvbench::type_list>) { auto const enable_stats = Statistics; - auto const compression = Compression; + auto const compression = retrieve_compression_type_enum(state.get_string("compression_type")); auto const file_path = state.get_string("file_path"); auto const data_types = get_type_or_group({static_cast(data_type::INTEGRAL_SIGNED), @@ -191,11 +187,6 @@ using d_type_list = nvbench::enum_type_list; -using io_list = nvbench::enum_type_list; - -using compression_list = - nvbench::enum_type_list; - using stats_list = nvbench::enum_type_list, std::unique_ptr> generate_i static_cast(build_table_numrows / multiplicity); double const null_probability = Nullable ? 0.3 : 0; - auto const profile = - data_profile{data_profile_builder().null_probability(null_probability).cardinality(0)}; + auto const profile = data_profile{data_profile_builder() + .null_probability(null_probability) + .cardinality(unique_rows_build_table_numrows + 1)}; auto unique_rows_build_table = create_random_table(key_types, row_count{unique_rows_build_table_numrows + 1}, profile, 1); @@ -227,7 +228,7 @@ std::pair, std::unique_ptr> generate_i auto probe_cols = probe_table->release(); for (auto i = 0; i < num_payload_cols; i++) { build_cols.emplace_back(cudf::sequence(build_table_numrows, *init)); - probe_cols.emplace_back(cudf::sequence(build_table_numrows, *init)); + probe_cols.emplace_back(cudf::sequence(probe_table_numrows, *init)); } return std::pair{std::make_unique(std::move(build_cols)), diff --git a/cpp/benchmarks/join/left_join.cu b/cpp/benchmarks/join/left_join.cu index 8e5bfd1cfe0..8c715c09345 100644 --- a/cpp/benchmarks/join/left_join.cu +++ b/cpp/benchmarks/join/left_join.cu @@ -16,7 +16,8 @@ #include "join_common.hpp" -#include +#include +#include auto const num_keys = 1; @@ -26,12 +27,24 @@ void nvbench_left_anti_join(nvbench::state& state, nvbench::enum_type, nvbench::enum_type>) { + auto const num_operations = static_cast(state.get_int64("num_operations")); + auto const reuse_left_table = state.get_string("reuse_table") == "left" + ? 
cudf::set_as_build_table::LEFT + : cudf::set_as_build_table::RIGHT; + if (reuse_left_table == cudf::set_as_build_table::LEFT) { + state.skip("Not yet implemented"); + return; + } auto dtypes = cycle_dtypes(get_type_or_group(static_cast(DataType)), num_keys); - auto join = [](cudf::table_view const& left, - cudf::table_view const& right, - cudf::null_equality compare_nulls) { - return cudf::left_anti_join(left, right, compare_nulls); + auto join = [num_operations, reuse_left_table](cudf::table_view const& left, + cudf::table_view const& right, + cudf::null_equality compare_nulls) { + cudf::filtered_join obj(right, compare_nulls, reuse_left_table, cudf::get_default_stream()); + for (auto i = 0; i < num_operations - 1; i++) { + [[maybe_unused]] auto result = obj.anti_join(left); + } + return obj.anti_join(left); }; BM_join(state, dtypes, join); @@ -43,12 +56,24 @@ void nvbench_left_semi_join(nvbench::state& state, nvbench::enum_type, nvbench::enum_type>) { + auto const num_operations = static_cast(state.get_int64("num_operations")); + auto const reuse_left_table = state.get_string("reuse_table") == "left" + ? cudf::set_as_build_table::LEFT + : cudf::set_as_build_table::RIGHT; + if (reuse_left_table == cudf::set_as_build_table::LEFT) { + state.skip("Not yet implemented"); + return; + } auto dtypes = cycle_dtypes(get_type_or_group(static_cast(DataType)), num_keys); - auto join = [](cudf::table_view const& left, - cudf::table_view const& right, - cudf::null_equality compare_nulls) { - return cudf::left_semi_join(left, right, compare_nulls); + auto join = [num_operations, reuse_left_table](cudf::table_view const& left, + cudf::table_view const& right, + cudf::null_equality compare_nulls) { + cudf::filtered_join obj(right, compare_nulls, reuse_left_table, cudf::get_default_stream()); + for (auto i = 0; i < num_operations - 1; i++) { + [[maybe_unused]] auto result = obj.semi_join(left); + } + return obj.semi_join(left); }; BM_join(state, dtypes, join); } @@ -60,7 +85,9 @@ NVBENCH_BENCH_TYPES(nvbench_left_anti_join, .set_name("left_anti_join") .set_type_axes_names({"Nullable", "NullEquality", "DataType"}) .add_int64_axis("left_size", JOIN_SIZE_RANGE) - .add_int64_axis("right_size", JOIN_SIZE_RANGE); + .add_int64_axis("right_size", JOIN_SIZE_RANGE) + .add_int64_axis("num_operations", {4}) + .add_string_axis("reuse_table", {"left", "right"}); NVBENCH_BENCH_TYPES(nvbench_left_semi_join, NVBENCH_TYPE_AXES(JOIN_NULLABLE_RANGE, @@ -69,4 +96,6 @@ NVBENCH_BENCH_TYPES(nvbench_left_semi_join, .set_name("left_semi_join") .set_type_axes_names({"Nullable", "NullEquality", "DataType"}) .add_int64_axis("left_size", JOIN_SIZE_RANGE) - .add_int64_axis("right_size", JOIN_SIZE_RANGE); + .add_int64_axis("right_size", JOIN_SIZE_RANGE) + .add_int64_axis("num_operations", {4}) + .add_string_axis("reuse_table", {"left", "right"}); diff --git a/cpp/benchmarks/join/mixed_join.cu b/cpp/benchmarks/join/mixed_join.cu index c68a35155e4..6399e84fc9a 100644 --- a/cpp/benchmarks/join/mixed_join.cu +++ b/cpp/benchmarks/join/mixed_join.cu @@ -20,6 +20,30 @@ auto const num_keys = 2; +void create_complex_ast_expression(cudf::ast::tree& tree, cudf::size_type ast_levels) +{ + CUDF_EXPECTS(ast_levels > 0, "Number of AST levels must be greater than 0"); + + // For mixed joins, the conditional tables only have 1 column each (column 0) + // So we'll create multiple comparisons of the same column to stress the AST evaluation + tree.push(cudf::ast::column_reference(0)); + tree.push(cudf::ast::column_reference(0, 
cudf::ast::table_reference::RIGHT)); + + tree.push(cudf::ast::operation(cudf::ast::ast_operator::EQUAL, tree.at(0), tree.at(1))); + + if (ast_levels == 1) { return; } + + // For multiple levels, create additional comparisons of the same columns + // This will create expressions like: (col0_L == col0_R) && (col0_L == col0_R) && ... + // Total operators created: (2 * ast_levels - 1) = ast_levels EQUAL + (ast_levels-1) LOGICAL_AND + for (cudf::size_type i = 1; i < ast_levels; i++) { + tree.push(cudf::ast::operation(cudf::ast::ast_operator::EQUAL, tree.at(0), tree.at(1))); + + tree.push(cudf::ast::operation( + cudf::ast::ast_operator::LOGICAL_AND, tree.at(tree.size() - 2), tree.back())); + } +} + template void nvbench_mixed_inner_join(nvbench::state& state, nvbench::type_list, @@ -140,6 +164,36 @@ void nvbench_mixed_left_anti_join(nvbench::state& state, BM_join(state, dtypes, join); } +template +void nvbench_mixed_inner_join_complex_ast(nvbench::state& state, + nvbench::type_list, + nvbench::enum_type, + nvbench::enum_type>) +{ + auto const ast_levels = static_cast(state.get_int64("ast_levels")); + + auto join = [ast_levels](cudf::table_view const& left_equality_input, + cudf::table_view const& right_equality_input, + cudf::table_view const& left_conditional_input, + cudf::table_view const& right_conditional_input, + cudf::ast::operation binary_pred, + cudf::null_equality compare_nulls) { + // Create complex AST expression with multiple levels + cudf::ast::tree tree; + create_complex_ast_expression(tree, ast_levels); + + return cudf::mixed_inner_join(left_equality_input, + right_equality_input, + left_conditional_input, + right_conditional_input, + tree.back(), + compare_nulls); + }; + + auto dtypes = cycle_dtypes(get_type_or_group(static_cast(DataType)), num_keys); + BM_join(state, dtypes, join); +} + NVBENCH_BENCH_TYPES(nvbench_mixed_inner_join, NVBENCH_TYPE_AXES(JOIN_NULLABLE_RANGE, DEFAULT_JOIN_NULL_EQUALITY, @@ -149,6 +203,16 @@ NVBENCH_BENCH_TYPES(nvbench_mixed_inner_join, .add_int64_axis("left_size", JOIN_SIZE_RANGE) .add_int64_axis("right_size", JOIN_SIZE_RANGE); +NVBENCH_BENCH_TYPES(nvbench_mixed_inner_join_complex_ast, + NVBENCH_TYPE_AXES(JOIN_NULLABLE_RANGE, + DEFAULT_JOIN_NULL_EQUALITY, + DEFAULT_JOIN_DATATYPES)) + .set_name("mixed_inner_join_complex_ast") + .set_type_axes_names({"Nullable", "NullEquality", "DataType"}) + .add_int64_axis("left_size", JOIN_SIZE_RANGE) + .add_int64_axis("right_size", JOIN_SIZE_RANGE) + .add_int64_axis("ast_levels", {1, 5, 10}); + NVBENCH_BENCH_TYPES(nvbench_mixed_left_join, NVBENCH_TYPE_AXES(JOIN_NULLABLE_RANGE, DEFAULT_JOIN_NULL_EQUALITY, diff --git a/cpp/benchmarks/ndsh/q01.cpp b/cpp/benchmarks/ndsh/q01.cpp index 485e8e5497c..a77ab5dde12 100644 --- a/cpp/benchmarks/ndsh/q01.cpp +++ b/cpp/benchmarks/ndsh/q01.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -67,17 +68,17 @@ rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { - auto const one = cudf::numeric_scalar(1); + auto const one = discount.type().id() == cudf::type_id::DECIMAL64 + ? 
cudf::make_fixed_point_scalar(1L, numeric::scale_type{0}) + : cudf::make_fixed_width_scalar(1); auto const one_minus_discount = - cudf::binary_operation(one, discount, cudf::binary_operator::SUB, discount.type(), stream, mr); - auto const disc_price_type = cudf::data_type{cudf::type_id::FLOAT64}; - auto disc_price = cudf::binary_operation(extendedprice, - one_minus_discount->view(), - cudf::binary_operator::MUL, - disc_price_type, - stream, - mr); - return disc_price; + cudf::binary_operation(*one, discount, cudf::binary_operator::SUB, discount.type(), stream, mr); + return cudf::binary_operation(extendedprice, + one_minus_discount->view(), + cudf::binary_operator::MUL, + discount.type(), + stream, + mr); } /** @@ -94,17 +95,16 @@ rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { - auto const one = cudf::numeric_scalar(1); + auto const one = tax.type().id() == cudf::type_id::DECIMAL64 + ? cudf::make_fixed_point_scalar(1L, numeric::scale_type{0}) + : cudf::make_fixed_width_scalar(1); auto const one_plus_tax = - cudf::binary_operation(one, tax, cudf::binary_operator::ADD, tax.type(), stream, mr); - auto const charge_type = cudf::data_type{cudf::type_id::FLOAT64}; - auto charge = cudf::binary_operation( - disc_price, one_plus_tax->view(), cudf::binary_operator::MUL, charge_type, stream, mr); - return charge; + cudf::binary_operation(*one, tax, cudf::binary_operator::ADD, tax.type(), stream, mr); + return cudf::binary_operation( + disc_price, one_plus_tax->view(), cudf::binary_operator::MUL, tax.type(), stream, mr); } -void run_ndsh_q1(nvbench::state& state, - std::unordered_map& sources) +void run_ndsh_q1(nvbench::state& state, cudf::io::source_info const& source) { // Define the column projections and filter predicate for `lineitem` table std::vector const lineitem_cols = {"l_returnflag", @@ -124,8 +124,7 @@ void run_ndsh_q1(nvbench::state& state, cudf::ast::ast_operator::LESS_EQUAL, shipdate_ref, shipdate_upper_literal); // Read out the `lineitem` table from parquet file - auto lineitem = read_parquet( - sources.at("lineitem").make_source_info(), lineitem_cols, std::move(lineitem_pred)); + auto lineitem = read_parquet(source, lineitem_cols, std::move(lineitem_pred)); // Calculate the discount price and charge columns and append to lineitem table auto disc_price = @@ -169,14 +168,27 @@ void run_ndsh_q1(nvbench::state& state, void ndsh_q1(nvbench::state& state) { // Generate the required parquet files in device buffers - double const scale_factor = state.get_float64("scale_factor"); + auto const scale_factor = state.get_float64("scale_factor"); + auto const filename = state.get_string("filename"); + if (!filename.empty() && scale_factor != 1.0) { + state.skip("Only scale_factor=1 supported with filename input"); + return; + } std::unordered_map sources; - generate_parquet_data_sources(scale_factor, {"lineitem"}, sources); + auto source = [&] { + if (filename.empty()) { + generate_parquet_data_sources(scale_factor, {"lineitem"}, sources); + return sources.at("lineitem").make_source_info(); + } + return cudf::io::source_info(filename); + }(); auto stream = cudf::get_default_stream(); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { run_ndsh_q1(state, sources); }); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { run_ndsh_q1(state, source); }); } 
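The reworked helpers above keep Q1's price arithmetic in the input's own decimal type instead of forcing a FLOAT64 intermediate. A self-contained sketch of the same pattern, assuming DECIMAL64 inputs with matching scales (the free-function framing and the name decimal_disc_price are illustrative, not the benchmark's exact code):

#include <cudf/binaryop.hpp>
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/fixed_point/fixed_point.hpp>
#include <cudf/scalar/scalar_factories.hpp>

#include <memory>

// Sketch: disc_price = extendedprice * (1 - discount), computed entirely in DECIMAL64.
std::unique_ptr<cudf::column> decimal_disc_price(cudf::column_view const& extendedprice,
                                                 cudf::column_view const& discount)
{
  // Literal 1 expressed as a decimal64 scalar with scale 0
  auto const one = cudf::make_fixed_point_scalar<numeric::decimal64>(1L, numeric::scale_type{0});
  auto const one_minus_discount =
    cudf::binary_operation(*one, discount, cudf::binary_operator::SUB, discount.type());
  // Multiply in the discount column's own type; no FLOAT64 conversion
  return cudf::binary_operation(
    extendedprice, one_minus_discount->view(), cudf::binary_operator::MUL, discount.type());
}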
-NVBENCH_BENCH(ndsh_q1).set_name("ndsh_q1").add_float64_axis("scale_factor", {0.01, 0.1, 1}); +NVBENCH_BENCH(ndsh_q1) + .set_name("ndsh_q1") + .add_string_axis("filename", {""}) + .add_float64_axis("scale_factor", {0.01, 0.1, 1}); diff --git a/cpp/benchmarks/ndsh/q09.cpp b/cpp/benchmarks/ndsh/q09.cpp index 15dbcbd485e..7bb333a4956 100644 --- a/cpp/benchmarks/ndsh/q09.cpp +++ b/cpp/benchmarks/ndsh/q09.cpp @@ -16,6 +16,8 @@ #include "utilities.hpp" +#include + #include #include #include @@ -115,7 +117,7 @@ struct q9_data { rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const one = cudf::numeric_scalar(1); auto const one_minus_discount = @@ -147,7 +149,7 @@ struct q9_data { rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); std::string udf = R"***( @@ -161,6 +163,7 @@ struct q9_data { cudf::data_type{cudf::type_id::FLOAT64}, false, std::nullopt, + cudf::null_aware::NO, stream, mr); } @@ -173,7 +176,7 @@ struct q9_data { rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); cudf::ast::tree tree; cudf::table_view table{std::vector{discount, extendedprice, supplycost, quantity}}; @@ -244,7 +247,7 @@ q9_data load_data(std::unordered_map& source std::unique_ptr join_data(q9_data const& data) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Generating the `profit` table // Filter the part table using `p_name like '%green%'` diff --git a/cpp/benchmarks/ndsh/utilities.cpp b/cpp/benchmarks/ndsh/utilities.cpp index 766d30bdfeb..074bc2d75f8 100644 --- a/cpp/benchmarks/ndsh/utilities.cpp +++ b/cpp/benchmarks/ndsh/utilities.cpp @@ -16,13 +16,12 @@ #include "utilities.hpp" -#include "common/ndsh_data_generator/ndsh_data_generator.hpp" -#include "common/table_utilities.hpp" -#include "cudf/detail/utilities/integer_utils.hpp" +#include +#include +#include #include #include -#include #include #include #include @@ -137,7 +136,7 @@ table_with_names& table_with_names::append(std::unique_ptr& col, cudf::table_view table_with_names::select(std::vector const& col_names) const { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); std::vector col_indices; for (auto const& col_name : col_names) { col_indices.push_back(column_id(col_name)); @@ -147,7 +146,7 @@ cudf::table_view table_with_names::select(std::vector const& col_na void table_with_names::to_parquet(std::string const& filepath) const { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const sink_info = cudf::io::sink_info(filepath); cudf::io::table_metadata metadata; metadata.schema_info = @@ -165,7 +164,7 @@ std::unique_ptr join_and_gather(cudf::table_view const& left_input, std::vector const& right_on, cudf::null_equality compare_nulls) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); constexpr auto oob_policy = cudf::out_of_bounds_policy::DONT_CHECK; auto const left_selected = left_input.select(left_on); auto const right_selected = right_input.select(right_on); @@ -200,7 +199,7 @@ std::unique_ptr apply_inner_join( std::vector const& right_on, cudf::null_equality compare_nulls) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); std::vector left_on_indices; std::vector right_on_indices; std::transform( @@ -230,7 +229,7 @@ std::unique_ptr apply_inner_join( 
std::unique_ptr apply_filter(std::unique_ptr const& table, cudf::ast::operation const& predicate) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const boolean_mask = cudf::compute_column(table->table(), predicate); auto result_table = cudf::apply_boolean_mask(table->table(), boolean_mask->view()); return std::make_unique(std::move(result_table), table->column_names()); @@ -239,7 +238,7 @@ std::unique_ptr apply_filter(std::unique_ptr std::unique_ptr apply_mask(std::unique_ptr const& table, std::unique_ptr const& mask) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto result_table = cudf::apply_boolean_mask(table->table(), mask->view()); return std::make_unique(std::move(result_table), table->column_names()); } @@ -247,7 +246,7 @@ std::unique_ptr apply_mask(std::unique_ptr c std::unique_ptr apply_groupby(std::unique_ptr const& table, groupby_context_t const& ctx) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const keys = table->select(ctx.keys); cudf::groupby::groupby groupby_obj(keys); std::vector result_column_names; @@ -291,7 +290,7 @@ std::unique_ptr apply_orderby(std::unique_ptr const& sort_keys, std::vector const& sort_key_orders) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); std::vector column_views; for (auto& key : sort_keys) { column_views.push_back(table->column(key)); @@ -305,7 +304,7 @@ std::unique_ptr apply_reduction(cudf::column_view const& colum cudf::aggregation::Kind const& agg_kind, std::string const& col_name) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const agg = cudf::make_sum_aggregation(); auto const result = cudf::reduce(column, *agg, column.type()); cudf::size_type const len = 1; @@ -322,7 +321,7 @@ std::unique_ptr read_parquet( std::vector const& columns, std::unique_ptr const& predicate) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto builder = cudf::io::parquet_reader_options_builder(source_info); if (!columns.empty()) { builder.columns(columns); } if (predicate) { builder.filter(*predicate); } @@ -358,7 +357,7 @@ void write_to_parquet_device_buffer(std::unique_ptr const& table, std::vector const& col_names, cuio_source_sink_pair& source) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const stream = cudf::get_default_stream(); // Prepare the table metadata @@ -410,7 +409,7 @@ void generate_parquet_data_sources(double scale_factor, std::vector const& table_names, std::unordered_map& sources) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Set the memory resource to the managed pool auto old_mr = cudf::get_current_device_resource(); diff --git a/cpp/benchmarks/ndsh/utilities.hpp b/cpp/benchmarks/ndsh/utilities.hpp index 82c43d8a5ee..70288bb1d2f 100644 --- a/cpp/benchmarks/ndsh/utilities.hpp +++ b/cpp/benchmarks/ndsh/utilities.hpp @@ -18,7 +18,6 @@ #include "io/cuio_common.hpp" -#include #include #include diff --git a/cpp/benchmarks/quantiles/quantiles.cpp b/cpp/benchmarks/quantiles/quantiles.cpp index 24f9cc9c68e..25eae2d68e4 100644 --- a/cpp/benchmarks/quantiles/quantiles.cpp +++ b/cpp/benchmarks/quantiles/quantiles.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,55 +15,51 @@ */ #include -#include -#include #include +#include #include #include #include -class Quantiles : public cudf::benchmark {}; +#include -static void BM_quantiles(benchmark::State& state, bool nulls) +static void bench_quantiles(nvbench::state& state) { - using Type = int; + cudf::size_type const num_rows{static_cast(state.get_int64("num_rows"))}; + cudf::size_type const num_cols{static_cast(state.get_int64("num_cols"))}; + cudf::size_type const num_quantiles{ + static_cast(state.get_int64("num_quantiles"))}; + bool const nulls{static_cast(state.get_int64("nulls"))}; - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - cudf::size_type const n_cols{(cudf::size_type)state.range(1)}; - cudf::size_type const n_quantiles{(cudf::size_type)state.range(2)}; + auto const data_type = cudf::type_to_id(); // Create columns with values in the range [0,100) - data_profile profile = data_profile_builder().cardinality(0).distribution( - cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + data_profile profile = + data_profile_builder().cardinality(0).distribution(data_type, distribution_id::UNIFORM, 0, 100); profile.set_null_probability(nulls ? std::optional{0.01} : std::nullopt); // 1% nulls or no null mask (<0) - auto input_table = create_random_table( - cycle_dtypes({cudf::type_to_id()}, n_cols), row_count{n_rows}, profile); + auto input_table = + create_random_table(cycle_dtypes({data_type}, num_cols), row_count{num_rows}, profile); auto input = cudf::table_view(*input_table); - std::vector q(n_quantiles); - thrust::tabulate( - thrust::seq, q.begin(), q.end(), [n_quantiles](auto i) { return i * (1.0f / n_quantiles); }); + std::vector q(num_quantiles); + thrust::tabulate(thrust::seq, q.begin(), q.end(), [num_quantiles](auto i) { + return i * (1.0f / num_quantiles); + }); - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); - auto result = cudf::quantiles(input, q); - // auto result = (stable) ? cudf::stable_sorted_order(input) : cudf::sorted_order(input); - } + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { auto result = cudf::quantiles(input, q); }); } -#define QUANTILES_BENCHMARK_DEFINE(name, nulls) \ - BENCHMARK_DEFINE_F(Quantiles, name) \ - (::benchmark::State & st) { BM_quantiles(st, nulls); } \ - BENCHMARK_REGISTER_F(Quantiles, name) \ - ->RangeMultiplier(4) \ - ->Ranges({{1 << 16, 1 << 26}, {1, 8}, {1, 12}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -QUANTILES_BENCHMARK_DEFINE(no_nulls, false) -QUANTILES_BENCHMARK_DEFINE(nulls, true) +NVBENCH_BENCH(bench_quantiles) + .set_name("quantiles") + .add_int64_power_of_two_axis("num_rows", {16, 18, 20, 22, 24, 26}) + .add_int64_axis("num_cols", {1, 2, 4, 8}) + .add_int64_axis("num_quantiles", {1, 4, 8, 12}) + .add_int64_axis("nulls", {0, 1}); diff --git a/cpp/benchmarks/sort/segmented_top_k.cpp b/cpp/benchmarks/sort/segmented_top_k.cpp new file mode 100644 index 00000000000..6e6507d2f51 --- /dev/null +++ b/cpp/benchmarks/sort/segmented_top_k.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include + +template +void bench_segmented_top_k(nvbench::state& state, nvbench::type_list) +{ + auto const ordered = static_cast(state.get_int64("ordered")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const segment = static_cast(state.get_int64("segment")); + auto const k = static_cast(state.get_int64("k")); + auto const data_type = cudf::type_to_id(); + + data_profile const profile = data_profile_builder().no_validity().distribution( + data_type, distribution_id::UNIFORM, 0, segment); + auto const input = create_random_column(data_type, row_count{num_rows}, profile); + + auto const segments = cudf::sequence((num_rows / segment) + 1, + cudf::numeric_scalar(0), + cudf::numeric_scalar(segment)); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.add_global_memory_reads(num_rows); + state.add_global_memory_writes(segments->size() * k); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + if (ordered) { + cudf::segmented_top_k_order(input->view(), segments->view(), k); + } else { + cudf::segmented_top_k(input->view(), segments->view(), k); + } + }); +} + +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_s, "time_s", "time_s"); + +using Types = nvbench::type_list; + +NVBENCH_BENCH_TYPES(bench_segmented_top_k, NVBENCH_TYPE_AXES(Types)) + .set_name("segmented_top_k") + .add_int64_axis("num_rows", {262144, 2097152, 16777216, 67108864}) + .add_int64_axis("segment", {1024, 2048}) + .add_int64_axis("k", {100, 1000}) + .add_int64_axis("ordered", {0, 1}); diff --git a/cpp/benchmarks/string/experimental/stringview_compare.cu b/cpp/benchmarks/string/experimental/stringview_compare.cu new file mode 100644 index 00000000000..f012e647dac --- /dev/null +++ b/cpp/benchmarks/string/experimental/stringview_compare.cu @@ -0,0 +1,500 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace { + +// Runtime switch to use ArrowStringView instead of cudf's Arrow string format. +// +// Set to anything to use ArrowStringView, and unset to use cudf. 
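+//
+// Background on the compared layout (summary; see the Arrow format spec for the full
+// definition): ArrowBinaryView is Arrow's 16-byte string-view representation. Strings of
+// at most NANOARROW_BINARY_VIEW_INLINE_SIZE (12) bytes are stored inline in the view;
+// longer strings keep a 4-byte prefix in the view plus a buffer index and byte offset
+// into a separate character buffer. Roughly:
+//
+//   union view {
+//     struct { int32_t size; uint8_t data[12]; } inlined;                              // size <= 12
+//     struct { int32_t size; uint8_t prefix[4]; int32_t buffer_index; int32_t offset; } ref;  // size > 12
+//   };
+//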
+// Example command line to generate ArrowStringView numbers for sv_hash benchmark: +// CUDF_BM_ARROWSTRINGVIEW=1 benchmarks/STRINGS_EXPERIMENTAL_NVBENCH -d 0 -b sv_hash +// +// This will generate nvbench benchmark outputs that can be compared directly +// using the `nvbench compare.py` script. +auto const BM_ARROWSTRINGVIEW = "CUDF_BM_ARROWSTRINGVIEW"; + +/** + * Creates ArrowBinaryView objects from a strings column. + */ +struct strings_to_binary_view { + cudf::column_device_view d_strings; + cudf::detail::input_offsetalator d_offsets; + ArrowBinaryView* d_items; // output + + __device__ void operator()(cudf::size_type idx) const + { + auto& item = d_items[idx]; + if (d_strings.is_null(idx)) { + item.inlined.size = 0; // not used in this benchmark + return; + } + + auto const d_str = d_strings.element(idx); + item.inlined.size = d_str.size_bytes(); + // copy the string data to the inlined buffer if it fits + if (d_str.size_bytes() <= NANOARROW_BINARY_VIEW_INLINE_SIZE) { + thrust::copy(thrust::seq, d_str.data(), d_str.data() + d_str.size_bytes(), item.inlined.data); + thrust::uninitialized_fill(thrust::seq, + item.inlined.data + item.inlined.size, + item.inlined.data + NANOARROW_BINARY_VIEW_INLINE_SIZE, + 0); + } else { + // otherwise, copy the prefix and set the offset to the data buffer + thrust::copy(thrust::seq, + d_str.data(), + d_str.data() + NANOARROW_BINARY_VIEW_PREFIX_SIZE, + item.ref.prefix); + auto const offset = d_offsets[idx]; + item.ref.buffer_index = 0; // only one buffer in this benchmark + item.ref.offset = static_cast(offset); + } + } +}; + +/** + * Returns a string_view from an ArrowBinaryView. + * This helps in the comparison by both implementations using `cudf::string_view` + * as the base type so the actual operations are the same and only the + * format (how the data is organized) is different. + */ +__device__ cudf::string_view get_string_view(ArrowBinaryView const& item, char const* d_chars) +{ + auto const data = item.inlined.size <= NANOARROW_BINARY_VIEW_INLINE_SIZE + ? reinterpret_cast(item.inlined.data) + : d_chars + item.ref.offset; + return cudf::string_view(data, item.inlined.size); +} + +/** + * Hashes a string from an ArrowBinaryView. + */ +struct hash_arrow_sv { + ArrowBinaryView* d_items; + char const* d_chars; + __device__ cudf::hash_value_type operator()(cudf::size_type idx) const + { + auto& item = d_items[idx]; + auto const d_str = get_string_view(item, d_chars); + auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{0}; + return hasher(d_str); + } +}; + +/** + * Checks if a string from an ArrowBinaryView starts with a target string. + */ +struct starts_arrow_sv { + ArrowBinaryView* d_items; + char const* d_chars; + cudf::size_type tgt_size; + __device__ bool operator()(cudf::size_type idx) const + { + // note that this requires tgt_size <= 26 + auto const d_tgt = cudf::string_view("abcdefghijklmnopqrstuvwxyz", tgt_size); + auto& item = d_items[idx]; + auto const size = item.inlined.size; + auto const data = (size <= NANOARROW_BINARY_VIEW_INLINE_SIZE) || (tgt_size <= 4) + ? reinterpret_cast(item.inlined.data) + : d_chars + item.ref.offset; + auto const d_str = cudf::string_view(data, size); + return d_str.size_bytes() >= d_tgt.size_bytes() && + d_tgt.compare(d_str.data(), d_tgt.size_bytes()) == 0; + } +}; + +/** + * Compares two strings from ArrowBinaryView objects. 
+ */ +struct compare_arrow_sv { + ArrowBinaryView* d_items; + char const* d_chars; + __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) + { + auto& item_lhs = d_items[lhs]; + auto& item_rhs = d_items[rhs]; + + // shortcut to check preview bytes + auto pv_lhs = reinterpret_cast(item_lhs.inlined.data)[0]; + auto pv_rhs = reinterpret_cast(item_rhs.inlined.data)[0]; + if (pv_lhs != pv_rhs) { + return cudf::hashing::detail::swap_endian(pv_lhs) < + cudf::hashing::detail::swap_endian(pv_rhs); + } + + // prefix matches so check how many bytes are left to compare + constexpr auto prefix_size = static_cast(sizeof(uint32_t)); + auto const size_lhs = item_lhs.inlined.size; + auto const size_rhs = item_rhs.inlined.size; + // if no bytes left to compare, we are done (strings are equal) + if (size_lhs <= prefix_size && size_rhs <= prefix_size) { return false; } + + // compare the remaining bytes + auto const d_str_lhs = cudf::string_view( + get_string_view(item_lhs, d_chars).data() + prefix_size, size_lhs - prefix_size); + auto const d_str_rhs = cudf::string_view( + get_string_view(item_rhs, d_chars).data() + prefix_size, size_rhs - prefix_size); + + return d_str_lhs < d_str_rhs; + } +}; + +/** + * Hashes a string from a cudf column + */ +struct hash_sv { + cudf::column_device_view d_strings; + __device__ cudf::hash_value_type operator()(cudf::size_type idx) const + { + auto const d_str = d_strings.element(idx); + auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{0}; + return hasher(d_str); + } +}; + +/** + * Checks if a string from a cudf column starts with a target string + */ +struct starts_sv { + cudf::column_device_view d_strings; + cudf::size_type tgt_size; + __device__ bool operator()(cudf::size_type idx) const + { + auto const d_str = d_strings.element(idx); + auto const d_tgt = cudf::string_view("abcdefghijklmnopqrstuvwxyz", tgt_size); + return d_str.size_bytes() >= d_tgt.size_bytes() && + d_tgt.compare(d_str.data(), d_tgt.size_bytes()) == 0; + } +}; + +/** + * Compares two strings from a cudf column + */ +struct compare_sv { + cudf::column_device_view d_strings; + __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) + { + auto const d_str_lhs = d_strings.element(lhs); + auto const d_str_rhs = d_strings.element(rhs); + return d_str_lhs < d_str_rhs; + } +}; + +/** + * Creates an ArrowBinaryView vector and data buffer from a strings column. 
+ */ +std::pair, rmm::device_buffer> create_sv_array( + cudf::strings_column_view const& input, rmm::cuda_stream_view stream) +{ + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + auto d_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + + // count the (longer) strings that will need to be stored in the data buffer + auto const num_longer_strings = thrust::count_if( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + [d_offsets] __device__(auto idx) { + return d_offsets[idx + 1] - d_offsets[idx] > NANOARROW_BINARY_VIEW_INLINE_SIZE; + }); + + // gather all the long-ish strings into a single strings column + auto [unused_col, longer_strings] = [&] { + if (num_longer_strings == input.size()) { + // we can use the input column as is for the remainder of this function + return std::pair{cudf::make_empty_column(cudf::type_id::STRING), input}; + } + auto indices = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [d_strings = *d_strings] __device__(auto idx) { + if (d_strings.is_null(idx)) { + return cudf::strings::detail::string_index_pair{nullptr, 0}; + } + auto const d_str = d_strings.element(idx); + return (d_str.size_bytes() > NANOARROW_BINARY_VIEW_INLINE_SIZE) + ? cudf::strings::detail::string_index_pair{d_str.data(), d_str.size_bytes()} + : cudf::strings::detail::string_index_pair{"", 0}; + })); + auto longer_strings = cudf::strings::detail::make_strings_column( + indices, indices + input.size(), stream, cudf::get_current_device_resource_ref()); + stream.synchronize(); + auto const sv = cudf::strings_column_view(longer_strings->view()); + return std::pair{std::move(longer_strings), sv}; + }(); + auto [first, last] = cudf::strings::detail::get_first_and_last_offset(longer_strings, stream); + auto const longer_chars_size = last - first; + + // Make sure only one buffer is needed. + // Using a single data buffer makes the two formats more similar focusing on the layout. + constexpr int64_t max_size = std::numeric_limits::max() / 2; + auto const num_buffers = cudf::util::div_rounding_up_safe(longer_chars_size, max_size); + CUDF_EXPECTS(num_buffers <= 1, "num_buffers must be <= 1"); + + // now build BinaryView objects from the strings in device memory + // (for-each works better than transform due to the prefix/data of the ArrowBinaryView) + auto d_items = rmm::device_uvector(input.size(), stream); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + input.size(), + strings_to_binary_view{*d_strings, d_offsets, d_items.data()}); + + rmm::device_buffer data_buffer(longer_chars_size, stream); + auto const chars_data = longer_strings.chars_begin(stream); + CUDF_CUDA_TRY(cudaMemcpyAsync( + data_buffer.data(), chars_data, longer_chars_size, cudaMemcpyDefault, stream.value())); + + return std::pair{std::move(d_items), std::move(data_buffer)}; +} +} // namespace + +static void BM_sv_hash(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const max_width = static_cast(state.get_int64("max_width")); + auto const min_width = state.get_int64("fw") ? 
max_width : 1; // fw = fixed width + + data_profile const profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) + .no_validity(); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + auto col_view = column->view(); + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.add_global_memory_writes(num_rows * sizeof(cudf::hash_value_type)); + auto output = rmm::device_uvector(num_rows, stream); + auto begin = thrust::make_counting_iterator(0); + auto end = thrust::make_counting_iterator(num_rows); + + if (std::getenv(BM_ARROWSTRINGVIEW)) { + auto [d_items, data_buffer] = create_sv_array(col_view, stream); + auto const d_chars = reinterpret_cast(data_buffer.data()); + state.add_global_memory_reads(num_rows * sizeof(ArrowBinaryView) + data_buffer.size()); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + thrust::transform(rmm::exec_policy(stream), + begin, + end, + output.begin(), + hash_arrow_sv{d_items.data(), d_chars}); + }); + } else { + auto d_strings = cudf::column_device_view::create(col_view, stream); + auto col_size = column->alloc_size(); + state.add_global_memory_reads(col_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + thrust::transform(rmm::exec_policy(stream), begin, end, output.begin(), hash_sv{*d_strings}); + }); + } +} + +static void BM_sv_starts(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const max_width = static_cast(state.get_int64("max_width")); + auto const min_width = state.get_int64("fw") ? max_width : 1; + auto const tgt_size = static_cast(state.get_int64("tgt_size")); + + data_profile const profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) + .no_validity(); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + auto col_view = column->view(); + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.add_global_memory_writes(num_rows * sizeof(bool)); + auto output = rmm::device_uvector(num_rows, stream); + auto begin = thrust::make_counting_iterator(0); + auto end = thrust::make_counting_iterator(num_rows); + + if (std::getenv(BM_ARROWSTRINGVIEW)) { + auto [d_items, data_buffer] = create_sv_array(col_view, stream); + auto const d_chars = reinterpret_cast(data_buffer.data()); + state.add_global_memory_reads(num_rows * sizeof(ArrowBinaryView) + data_buffer.size()); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + thrust::transform(rmm::exec_policy(stream), + begin, + end, + output.begin(), + starts_arrow_sv{d_items.data(), d_chars, tgt_size}); + }); + } else { + auto d_strings = cudf::column_device_view::create(col_view, stream); + auto col_size = column->alloc_size(); + state.add_global_memory_reads(col_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + thrust::transform( + rmm::exec_policy(stream), begin, end, output.begin(), starts_sv{*d_strings, tgt_size}); + }); + } +} + +static void BM_sv_sort(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const max_width = static_cast(state.get_int64("max_width")); + auto const card = static_cast(state.get_int64("card")); + + auto h_data = std::vector(card); + 
std::transform(thrust::counting_iterator(0), + thrust::counting_iterator(card), + h_data.begin(), + [max_width](auto idx) { + auto const fmt = std::format("{{:0{}d}}", max_width); + return std::vformat(fmt, std::make_format_args(idx)); + }); + auto d_data = cudf::test::strings_column_wrapper(h_data.begin(), h_data.end()).release(); + + data_profile gather_profile = data_profile_builder().cardinality(0).no_validity().distribution( + cudf::type_id::INT32, distribution_id::UNIFORM, 0, d_data->size() - 1); + auto gather_map = create_random_column(cudf::type_id::INT32, row_count{num_rows}, gather_profile); + + auto table = cudf::gather(cudf::table_view({d_data->view()}), gather_map->view()); + auto column = std::move(table->release().front()); + + auto col_view = column->view(); + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.add_global_memory_writes(num_rows * sizeof(cudf::size_type)); + + // indices are the keys that are sorted (not inplace) + auto keys = rmm::device_uvector(num_rows, stream); + auto in_keys = thrust::make_counting_iterator(0); + auto out_keys = keys.begin(); + auto tmp_bytes = std::size_t{0}; + + if (std::getenv(BM_ARROWSTRINGVIEW)) { + auto [d_items, data_buffer] = create_sv_array(col_view, stream); + auto const d_chars = reinterpret_cast(data_buffer.data()); + auto comparator = compare_arrow_sv{d_items.data(), d_chars}; + cub::DeviceMergeSort::SortKeysCopy( + nullptr, tmp_bytes, in_keys, out_keys, num_rows, comparator, stream.value()); + auto tmp_stg = rmm::device_buffer(tmp_bytes, stream); + state.add_global_memory_reads(num_rows * sizeof(ArrowBinaryView) + data_buffer.size()); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cub::DeviceMergeSort::SortKeysCopy( + tmp_stg.data(), tmp_bytes, in_keys, out_keys, num_rows, comparator, stream.value()); + }); + } else { + auto d_strings = cudf::column_device_view::create(col_view, stream); + auto col_size = column->alloc_size(); + state.add_global_memory_reads(col_size); + auto comparator = compare_sv{*d_strings}; + cub::DeviceMergeSort::SortKeysCopy( + nullptr, tmp_bytes, in_keys, out_keys, num_rows, comparator, stream.value()); + auto tmp_stg = rmm::device_buffer(tmp_bytes, stream); + state.add_global_memory_reads(col_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cub::DeviceMergeSort::SortKeysCopy( + tmp_stg.data(), tmp_bytes, in_keys, out_keys, num_rows, comparator, stream.value()); + }); + } +} + +static void BM_sv_gather(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const width = static_cast(state.get_int64("width")); + auto const map_rows = static_cast(state.get_int64("map_rows")); + + data_profile profile = data_profile_builder().no_validity().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, width, width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + auto col_view = column->view(); + + data_profile map_profile = data_profile_builder().cardinality(0).no_validity().distribution( + cudf::type_id::INT32, distribution_id::UNIFORM, 0, num_rows - 1); + auto map = create_random_column(cudf::type_id::INT32, row_count{map_rows}, map_profile); + auto map_view = map->view(); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + if (std::getenv(BM_ARROWSTRINGVIEW)) { + auto [d_items, data_buffer] = 
create_sv_array(col_view, stream); + + auto begin = map_view.begin(); + auto end = map_view.end(); + auto input = d_items.data(); + auto output = rmm::device_uvector(map_view.size(), stream); + + state.add_global_memory_writes(map_rows * sizeof(ArrowBinaryView)); + state.add_global_memory_reads(map_rows * sizeof(ArrowBinaryView)); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + thrust::gather(rmm::exec_policy(stream), begin, end, input, output.begin()); + }); + } else { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::gather( + cudf::table_view({col_view}), map_view, cudf::out_of_bounds_policy::DONT_CHECK, stream); + }); + } +} + +NVBENCH_BENCH(BM_sv_hash) + .set_name("sv_hash") + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000}) + .add_int64_axis("max_width", {5, 10, 15, 20, 30, 60}) + .add_int64_axis("fw", {1, 0}); + +NVBENCH_BENCH(BM_sv_starts) + .set_name("sv_starts") + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000}) + .add_int64_axis("max_width", {10, 20, 30, 60}) + .add_int64_axis("tgt_size", {4, 8, 16}) + .add_int64_axis("fw", {1, 0}); + +NVBENCH_BENCH(BM_sv_sort) + .set_name("sv_sort") + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000}) + .add_int64_axis("max_width", {10, 20, 30, 60}) + .add_int64_axis("card", {100, 1000}); + +NVBENCH_BENCH(BM_sv_gather) + .set_name("sv_gather") + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000}) + .add_int64_axis("width", {6, 12, 24, 48}) + .add_int64_axis("map_rows", {10'000, 100'000}); diff --git a/cpp/benchmarks/transform/encode.cpp b/cpp/benchmarks/transform/encode.cpp new file mode 100644 index 00000000000..dea40a0865d --- /dev/null +++ b/cpp/benchmarks/transform/encode.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +template +static void bench_encode(nvbench::state& state, nvbench::type_list) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const width = static_cast(state.get_int64("width")); + auto const nulls = state.get_float64("nulls"); + auto const data_type = cudf::type_to_id(); + + auto range = data_type == cudf::type_id::STRING ? 
(width / 10) : width; + data_profile const profile = + data_profile_builder().cardinality(0).null_probability(nulls).distribution( + data_type, distribution_id::UNIFORM, 0, range); + auto input = create_random_column(data_type, row_count{num_rows}, profile); + auto tv = cudf::table_view({input->view()}); + + auto alloc_size = input->alloc_size(); + state.add_global_memory_reads(alloc_size); + state.add_global_memory_writes(num_rows); + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { auto result = cudf::encode(tv, stream); }); +} + +NVBENCH_DECLARE_TYPE_STRINGS(cudf::string_view, "string_view", "string_view"); + +using Types = nvbench::type_list; + +NVBENCH_BENCH_TYPES(bench_encode, NVBENCH_TYPE_AXES(Types)) + .set_name("encode") + .add_int64_axis("width", {10, 100}) + .add_int64_axis("num_rows", {262144, 2097152, 16777216, 67108864}) + .add_float64_axis("nulls", {0, 0.1}); diff --git a/cpp/benchmarks/transform/polynomials.cpp b/cpp/benchmarks/transform/polynomials.cpp index a3e09535fa3..5b03ae1dfbe 100644 --- a/cpp/benchmarks/transform/polynomials.cpp +++ b/cpp/benchmarks/transform/polynomials.cpp @@ -95,6 +95,7 @@ static void BM_transform_polynomials(nvbench::state& state) cudf::data_type{cudf::type_to_id()}, false, std::nullopt, + cudf::null_aware::NO, launch.get_stream().get_stream()); }); } diff --git a/cpp/benchmarks/transform/transform.cpp b/cpp/benchmarks/transform/transform.cpp index f487f191d94..a404a9e9582 100644 --- a/cpp/benchmarks/transform/transform.cpp +++ b/cpp/benchmarks/transform/transform.cpp @@ -101,6 +101,7 @@ static void BM_transform(nvbench::state& state) cudf::data_type{cudf::type_to_id()}, false, std::nullopt, + cudf::null_aware::NO, launch.get_stream().get_stream()); }); } diff --git a/cpp/cmake/Modules/ConfigureCUDA.cmake b/cpp/cmake/Modules/ConfigureCUDA.cmake index a6987803c3b..e0c5cedf2ee 100644 --- a/cpp/cmake/Modules/ConfigureCUDA.cmake +++ b/cpp/cmake/Modules/ConfigureCUDA.cmake @@ -37,13 +37,8 @@ if(DISABLE_DEPRECATION_WARNINGS) endif() # make sure we produce smallest binary size -list(APPEND CUDF_CUDA_FLAGS -Xfatbin=-compress-all) -if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" - AND (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9 AND CMAKE_CUDA_COMPILER_VERSION - VERSION_LESS 13.0) -) - list(APPEND CUDF_CUDA_FLAGS -Xfatbin=--compress-level=3) -endif() +include(${rapids-cmake-dir}/cuda/enable_fatbin_compression.cmake) +rapids_cuda_enable_fatbin_compression(VARIABLE CUDF_CUDA_FLAGS TUNE_FOR rapids) # Option to enable line info in CUDA device compilation to allow introspection when profiling / # memchecking diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake index 388c748c694..7b34de90a29 100644 --- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake +++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake @@ -33,6 +33,8 @@ function(jit_preprocess_files) get_filename_component(jit_output_directory "${ARG_OUTPUT}" DIRECTORY) list(APPEND JIT_PREPROCESSED_FILES "${ARG_OUTPUT}") + get_filename_component(ARG_OUTPUT_DIR "${ARG_OUTPUT}" DIRECTORY) + # Note: need to pass _FILE_OFFSET_BITS=64 in COMMAND due to a limitation in how conda builds # glibc add_custom_command( @@ -43,11 +45,10 @@ function(jit_preprocess_files) COMMAND ${CMAKE_COMMAND} -E make_directory "${jit_output_directory}" COMMAND "${CMAKE_COMMAND}" -E env LD_LIBRARY_PATH=${CUDAToolkit_LIBRARY_DIR} - $ 
${ARG_FILE} -o - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files -i -std=c++20 + $ ${ARG_FILE} -o ${ARG_OUTPUT_DIR} -i -std=c++20 -remove-unused-globals -D_FILE_OFFSET_BITS=64 -D__CUDACC_RTC__ -DCUDF_RUNTIME_JIT -I${CUDF_SOURCE_DIR}/include -I${CUDF_SOURCE_DIR}/src ${includes} - --no-preinclude-workarounds --no-replace-pragma-once + --no-preinclude-workarounds --no-replace-pragma-once --diag-suppress=47 --device-int128 COMMENT "Custom command to JIT-compile files." ) endforeach() diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index c519fa687c3..1a52d263ea1 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -23,17 +23,26 @@ include_guard(GLOBAL) # This function finds arrow and sets any additional necessary environment variables. -function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_PARQUET) +function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_PARQUET + ENABLE_COMPUTE +) if(BUILD_STATIC) if(TARGET arrow_static) set(ARROW_FOUND TRUE PARENT_SCOPE ) - set(ARROW_LIBRARIES - arrow_static - PARENT_SCOPE - ) + if(ENABLE_COMPUTE) + set(ARROW_LIBRARIES + arrow_static arrow_compute_static + PARENT_SCOPE + ) + else() + set(ARROW_LIBRARIES + arrow_static + PARENT_SCOPE + ) + endif() return() endif() else() @@ -42,10 +51,17 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P TRUE PARENT_SCOPE ) - set(ARROW_LIBRARIES - arrow_shared - PARENT_SCOPE - ) + if(ENABLE_COMPUTE) + set(ARROW_LIBRARIES + arrow_shared arrow_compute_shared + PARENT_SCOPE + ) + else() + set(ARROW_LIBRARIES + arrow_shared + PARENT_SCOPE + ) + endif() return() endif() endif() @@ -82,8 +98,9 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P rapids_cpm_find( Arrow ${VERSION} - GLOBAL_TARGETS arrow_shared parquet_shared arrow_acero_shared arrow_dataset_shared arrow_static - parquet_static arrow_acero_static arrow_dataset_static + GLOBAL_TARGETS + arrow_shared parquet_shared arrow_acero_shared arrow_dataset_shared arrow_compute_shared + arrow_static parquet_static arrow_acero_static arrow_dataset_static arrow_compute_static CPM_ARGS GIT_REPOSITORY https://github.com/apache/arrow.git GIT_TAG apache-arrow-${VERSION} @@ -91,6 +108,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P EXCLUDE_FROM_ALL ${EXCLUDE_FROM_ALL} OPTIONS "CMAKE_VERBOSE_MAKEFILE ON" "ARROW_ACERO ON" + "ARROW_COMPUTE ${ENABLE_COMPUTE}" "ARROW_IPC ON" "ARROW_DATASET ON" "ARROW_WITH_BACKTRACE ON" @@ -124,9 +142,17 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P ) if(BUILD_STATIC) - set(ARROW_LIBRARIES arrow_static) + if(ENABLE_COMPUTE) + set(ARROW_LIBRARIES arrow_static arrow_compute_static) + else() + set(ARROW_LIBRARIES arrow_static) + endif() else() - set(ARROW_LIBRARIES arrow_shared) + if(ENABLE_COMPUTE) + set(ARROW_LIBRARIES arrow_shared arrow_compute_shared) + else() + set(ARROW_LIBRARIES arrow_shared) + endif() endif() # Arrow_DIR: set if CPM found Arrow on the system/conda/etc. 
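# The ENABLE_COMPUTE plumbing above only pulls in the arrow_compute libraries when a caller opts
# in. A minimal sketch of opting in at configure time (the build-directory layout shown here is an
# assumption for illustration, not something this file prescribes):
#
#   cmake -S cpp -B cpp/build -DCUDF_ENABLE_ARROW_COMPUTE=ON
#
# When the variable is left undefined, CUDF_ENABLE_ARROW_COMPUTE defaults to OFF later in this file.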
@@ -145,6 +171,9 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P # us set(ArrowDataset_DIR "${Arrow_DIR}") find_package(ArrowDataset REQUIRED QUIET) + # Set this to enable `find_package(ArrowCompute)` + set(ArrowCompute_DIR "${Arrow_DIR}") + find_package(ArrowCompute REQUIRED QUIET) endif() # Arrow_ADDED: set if CPM downloaded Arrow from Github elseif(Arrow_ADDED) @@ -243,7 +272,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P BUILD Arrow VERSION ${VERSION} EXPORT_SET arrow_targets - GLOBAL_TARGETS arrow_shared arrow_static + GLOBAL_TARGETS arrow_shared arrow_static arrow_compute_static arrow_compute_shared NAMESPACE cudf:: FINAL_CODE_BLOCK arrow_code_string ) @@ -288,6 +317,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P NAMESPACE cudf:: FINAL_CODE_BLOCK arrow_dataset_code_string ) + set(parquet_code_string [=[ if (TARGET cudf::parquet_shared AND (NOT TARGET parquet_shared)) @@ -320,6 +350,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P rapids_export_package(BUILD Parquet cudf-exports) rapids_export_package(BUILD ArrowDataset cudf-exports) endif() + rapids_export_package(BUILD ArrowCompute cudf-exports) include("${rapids-cmake-dir}/export/find_package_root.cmake") rapids_export_find_package_root( @@ -335,6 +366,9 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P EXPORT_SET cudf-exports CONDITION ENABLE_PARQUET ) + rapids_export_find_package_root( + BUILD ArrowCompute [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports + ) endif() set(ARROW_LIBRARIES @@ -347,7 +381,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow) set(CUDF_VERSION_Arrow # This version must be kept in sync with the libarrow version pinned for builds in # dependencies.yaml. - 19.0.0 + 21.0.0 CACHE STRING "The version of Arrow to find (or build)" ) endif() @@ -366,7 +400,11 @@ if(NOT DEFINED CUDF_ENABLE_ARROW_PARQUET) set(CUDF_ENABLE_ARROW_PARQUET OFF) endif() +if(NOT DEFINED CUDF_ENABLE_ARROW_COMPUTE) + set(CUDF_ENABLE_ARROW_COMPUTE OFF) +endif() + find_and_configure_arrow( ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_EXCLUDE_ARROW_FROM_ALL} - ${CUDF_ENABLE_ARROW_PARQUET} + ${CUDF_ENABLE_ARROW_PARQUET} ${CUDF_ENABLE_ARROW_COMPUTE} ) diff --git a/cpp/cmake/thirdparty/get_croaring.cmake b/cpp/cmake/thirdparty/get_croaring.cmake new file mode 100644 index 00000000000..993006b135d --- /dev/null +++ b/cpp/cmake/thirdparty/get_croaring.cmake @@ -0,0 +1,47 @@ +# ============================================================================= +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# Use CPM to clone CRoaring and set up the necessary targets and include directories. 
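# A minimal sketch of consuming what the function below configures (the `cudf` target name and the
# include call site are assumptions for illustration; only the `roaring` target and the
# `roaring_INCLUDE_DIR` variable come from this file):
#
#   include(cmake/thirdparty/get_croaring.cmake)
#   target_link_libraries(cudf PRIVATE roaring)
#
# The function also exports `roaring_INCLUDE_DIR` to the parent scope for callers that need the
# CRoaring headers directly.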
+function(find_and_configure_roaring VERSION) + rapids_cpm_find( + roaring ${VERSION} + GLOBAL_TARGETS roaring + CPM_ARGS + GIT_REPOSITORY https://github.com/RoaringBitmap/CRoaring.git + GIT_TAG v${VERSION} + GIT_SHALLOW TRUE + OPTIONS "ROARING_BUILD_STATIC ON" + "BUILD_SHARED_LIBS OFF" + "ENABLE_ROARING_TESTS OFF" + "ENABLE_ROARING_MICROBENCHMARKS OFF" + "ROARING_DISABLE_NEON ON" + "ROARING_DISABLE_X64 ON" + "ROARING_DISABLE_AVX2 ON" + "ROARING_DISABLE_AVX512 ON" + ) + if(roaring_ADDED) + set_target_properties(roaring PROPERTIES POSITION_INDEPENDENT_CODE ON) + endif() + + if(DEFINED roaring_SOURCE_DIR) + set(roaring_INCLUDE_DIR + "${roaring_SOURCE_DIR}" + PARENT_SCOPE + ) + endif() + +endfunction() + +set(roaring_VERSION_cudf "4.3.11") +find_and_configure_roaring(${roaring_VERSION_cudf}) diff --git a/cpp/cmake/thirdparty/get_jitify.cmake b/cpp/cmake/thirdparty/get_jitify.cmake index d98abdf8824..5db4e3e907f 100644 --- a/cpp/cmake/thirdparty/get_jitify.cmake +++ b/cpp/cmake/thirdparty/get_jitify.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -18,9 +18,9 @@ function(find_and_configure_jitify) rapids_cpm_find( jitify 2.0.0 - GIT_REPOSITORY https://github.com/rapidsai/jitify.git - GIT_TAG jitify2 - GIT_SHALLOW TRUE + GIT_REPOSITORY https://github.com/NVIDIA/jitify.git + GIT_TAG 44e978b21fc8bdb6b2d7d8d179523c8350db72e5 # jitify2 branch as of 23rd Aug 2025 + GIT_SHALLOW FALSE DOWNLOAD_ONLY TRUE ) set(JITIFY_INCLUDE_DIR diff --git a/cpp/cmake/thirdparty/get_nvcomp.cmake b/cpp/cmake/thirdparty/get_nvcomp.cmake index 33b1b45fb44..bb5c0c3c215 100644 --- a/cpp/cmake/thirdparty/get_nvcomp.cmake +++ b/cpp/cmake/thirdparty/get_nvcomp.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -20,7 +20,7 @@ function(find_and_configure_nvcomp) if(CUDF_EXPORT_NVCOMP) set(export_args BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) endif() - rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP}) + rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ON) # Per-thread default stream if(TARGET nvcomp AND CUDF_USE_PER_THREAD_DEFAULT_STREAM) diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index 9c319b9048e..52e3a47cb9a 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -655,6 +655,21 @@ kernel<<<...>>>(int_scalar.data(),...); int host_value = int_scalar.value(); ``` +##### cudf::detail::device_scalar +Acts as a drop-in replacement for `rmm::device_scalar`, with the key difference +being the use of pinned host memory as a bounce buffer for data transfers. +It is recommended for internal use to avoid the implicit synchronization overhead caused by +memcpy operations on pageable host memory. 
+
+```c++
+// Same as the case with rmm::device_scalar above
+cudf::detail::device_scalar<int32_t> int_scalar{42, stream, mr};
+kernel<<<...>>>(int_scalar.data(),...);
+
+// Note: This device-to-host transfer uses a host-pinned bounce buffer for efficient memcpy
+int host_value = int_scalar.value();
+```
+
 #### rmm::device_vector
 
 Allocates a specified number of elements of the specified type. If no initialization value is
diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt
index 3ca878f1497..6e9604ff15e 100644
--- a/cpp/examples/basic/CMakeLists.txt
+++ b/cpp/examples/basic/CMakeLists.txt
@@ -18,10 +18,15 @@ include(../fetch_dependencies.cmake)
 include(rapids-cmake)
 rapids_cmake_build_type("Release")
 
+# For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the
+# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with
+# gcc>=14. We can remove this once we upgrade to a newer sccache version.
+set(CMAKE_CXX_SCAN_FOR_MODULES OFF)
+
 # Configure your project here
 add_executable(basic_example src/process_csv.cpp)
 target_link_libraries(basic_example PRIVATE cudf::cudf)
 target_compile_features(basic_example PRIVATE cxx_std_20)
 
-install(TARGETS basic_example DESTINATION bin/examples/libcudf)
-install(FILES ${CMAKE_CURRENT_LIST_DIR}/4stock_5day.csv DESTINATION bin/examples/libcudf)
+install(TARGETS basic_example DESTINATION bin/examples/libcudf/basic)
+install(FILES ${CMAKE_CURRENT_LIST_DIR}/4stock_5day.csv DESTINATION bin/examples/libcudf/basic)
diff --git a/cpp/examples/billion_rows/CMakeLists.txt b/cpp/examples/billion_rows/CMakeLists.txt
index c0de82ac85a..ed83be6216a 100644
--- a/cpp/examples/billion_rows/CMakeLists.txt
+++ b/cpp/examples/billion_rows/CMakeLists.txt
@@ -18,6 +18,11 @@ include(../fetch_dependencies.cmake)
 include(rapids-cmake)
 rapids_cmake_build_type("Release")
 
+# For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the
+# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with
+# gcc>=14. We can remove this once we upgrade to a newer sccache version.
+set(CMAKE_CXX_SCAN_FOR_MODULES OFF) + list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) add_library(groupby_results OBJECT groupby_results.cpp) @@ -30,7 +35,7 @@ target_link_libraries( $ ) target_compile_features(brc PRIVATE cxx_std_20) -install(TARGETS brc DESTINATION bin/examples/libcudf) +install(TARGETS brc DESTINATION bin/examples/libcudf/billion_rows) add_executable(brc_chunks brc_chunks.cpp) target_link_libraries( @@ -38,7 +43,7 @@ target_link_libraries( $ ) target_compile_features(brc_chunks PRIVATE cxx_std_20) -install(TARGETS brc_chunks DESTINATION bin/examples/libcudf) +install(TARGETS brc_chunks DESTINATION bin/examples/libcudf/billion_rows) add_executable(brc_pipeline brc_pipeline.cpp) target_link_libraries( @@ -46,4 +51,4 @@ target_link_libraries( $ ) target_compile_features(brc_pipeline PRIVATE cxx_std_20) -install(TARGETS brc_pipeline DESTINATION bin/examples/libcudf) +install(TARGETS brc_pipeline DESTINATION bin/examples/libcudf/billion_rows) diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index e6ceaf5b6e6..7296a1afd04 100755 --- a/cpp/examples/build.sh +++ b/cpp/examples/build.sh @@ -63,4 +63,3 @@ build_example string_transforms build_example nested_types build_example parquet_io build_example billion_rows -build_example interop diff --git a/cpp/examples/interop/CMakeLists.txt b/cpp/examples/interop/CMakeLists.txt deleted file mode 100644 index 1ea9779d4cc..00000000000 --- a/cpp/examples/interop/CMakeLists.txt +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. - -cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR) - -include(../set_cuda_architecture.cmake) - -rapids_cuda_init_architectures(interop_example) - -project( - interop_example - VERSION 0.0.1 - LANGUAGES CXX CUDA -) - -include(../fetch_dependencies.cmake) - -include(rapids-cmake) -rapids_cmake_build_type("Release") - -# The Arrow CMake is currently broken if the build type is not set -set(CMAKE_BUILD_TYPE Release) -# No need to install Arrow libs when only the final example executable is shipped. -set(CUDF_EXCLUDE_ARROW_FROM_ALL ON) -include(../../cmake/thirdparty/get_arrow.cmake) - -add_executable(interop interop.cpp) -target_link_libraries(interop PRIVATE cudf::cudf) -target_compile_features(interop PRIVATE cxx_std_20) -target_link_libraries(interop PRIVATE ${ARROW_LIBRARIES}) diff --git a/cpp/examples/interop/interop.cpp b/cpp/examples/interop/interop.cpp deleted file mode 100644 index b01b04489a6..00000000000 --- a/cpp/examples/interop/interop.cpp +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include - -#include - -#include -#include - -// Helper functions to create StringViews -inline arrow::StringViewType::c_type to_inline_string_view(const void* data, int32_t const& size) -{ - arrow::StringViewType::c_type out; - out.inlined = {size, {}}; - memcpy(&out.inlined.data, data, size); - return out; -} -inline arrow::StringViewType::c_type to_inline_string_view(std::string_view const& v) -{ - return to_inline_string_view(v.data(), static_cast(v.size())); -} -inline arrow::StringViewType::c_type to_string_view(const void* data, - int32_t const& size, - int32_t const& buffer_index, - int32_t const& offset) -{ - if (size <= arrow::StringViewType::kInlineSize) { return to_inline_string_view(data, size); } - arrow::StringViewType::c_type out; - out.ref = {size, {}, buffer_index, offset}; - memcpy(&out.ref.prefix, data, sizeof(out.ref.prefix)); - return out; -} -inline arrow::StringViewType::c_type to_string_view(std::string_view const& v, - int32_t const& buffer_index, - int32_t const& offset) -{ - return to_string_view(v.data(), static_cast(v.size()), buffer_index, offset); -} - -/** - * @brief Create a StringViewArray - * - * @param data_buffers The data buffers - * @param views The string views - * @param validate Whether to validate the array - */ -arrow::Result> make_string_view_array( - arrow::BufferVector const& data_buffers, - std::vector const& views, - bool validate = true) -{ - auto const length = static_cast(views.size()); - auto const arr = std::make_shared( - arrow::utf8_view(), length, arrow::Buffer::FromVector(views), std::move(data_buffers)); - if (validate) { RETURN_NOT_OK(arr->ValidateFull()); } - return arr; -} - -/** - * @brief Convert a vector of strings into a vector of the - * constituent chars and a vector of offsets. 
- * - * @param strings The vector of strings - */ -auto make_chars_and_offsets(std::vector const& strings) -{ - std::vector chars{}; - std::vector offsets(1, 0); - for (auto& str : strings) { - chars.insert(chars.end(), std::cbegin(str), std::cend(str)); - auto const last_offset = static_cast(offsets.back()); - auto const next_offset = last_offset + str.length(); - CUDF_EXPECTS( - next_offset < static_cast(std::numeric_limits::max()), - "Cannot use arrow_string_view_to_cudf_column to build a large strings column"); - offsets.push_back(static_cast(next_offset)); - } - return std::make_tuple(std::move(chars), std::move(offsets)); -}; - -/** - * @brief Convert an Arrow StringViewArray to a cudf::column - * - * @param array The Arrow StringViewArray - * @param stream The CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - */ -std::unique_ptr arrow_string_view_to_cudf_column( - std::shared_ptr const& array, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) -{ - // Convert the string views into chars and offsets - std::vector strings; - for (auto i = 0; i < array->length(); i++) { - strings.push_back(array->GetString(i)); - } - auto const [chars, offsets] = make_chars_and_offsets(strings); - - // Copy the chars vector to the device - rmm::device_uvector d_chars(chars.size(), stream, mr); - CUDF_CUDA_TRY(cudaMemcpyAsync( - d_chars.data(), chars.data(), chars.size() * sizeof(char), cudaMemcpyDefault, stream.value())); - - // Copy the offsets vector to the device - // and wrap it in a cudf::column - rmm::device_uvector d_offsets(offsets.size(), stream, mr); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_offsets.data(), - offsets.data(), - offsets.size() * sizeof(cudf::size_type), - cudaMemcpyDefault, - stream.value())); - auto offsets_col = - std::make_unique(std::move(d_offsets), rmm::device_buffer{0, stream, mr}, 0); - - // Create a string column out of the chars and offsets - return cudf::make_strings_column(array->length(), - std::move(offsets_col), - d_chars.release(), - 0, - rmm::device_buffer{0, stream, mr}); -} - -int main(int argc, char** argv) -{ - std::vector> data_buffers; - std::vector views; - - // Define the data buffers and string views - auto const buffer_a = - arrow::Buffer::FromString("hello rapids teamapache arrow interopnvidiacudf"); - data_buffers.push_back(buffer_a); - views.push_back(to_string_view("hello rapid steam", 0, 0)); - views.push_back(to_string_view("apache arrow interop", 0, 17)); - views.push_back(to_inline_string_view("nvidia")); - views.push_back(to_inline_string_view("cudf")); - - // Create a StringViewArray - auto const string_view_col = make_string_view_array(data_buffers, views, true).ValueOrDie(); - std::cout << string_view_col->ToString() << std::endl; - - // Convert the StringViewArray to a cudf::column - auto const cudf_col = arrow_string_view_to_cudf_column(string_view_col); - - // Write the cudf::column as CSV - auto const tbl_view = cudf::table_view({cudf_col->view()}); - std::vector const names = {"col_a"}; - - std::vector h_buffer; - cudf::io::csv_writer_options writer_options = - cudf::io::csv_writer_options::builder(cudf::io::sink_info(&h_buffer), tbl_view) - .include_header(not names.empty()) - .names(names); - - cudf::io::write_csv(writer_options); - auto const result = std::string(h_buffer.data(), h_buffer.size()); - std::cout << result << std::endl; - - return 0; 
-} diff --git a/cpp/examples/nested_types/CMakeLists.txt b/cpp/examples/nested_types/CMakeLists.txt index b5d71d3262d..6532585f496 100644 --- a/cpp/examples/nested_types/CMakeLists.txt +++ b/cpp/examples/nested_types/CMakeLists.txt @@ -18,10 +18,15 @@ include(../fetch_dependencies.cmake) include(rapids-cmake) rapids_cmake_build_type("Release") +# For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the +# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with +# gcc>=14. We can remove this once we upgrade to a newer sccache version. +set(CMAKE_CXX_SCAN_FOR_MODULES OFF) + # Configure your project here add_executable(deduplication deduplication.cpp) target_link_libraries(deduplication PRIVATE cudf::cudf) target_compile_features(deduplication PRIVATE cxx_std_20) -install(TARGETS deduplication DESTINATION bin/examples/libcudf) -install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.json DESTINATION bin/examples/libcudf) +install(TARGETS deduplication DESTINATION bin/examples/libcudf/nested_types) +install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.json DESTINATION bin/examples/libcudf/nested_types) diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt index 7bcd22445dd..3c381d1b4c0 100644 --- a/cpp/examples/parquet_io/CMakeLists.txt +++ b/cpp/examples/parquet_io/CMakeLists.txt @@ -18,6 +18,11 @@ include(../fetch_dependencies.cmake) include(rapids-cmake) rapids_cmake_build_type("Release") +# For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the +# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with +# gcc>=14. We can remove this once we upgrade to a newer sccache version. +set(CMAKE_CXX_SCAN_FOR_MODULES OFF) + add_library(parquet_io_utils OBJECT common_utils.cpp io_source.cpp) target_compile_features(parquet_io_utils PRIVATE cxx_std_20) target_link_libraries(parquet_io_utils PRIVATE cudf::cudf) @@ -29,7 +34,7 @@ target_link_libraries( $ ) target_compile_features(parquet_io PRIVATE cxx_std_20) -install(TARGETS parquet_io DESTINATION bin/examples/libcudf) +install(TARGETS parquet_io DESTINATION bin/examples/libcudf/parquet_io) # Build and install parquet_io_multithreaded add_executable(parquet_io_multithreaded parquet_io_multithreaded.cpp) @@ -38,7 +43,7 @@ target_link_libraries( $ ) target_compile_features(parquet_io_multithreaded PRIVATE cxx_std_20) -install(TARGETS parquet_io_multithreaded DESTINATION bin/examples/libcudf) +install(TARGETS parquet_io_multithreaded DESTINATION bin/examples/libcudf/parquet_io) # Install the example.parquet file -install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.parquet DESTINATION bin/examples/libcudf) +install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.parquet DESTINATION bin/examples/libcudf/parquet_io) diff --git a/cpp/examples/string_transforms/CMakeLists.txt b/cpp/examples/string_transforms/CMakeLists.txt index c1f3aff2e5c..fb31c93ba21 100644 --- a/cpp/examples/string_transforms/CMakeLists.txt +++ b/cpp/examples/string_transforms/CMakeLists.txt @@ -18,6 +18,11 @@ include(../fetch_dependencies.cmake) include(rapids-cmake) rapids_cmake_build_type("Release") +# For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the +# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with +# gcc>=14. We can remove this once we upgrade to a newer sccache version. 
+set(CMAKE_CXX_SCAN_FOR_MODULES OFF) + list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) add_executable(compute_checksum_jit compute_checksum_jit.cpp) @@ -28,7 +33,7 @@ target_compile_options( target_link_libraries( compute_checksum_jit PRIVATE cudf::cudf $ ) -install(TARGETS compute_checksum_jit DESTINATION bin/examples/libcudf) +install(TARGETS compute_checksum_jit DESTINATION bin/examples/libcudf/string_transformers) add_executable(extract_email_jit extract_email_jit.cpp) target_compile_features(extract_email_jit PRIVATE cxx_std_20) @@ -36,7 +41,7 @@ target_compile_options(extract_email_jit PRIVATE "$<$:${C target_link_libraries( extract_email_jit PRIVATE cudf::cudf $ ) -install(TARGETS extract_email_jit DESTINATION bin/examples/libcudf) +install(TARGETS extract_email_jit DESTINATION bin/examples/libcudf/string_transformers) add_executable(extract_email_precompiled extract_email_precompiled.cpp) target_compile_features(extract_email_precompiled PRIVATE cxx_std_20) @@ -46,13 +51,13 @@ target_compile_options( target_link_libraries( extract_email_precompiled PRIVATE cudf::cudf $ ) -install(TARGETS extract_email_precompiled DESTINATION bin/examples/libcudf) +install(TARGETS extract_email_precompiled DESTINATION bin/examples/libcudf/string_transformers) add_executable(format_phone_jit format_phone_jit.cpp) target_compile_features(format_phone_jit PRIVATE cxx_std_20) target_compile_options(format_phone_jit PRIVATE "$<$:${CUDF_CUDA_FLAGS}>") target_link_libraries(format_phone_jit PRIVATE cudf::cudf $) -install(TARGETS format_phone_jit DESTINATION bin/examples/libcudf) +install(TARGETS format_phone_jit DESTINATION bin/examples/libcudf/string_transformers) add_executable(format_phone_precompiled format_phone_precompiled.cpp) target_compile_features(format_phone_precompiled PRIVATE cxx_std_20) @@ -62,7 +67,7 @@ target_compile_options( target_link_libraries( format_phone_precompiled PRIVATE cudf::cudf $ ) -install(TARGETS format_phone_precompiled DESTINATION bin/examples/libcudf) +install(TARGETS format_phone_precompiled DESTINATION bin/examples/libcudf/string_transformers) add_executable(localize_phone_jit localize_phone_jit.cpp) target_compile_features(localize_phone_jit PRIVATE cxx_std_20) @@ -70,7 +75,7 @@ target_compile_options(localize_phone_jit PRIVATE "$<$:${ target_link_libraries( localize_phone_jit PRIVATE cudf::cudf $ ) -install(TARGETS localize_phone_jit DESTINATION bin/examples/libcudf) +install(TARGETS localize_phone_jit DESTINATION bin/examples/libcudf/string_transformers) add_executable(localize_phone_precompiled localize_phone_precompiled.cpp) target_compile_features(localize_phone_precompiled PRIVATE cxx_std_20) @@ -80,6 +85,8 @@ target_compile_options( target_link_libraries( localize_phone_precompiled PRIVATE cudf::cudf $ ) -install(TARGETS localize_phone_precompiled DESTINATION bin/examples/libcudf) +install(TARGETS localize_phone_precompiled DESTINATION bin/examples/libcudf/string_transformers) -install(FILES ${CMAKE_CURRENT_LIST_DIR}/info.csv DESTINATION bin/examples/libcudf) +install(FILES ${CMAKE_CURRENT_LIST_DIR}/info.csv + DESTINATION bin/examples/libcudf/string_transformers +) diff --git a/cpp/examples/string_transforms/compute_checksum_jit.cpp b/cpp/examples/string_transforms/compute_checksum_jit.cpp index f1fbc289406..e23729a930d 100644 --- a/cpp/examples/string_transforms/compute_checksum_jit.cpp +++ b/cpp/examples/string_transforms/compute_checksum_jit.cpp @@ -49,8 +49,14 @@ std::tuple, std::vector> transform( auto name = table.column(0); 
auto email = table.column(1); - auto result = cudf::transform( - {name, email}, udf, cudf::data_type{cudf::type_id::UINT16}, false, std::nullopt, stream, mr); + auto result = cudf::transform({name, email}, + udf, + cudf::data_type{cudf::type_id::UINT16}, + false, + std::nullopt, + cudf::null_aware::NO, + stream, + mr); return std::make_tuple(std::move(result), transformed); } diff --git a/cpp/examples/string_transforms/extract_email_jit.cpp b/cpp/examples/string_transforms/extract_email_jit.cpp index 7a20946dcd5..686dc5814b3 100644 --- a/cpp/examples/string_transforms/extract_email_jit.cpp +++ b/cpp/examples/string_transforms/extract_email_jit.cpp @@ -64,8 +64,14 @@ __device__ void email_provider(cudf::string_view* out, auto transformed = std::vector{1}; auto emails = table.column(1); - auto providers = cudf::transform( - {emails, *alt}, udf, cudf::data_type{cudf::type_id::STRING}, false, std::nullopt, stream, mr); + auto providers = cudf::transform({emails, *alt}, + udf, + cudf::data_type{cudf::type_id::STRING}, + false, + std::nullopt, + cudf::null_aware::NO, + stream, + mr); return {std::move(providers), std::move(transformed)}; } diff --git a/cpp/examples/string_transforms/format_phone_jit.cpp b/cpp/examples/string_transforms/format_phone_jit.cpp index 680d82a489b..dfbf106fec4 100644 --- a/cpp/examples/string_transforms/format_phone_jit.cpp +++ b/cpp/examples/string_transforms/format_phone_jit.cpp @@ -132,6 +132,7 @@ __device__ void e164_format(void* scratch, cudf::data_type{cudf::type_id::STRING}, false, scratch.data(), + cudf::null_aware::NO, stream, mr); diff --git a/cpp/examples/string_transforms/localize_phone_jit.cpp b/cpp/examples/string_transforms/localize_phone_jit.cpp index 1c065ca11f0..cd720ec47d0 100644 --- a/cpp/examples/string_transforms/localize_phone_jit.cpp +++ b/cpp/examples/string_transforms/localize_phone_jit.cpp @@ -155,6 +155,7 @@ __device__ void format_phone(void* scratch, cudf::data_type{cudf::type_id::STRING}, false, scratch.data(), + cudf::null_aware::NO, stream, mr); diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt index 7d55d60c062..1a7eb60d571 100644 --- a/cpp/examples/strings/CMakeLists.txt +++ b/cpp/examples/strings/CMakeLists.txt @@ -18,12 +18,17 @@ include(../fetch_dependencies.cmake) include(rapids-cmake) rapids_cmake_build_type("Release") +# For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the +# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with +# gcc>=14. We can remove this once we upgrade to a newer sccache version. 
+set(CMAKE_CXX_SCAN_FOR_MODULES OFF) + list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) add_executable(libcudf_apis libcudf_apis.cpp) target_compile_features(libcudf_apis PRIVATE cxx_std_20) target_link_libraries(libcudf_apis PRIVATE cudf::cudf $) -install(TARGETS libcudf_apis DESTINATION bin/examples/libcudf) +install(TARGETS libcudf_apis DESTINATION bin/examples/libcudf/strings) add_executable(custom_with_malloc custom_with_malloc.cu) target_compile_features(custom_with_malloc PRIVATE cxx_std_20) @@ -31,18 +36,18 @@ target_compile_options(custom_with_malloc PRIVATE "$<$:${ target_link_libraries( custom_with_malloc PRIVATE cudf::cudf $ ) -install(TARGETS custom_with_malloc DESTINATION bin/examples/libcudf) +install(TARGETS custom_with_malloc DESTINATION bin/examples/libcudf/strings) add_executable(custom_prealloc custom_prealloc.cu) target_compile_features(custom_prealloc PRIVATE cxx_std_20) target_compile_options(custom_prealloc PRIVATE "$<$:${CUDF_CUDA_FLAGS}>") target_link_libraries(custom_prealloc PRIVATE cudf::cudf $) -install(TARGETS custom_prealloc DESTINATION bin/examples/libcudf) +install(TARGETS custom_prealloc DESTINATION bin/examples/libcudf/strings) add_executable(custom_optimized custom_optimized.cu) target_compile_features(custom_optimized PRIVATE cxx_std_20) target_compile_options(custom_optimized PRIVATE "$<$:${CUDF_CUDA_FLAGS}>") target_link_libraries(custom_optimized PRIVATE cudf::cudf $) -install(TARGETS custom_optimized DESTINATION bin/examples/libcudf) +install(TARGETS custom_optimized DESTINATION bin/examples/libcudf/strings) -install(FILES ${CMAKE_CURRENT_LIST_DIR}/names.csv DESTINATION bin/examples/libcudf) +install(FILES ${CMAKE_CURRENT_LIST_DIR}/names.csv DESTINATION bin/examples/libcudf/strings) diff --git a/cpp/examples/versions.cmake b/cpp/examples/versions.cmake index 8d4a8335b47..4ef1f84f8c6 100644 --- a/cpp/examples/versions.cmake +++ b/cpp/examples/versions.cmake @@ -12,4 +12,4 @@ # the License. 
# ============================================================================= -set(CUDF_TAG branch-25.08) +set(CUDF_TAG branch-25.10) diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index bb086b611c7..d379674fad6 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -93,45 +93,46 @@ class aggregation { * @brief Possible aggregation operations */ enum Kind { - SUM, ///< sum reduction - PRODUCT, ///< product reduction - MIN, ///< min reduction - MAX, ///< max reduction - COUNT_VALID, ///< count number of valid elements - COUNT_ALL, ///< count number of elements - ANY, ///< any reduction - ALL, ///< all reduction - SUM_OF_SQUARES, ///< sum of squares reduction - MEAN, ///< arithmetic mean reduction - M2, ///< sum of squares of differences from the mean - VARIANCE, ///< variance - STD, ///< standard deviation - MEDIAN, ///< median reduction - QUANTILE, ///< compute specified quantile(s) - ARGMAX, ///< Index of max element - ARGMIN, ///< Index of min element - NUNIQUE, ///< count number of unique elements - NTH_ELEMENT, ///< get the nth element - ROW_NUMBER, ///< get row-number of current index (relative to rolling window) - EWMA, ///< get exponential weighted moving average at current index - RANK, ///< get rank of current index - COLLECT_LIST, ///< collect values into a list - COLLECT_SET, ///< collect values into a list without duplicate entries - LEAD, ///< window function, accesses row at specified offset following current row - LAG, ///< window function, accesses row at specified offset preceding current row - PTX, ///< PTX based UDF aggregation - CUDA, ///< CUDA based UDF aggregation - HOST_UDF, ///< host based UDF aggregation - MERGE_LISTS, ///< merge multiple lists values into one list - MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries - MERGE_M2, ///< merge partial values of M2 aggregation, - COVARIANCE, ///< covariance between two sets of elements - CORRELATION, ///< correlation between two sets of elements - TDIGEST, ///< create a tdigest from a set of input values - MERGE_TDIGEST, ///< create a tdigest by merging multiple tdigests together - HISTOGRAM, ///< compute frequency of each element - MERGE_HISTOGRAM, ///< merge partial values of HISTOGRAM aggregation - BITWISE_AGG ///< bitwise aggregation on numeric columns + SUM, ///< sum reduction + SUM_WITH_OVERFLOW, ///< sum reduction with overflow detection + PRODUCT, ///< product reduction + MIN, ///< min reduction + MAX, ///< max reduction + COUNT_VALID, ///< count number of valid elements + COUNT_ALL, ///< count number of elements + ANY, ///< any reduction + ALL, ///< all reduction + SUM_OF_SQUARES, ///< sum of squares reduction + MEAN, ///< arithmetic mean reduction + M2, ///< sum of squares of differences from the mean + VARIANCE, ///< variance + STD, ///< standard deviation + MEDIAN, ///< median reduction + QUANTILE, ///< compute specified quantile(s) + ARGMAX, ///< Index of max element + ARGMIN, ///< Index of min element + NUNIQUE, ///< count number of unique elements + NTH_ELEMENT, ///< get the nth element + ROW_NUMBER, ///< get row-number of current index (relative to rolling window) + EWMA, ///< get exponential weighted moving average at current index + RANK, ///< get rank of current index + COLLECT_LIST, ///< collect values into a list + COLLECT_SET, ///< collect values into a list without duplicate entries + LEAD, ///< window function, accesses row at specified offset following current row + LAG, ///< window function, 
accesses row at specified offset preceding current row + PTX, ///< PTX based UDF aggregation + CUDA, ///< CUDA based UDF aggregation + HOST_UDF, ///< host based UDF aggregation + MERGE_LISTS, ///< merge multiple lists values into one list + MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries + MERGE_M2, ///< merge partial values of M2 aggregation, + COVARIANCE, ///< covariance between two sets of elements + CORRELATION, ///< correlation between two sets of elements + TDIGEST, ///< create a tdigest from a set of input values + MERGE_TDIGEST, ///< create a tdigest by merging multiple tdigests together + HISTOGRAM, ///< compute frequency of each element + MERGE_HISTOGRAM, ///< merge partial values of HISTOGRAM aggregation + BITWISE_AGG ///< bitwise aggregation on numeric columns }; aggregation() = delete; @@ -271,6 +272,11 @@ enum class ewm_history : int32_t { INFINITE, FINITE }; template std::unique_ptr make_sum_aggregation(); +/// Factory to create a SUM_WITH_OVERFLOW aggregation +/// @return A SUM_WITH_OVERFLOW aggregation object +template +std::unique_ptr make_sum_with_overflow_aggregation(); + /// Factory to create a PRODUCT aggregation /// @return A PRODUCT aggregation object template diff --git a/cpp/include/cudf/ast/ast_operator.hpp b/cpp/include/cudf/ast/ast_operator.hpp new file mode 100644 index 00000000000..397a3550143 --- /dev/null +++ b/cpp/include/cudf/ast/ast_operator.hpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace CUDF_EXPORT cudf { + +namespace ast { +/** + * @addtogroup expressions + * @{ + * @file + */ + +/** + * @brief Enum of supported operators. 
+ */ +enum class ast_operator : int32_t { + // Binary operators + ADD, ///< operator + + SUB, ///< operator - + MUL, ///< operator * + DIV, ///< operator / using common type of lhs and rhs + TRUE_DIV, ///< operator / after promoting type to floating point + FLOOR_DIV, ///< operator / after promoting to 64 bit floating point and then + ///< flooring the result + MOD, ///< operator % + PYMOD, ///< operator % using Python's sign rules for negatives + POW, ///< lhs ^ rhs + EQUAL, ///< operator == + NULL_EQUAL, ///< operator == with Spark rules: NULL_EQUAL(null, null) is true, NULL_EQUAL(null, + ///< valid) is false, and + ///< NULL_EQUAL(valid, valid) == EQUAL(valid, valid) + NOT_EQUAL, ///< operator != + LESS, ///< operator < + GREATER, ///< operator > + LESS_EQUAL, ///< operator <= + GREATER_EQUAL, ///< operator >= + BITWISE_AND, ///< operator & + BITWISE_OR, ///< operator | + BITWISE_XOR, ///< operator ^ + LOGICAL_AND, ///< operator && + NULL_LOGICAL_AND, ///< operator && with Spark rules: NULL_LOGICAL_AND(null, null) is null, + ///< NULL_LOGICAL_AND(null, true) is + ///< null, NULL_LOGICAL_AND(null, false) is false, and NULL_LOGICAL_AND(valid, + ///< valid) == LOGICAL_AND(valid, valid) + LOGICAL_OR, ///< operator || + NULL_LOGICAL_OR, ///< operator || with Spark rules: NULL_LOGICAL_OR(null, null) is null, + ///< NULL_LOGICAL_OR(null, true) is true, + ///< NULL_LOGICAL_OR(null, false) is null, and NULL_LOGICAL_OR(valid, valid) == + ///< LOGICAL_OR(valid, valid) + // Unary operators + IDENTITY, ///< Identity function + IS_NULL, ///< Check if operand is null + SIN, ///< Trigonometric sine + COS, ///< Trigonometric cosine + TAN, ///< Trigonometric tangent + ARCSIN, ///< Trigonometric sine inverse + ARCCOS, ///< Trigonometric cosine inverse + ARCTAN, ///< Trigonometric tangent inverse + SINH, ///< Hyperbolic sine + COSH, ///< Hyperbolic cosine + TANH, ///< Hyperbolic tangent + ARCSINH, ///< Hyperbolic sine inverse + ARCCOSH, ///< Hyperbolic cosine inverse + ARCTANH, ///< Hyperbolic tangent inverse + EXP, ///< Exponential (base e, Euler number) + LOG, ///< Natural Logarithm (base e) + SQRT, ///< Square-root (x^0.5) + CBRT, ///< Cube-root (x^(1.0/3)) + CEIL, ///< Smallest integer value not less than arg + FLOOR, ///< largest integer value not greater than arg + ABS, ///< Absolute value + RINT, ///< Rounds the floating-point argument arg to an integer value + BIT_INVERT, ///< Bitwise Not (~) + NOT, ///< Logical Not (!) + CAST_TO_INT64, ///< Cast value to int64_t + CAST_TO_UINT64, ///< Cast value to uint64_t + CAST_TO_FLOAT64 ///< Cast value to double +}; + +/** @} */ // end of group +} // namespace ast + +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/ast/detail/operator_functor.cuh b/cpp/include/cudf/ast/detail/operator_functor.cuh new file mode 100644 index 00000000000..640c7548c80 --- /dev/null +++ b/cpp/include/cudf/ast/detail/operator_functor.cuh @@ -0,0 +1,760 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +namespace CUDF_EXPORT cudf { +namespace ast::detail { + +/** + * @brief Operator functor. + * + * This functor is templated on an `ast_operator`, with each template specialization defining a + * callable `operator()` that executes the operation. The functor specialization also has a member + * `arity` defining the number of operands that are accepted by the call to `operator()`. The + * `operator()` is templated on the types of its inputs (e.g. `typename LHS` and `typename RHS` for + * a binary operator). Trailing return types are defined as `decltype(result)` where `result` is + * the returned value. The trailing return types allow SFINAE to only consider template + * instantiations for valid combinations of types. This, in turn, allows the operator functors to be + * used with traits like `is_valid_binary_op` that rely on `std::is_invocable` and related features. + * + * @tparam op AST operator. + */ +template +struct operator_functor {}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs + rhs) + { + return lhs + rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs - rhs) + { + return lhs - rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs * rhs) + { + return lhs * rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs / rhs) + { + return lhs / rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept + -> decltype(static_cast(lhs) / static_cast(rhs)) + { + return static_cast(lhs) / static_cast(rhs); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept + -> decltype(floor(static_cast(lhs) / static_cast(rhs))) + { + return floor(static_cast(lhs) / static_cast(rhs)); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template > + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept + -> decltype(static_cast(lhs) % static_cast(rhs)) + requires(cuda::std::is_integral_v) + { + return static_cast(lhs) % static_cast(rhs); + } + + template > + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept + -> decltype(fmodf(static_cast(lhs), static_cast(rhs))) + requires(cuda::std::is_same_v) + { + return fmodf(static_cast(lhs), static_cast(rhs)); + } + + template > + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept + -> decltype(fmod(static_cast(lhs), static_cast(rhs))) + requires(cuda::std::is_same_v) + { + return fmod(static_cast(lhs), static_cast(rhs)); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template > + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept + -> decltype(((static_cast(lhs) % static_cast(rhs)) + + static_cast(rhs)) % + static_cast(rhs)) + requires(cuda::std::is_integral_v) + { + return ((static_cast(lhs) % 
static_cast(rhs)) + + static_cast(rhs)) % + static_cast(rhs); + } + + template > + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept + -> decltype(fmodf(fmodf(static_cast(lhs), static_cast(rhs)) + + static_cast(rhs), + static_cast(rhs))) + requires(cuda::std::is_same_v) + { + return fmodf(fmodf(static_cast(lhs), static_cast(rhs)) + + static_cast(rhs), + static_cast(rhs)); + } + + template > + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept + -> decltype(fmod(fmod(static_cast(lhs), static_cast(rhs)) + + static_cast(rhs), + static_cast(rhs))) + requires(cuda::std::is_same_v) + { + return fmod(fmod(static_cast(lhs), static_cast(rhs)) + + static_cast(rhs), + static_cast(rhs)); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept + -> decltype(cuda::std::pow(lhs, rhs)) + { + return cuda::std::pow(lhs, rhs); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs == rhs) + { + return lhs == rhs; + } +}; + +// Alias NULL_EQUAL = EQUAL in the non-nullable case. +template <> +struct operator_functor + : public operator_functor {}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs != rhs) + { + return lhs != rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs < rhs) + { + return lhs < rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs > rhs) + { + return lhs > rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs <= rhs) + { + return lhs <= rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs >= rhs) + { + return lhs >= rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs & rhs) + { + return lhs & rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs | rhs) + { + return lhs | rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs ^ rhs) + { + return lhs ^ rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs && rhs) + { + return lhs && rhs; + } +}; + +// Alias NULL_LOGICAL_AND = LOGICAL_AND in the non-nullable case. 
+template <> +struct operator_functor + : public operator_functor {}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs || rhs) + { + return lhs || rhs; + } +}; + +// Alias NULL_LOGICAL_OR = LOGICAL_OR in the non-nullable case. +template <> +struct operator_functor + : public operator_functor {}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(input) + { + return input; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> bool + { + return false; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::sin(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::sin(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::cos(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::cos(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::tan(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::tan(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::asin(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::asin(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::acos(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::acos(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::atan(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::atan(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::sinh(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::sinh(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::cosh(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::cosh(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::tanh(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::tanh(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept + -> decltype(cuda::std::asinh(input)) + requires(cuda::std::is_floating_point_v) + { + 
return cuda::std::asinh(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept + -> decltype(cuda::std::acosh(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::acosh(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept + -> decltype(cuda::std::atanh(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::atanh(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::exp(input)) + { + return cuda::std::exp(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::log(input)) + { + return cuda::std::log(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::sqrt(input)) + { + return cuda::std::sqrt(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::cbrt(input)) + { + return cuda::std::cbrt(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::ceil(input)) + { + return cuda::std::ceil(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept + -> decltype(cuda::std::floor(input)) + { + return cuda::std::floor(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + // Only accept signed or unsigned types (both require is_arithmetic to be true) + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::abs(input)) + requires(cuda::std::is_signed_v) + { + return cuda::std::abs(input); + } + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(input) + requires(cuda::std::is_unsigned_v) + { + return input; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::rint(input)) + { + return cuda::std::rint(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(~input) + { + return ~input; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(!input) + { + return !input; + } +}; + +template +struct cast { + static constexpr auto arity{1}; + template + __device__ inline auto operator()(From f) const noexcept -> To + requires(is_fixed_point()) + { + if constexpr (cuda::std::is_floating_point_v) { + return convert_fixed_to_floating(f); + } else { + return static_cast(f); + } + } + + template + __device__ inline auto operator()(From f) const noexcept -> 
decltype(static_cast(f)) + requires(!is_fixed_point()) + { + return static_cast(f); + } +}; + +template <> +struct operator_functor : cast {}; +template <> +struct operator_functor : cast {}; +template <> +struct operator_functor : cast {}; + +/* + * The default specialization of nullable operators is to fall back to the non-nullable + * implementation + */ +template +struct operator_functor { + using NonNullOperator = operator_functor; + static constexpr auto arity = NonNullOperator::arity; + + template + __device__ inline auto operator()(LHS const lhs, RHS const rhs) const noexcept + -> possibly_null_value_t + requires(arity_placeholder == 2) + { + using Out = possibly_null_value_t; + return (lhs.has_value() && rhs.has_value()) ? Out{NonNullOperator{}(*lhs, *rhs)} : Out{}; + } + + template + __device__ inline auto operator()(Input const input) const noexcept + -> possibly_null_value_t + requires(arity_placeholder == 1) + { + using Out = possibly_null_value_t; + return input.has_value() ? Out{NonNullOperator{}(*input)} : Out{}; + } +}; + +// IS_NULL(null) is true, IS_NULL(valid) is false +template <> +struct operator_functor { + using NonNullOperator = operator_functor; + static constexpr auto arity = NonNullOperator::arity; + + template + __device__ inline auto operator()(LHS const lhs) const noexcept -> bool + { + return !lhs.has_value(); + } +}; + +// NULL_EQUAL(null, null) is true, NULL_EQUAL(null, valid) is false, and NULL_EQUAL(valid, valid) == +// EQUAL(valid, valid) +template <> +struct operator_functor { + using NonNullOperator = operator_functor; + static constexpr auto arity = NonNullOperator::arity; + + template + __device__ inline auto operator()(LHS const lhs, RHS const rhs) const noexcept + -> possibly_null_value_t + { + // Case 1: Neither is null, so the output is given by the operation. + if (lhs.has_value() && rhs.has_value()) { return {NonNullOperator{}(*lhs, *rhs)}; } + // Case 2: Two nulls compare equal. + if (!lhs.has_value() && !rhs.has_value()) { return {true}; } + // Case 3: One value is null, while the other is not, so we return false. + return {false}; + } +}; + +///< NULL_LOGICAL_AND(null, null) is null, NULL_LOGICAL_AND(null, true) is null, +///< NULL_LOGICAL_AND(null, false) is false, and NULL_LOGICAL_AND(valid, valid) == +///< LOGICAL_AND(valid, valid) +template <> +struct operator_functor { + using NonNullOperator = operator_functor; + static constexpr auto arity = NonNullOperator::arity; + + template + __device__ inline auto operator()(LHS const lhs, RHS const rhs) const noexcept + -> possibly_null_value_t + { + // Case 1: Neither is null, so the output is given by the operation. + if (lhs.has_value() && rhs.has_value()) { return {NonNullOperator{}(*lhs, *rhs)}; } + // Case 2: Two nulls return null. + if (!lhs.has_value() && !rhs.has_value()) { return {}; } + // Case 3: One value is null, while the other is not. If it's true we return null, otherwise we + // return false. + auto const& valid_element = lhs.has_value() ? 
lhs : rhs; + if (*valid_element) { return {}; } + return {false}; + } +}; + +///< NULL_LOGICAL_OR(null, null) is null, NULL_LOGICAL_OR(null, true) is true, NULL_LOGICAL_OR(null, +///< false) is null, and NULL_LOGICAL_OR(valid, valid) == LOGICAL_OR(valid, valid) +template <> +struct operator_functor { + using NonNullOperator = operator_functor; + static constexpr auto arity = NonNullOperator::arity; + + template + __device__ inline auto operator()(LHS const lhs, RHS const rhs) const noexcept + -> possibly_null_value_t + { + // Case 1: Neither is null, so the output is given by the operation. + if (lhs.has_value() && rhs.has_value()) { return {NonNullOperator{}(*lhs, *rhs)}; } + // Case 2: Two nulls return null. + if (!lhs.has_value() && !rhs.has_value()) { return {}; } + // Case 3: One value is null, while the other is not. If it's true we return true, otherwise we + // return null. + auto const& valid_element = lhs.has_value() ? lhs : rhs; + if (*valid_element) { return {true}; } + return {}; + } +}; + +} // namespace ast::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/ast/detail/operators.cuh b/cpp/include/cudf/ast/detail/operators.cuh index 1a7f7357f7e..333b8be2de3 100644 --- a/cpp/include/cudf/ast/detail/operators.cuh +++ b/cpp/include/cudf/ast/detail/operators.cuh @@ -15,37 +15,20 @@ */ #pragma once +#include +#include +#include #include #include #include #include #include -#include -#include #include namespace CUDF_EXPORT cudf { namespace ast::detail { -// Type trait for wrapping nullable types in a cuda::std::optional. Non-nullable -// types are returned as is. -template -struct possibly_null_value; - -template -struct possibly_null_value { - using type = cuda::std::optional; -}; - -template -struct possibly_null_value { - using type = T; -}; - -template -using possibly_null_value_t = typename possibly_null_value::type; - // Traits for valid operator / type combinations template constexpr bool is_valid_binary_op = cuda::std::is_invocable_v; @@ -177,734 +160,5 @@ CUDF_HOST_DEVICE inline constexpr decltype(auto) ast_operator_dispatcher(ast_ope } } -/** - * @brief Operator functor. - * - * This functor is templated on an `ast_operator`, with each template specialization defining a - * callable `operator()` that executes the operation. The functor specialization also has a member - * `arity` defining the number of operands that are accepted by the call to `operator()`. The - * `operator()` is templated on the types of its inputs (e.g. `typename LHS` and `typename RHS` for - * a binary operator). Trailing return types are defined as `decltype(result)` where `result` is - * the returned value. The trailing return types allow SFINAE to only consider template - * instantiations for valid combinations of types. This, in turn, allows the operator functors to be - * used with traits like `is_valid_binary_op` that rely on `std::is_invocable` and related features. - * - * @tparam op AST operator. 
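Reviewer note: the NULL_EQUAL, NULL_LOGICAL_AND and NULL_LOGICAL_OR specializations above implement Spark/Kleene three-valued logic. A host-side sketch of the AND/OR rules using `std::optional<bool>` in place of `possibly_null_value_t`; illustrative only, not libcudf code:

#include <optional>
#include <cassert>

std::optional<bool> null_logical_and(std::optional<bool> lhs, std::optional<bool> rhs)
{
  if (lhs && rhs) { return *lhs && *rhs; }    // both valid: ordinary AND
  if (!lhs && !rhs) { return std::nullopt; }  // both null: null
  auto const& valid = lhs ? lhs : rhs;        // exactly one operand is valid
  return *valid ? std::nullopt : std::optional{false};  // null AND true -> null, null AND false -> false
}

std::optional<bool> null_logical_or(std::optional<bool> lhs, std::optional<bool> rhs)
{
  if (lhs && rhs) { return *lhs || *rhs; }
  if (!lhs && !rhs) { return std::nullopt; }
  auto const& valid = lhs ? lhs : rhs;
  return *valid ? std::optional{true} : std::nullopt;   // null OR true -> true, null OR false -> null
}

int main()
{
  assert(null_logical_and(std::nullopt, false) == std::optional{false});
  assert(!null_logical_and(std::nullopt, true).has_value());
  assert(null_logical_or(std::nullopt, true) == std::optional{true});
  assert(!null_logical_or(std::nullopt, false).has_value());
  return 0;
}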
- */ -template -struct operator_functor {}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs + rhs) - { - return lhs + rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs - rhs) - { - return lhs - rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs * rhs) - { - return lhs * rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs / rhs) - { - return lhs / rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept - -> decltype(static_cast(lhs) / static_cast(rhs)) - { - return static_cast(lhs) / static_cast(rhs); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept - -> decltype(floor(static_cast(lhs) / static_cast(rhs))) - { - return floor(static_cast(lhs) / static_cast(rhs)); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template > - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept - -> decltype(static_cast(lhs) % static_cast(rhs)) - requires(cuda::std::is_integral_v) - { - return static_cast(lhs) % static_cast(rhs); - } - - template > - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept - -> decltype(fmodf(static_cast(lhs), static_cast(rhs))) - requires(cuda::std::is_same_v) - { - return fmodf(static_cast(lhs), static_cast(rhs)); - } - - template > - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept - -> decltype(fmod(static_cast(lhs), static_cast(rhs))) - requires(cuda::std::is_same_v) - { - return fmod(static_cast(lhs), static_cast(rhs)); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template > - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept - -> decltype(((static_cast(lhs) % static_cast(rhs)) + - static_cast(rhs)) % - static_cast(rhs)) - requires(cuda::std::is_integral_v) - { - return ((static_cast(lhs) % static_cast(rhs)) + - static_cast(rhs)) % - static_cast(rhs); - } - - template > - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept - -> decltype(fmodf(fmodf(static_cast(lhs), static_cast(rhs)) + - static_cast(rhs), - static_cast(rhs))) - requires(cuda::std::is_same_v) - { - return fmodf(fmodf(static_cast(lhs), static_cast(rhs)) + - static_cast(rhs), - static_cast(rhs)); - } - - template > - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept - -> decltype(fmod(fmod(static_cast(lhs), static_cast(rhs)) + - static_cast(rhs), - static_cast(rhs))) - requires(cuda::std::is_same_v) - { - return fmod(fmod(static_cast(lhs), static_cast(rhs)) + - static_cast(rhs), - static_cast(rhs)); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept - -> decltype(cuda::std::pow(lhs, rhs)) - { - return cuda::std::pow(lhs, rhs); - } -}; - -template <> -struct operator_functor 
{ - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs == rhs) - { - return lhs == rhs; - } -}; - -// Alias NULL_EQUAL = EQUAL in the non-nullable case. -template <> -struct operator_functor - : public operator_functor {}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs != rhs) - { - return lhs != rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs < rhs) - { - return lhs < rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs > rhs) - { - return lhs > rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs <= rhs) - { - return lhs <= rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs >= rhs) - { - return lhs >= rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs & rhs) - { - return lhs & rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs | rhs) - { - return lhs | rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs ^ rhs) - { - return lhs ^ rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs && rhs) - { - return lhs && rhs; - } -}; - -// Alias NULL_LOGICAL_AND = LOGICAL_AND in the non-nullable case. -template <> -struct operator_functor - : public operator_functor {}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs || rhs) - { - return lhs || rhs; - } -}; - -// Alias NULL_LOGICAL_OR = LOGICAL_OR in the non-nullable case. 
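Reviewer note: the operator-functor documentation above relies on trailing return types so that `is_valid_binary_op` (built on `cuda::std::is_invocable`) silently rejects operator/type combinations that do not compile. A small host-side sketch of the same pattern using the standard `std::is_invocable_v`; `add_op` is a made-up functor for illustration:

#include <type_traits>
#include <string>

struct add_op {
  template <typename L, typename R>
  auto operator()(L l, R r) const -> decltype(l + r)  // SFINAE: only valid if l + r compiles
  {
    return l + r;
  }
};

static_assert(std::is_invocable_v<add_op, int, double>);        // int + double is well-formed
static_assert(!std::is_invocable_v<add_op, int, std::string>);  // int + string is rejected, no hard error

int main() { return 0; }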
-template <> -struct operator_functor - : public operator_functor {}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(input) - { - return input; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> bool - { - return false; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::sin(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::sin(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::cos(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::cos(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::tan(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::tan(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::asin(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::asin(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::acos(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::acos(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::atan(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::atan(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::sinh(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::sinh(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::cosh(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::cosh(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::tanh(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::tanh(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept - -> decltype(cuda::std::asinh(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::asinh(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept - -> decltype(cuda::std::acosh(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::acosh(input); - } -}; - -template <> -struct 
operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept - -> decltype(cuda::std::atanh(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::atanh(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::exp(input)) - { - return cuda::std::exp(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::log(input)) - { - return cuda::std::log(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::sqrt(input)) - { - return cuda::std::sqrt(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::cbrt(input)) - { - return cuda::std::cbrt(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::ceil(input)) - { - return cuda::std::ceil(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept - -> decltype(cuda::std::floor(input)) - { - return cuda::std::floor(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - // Only accept signed or unsigned types (both require is_arithmetic to be true) - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::abs(input)) - requires(cuda::std::is_signed_v) - { - return cuda::std::abs(input); - } - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(input) - requires(cuda::std::is_unsigned_v) - { - return input; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::rint(input)) - { - return cuda::std::rint(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(~input) - { - return ~input; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(!input) - { - return !input; - } -}; - -template -struct cast { - static constexpr auto arity{1}; - template - __device__ inline auto operator()(From f) const noexcept -> To - requires(is_fixed_point()) - { - if constexpr (cuda::std::is_floating_point_v) { - return convert_fixed_to_floating(f); - } else { - return static_cast(f); - } - } - - template - __device__ inline auto operator()(From f) const noexcept -> decltype(static_cast(f)) - requires(!is_fixed_point()) - { - return static_cast(f); - } -}; - -template <> -struct operator_functor : cast {}; -template <> -struct operator_functor : cast {}; -template <> -struct operator_functor : cast {}; - -/* - * The default specialization of nullable operators is to fall back to the non-nullable - * implementation - 
*/ -template -struct operator_functor { - using NonNullOperator = operator_functor; - static constexpr auto arity = NonNullOperator::arity; - - template - __device__ inline auto operator()(LHS const lhs, RHS const rhs) const noexcept - -> possibly_null_value_t - requires(arity_placeholder == 2) - { - using Out = possibly_null_value_t; - return (lhs.has_value() && rhs.has_value()) ? Out{NonNullOperator{}(*lhs, *rhs)} : Out{}; - } - - template - __device__ inline auto operator()(Input const input) const noexcept - -> possibly_null_value_t - requires(arity_placeholder == 1) - { - using Out = possibly_null_value_t; - return input.has_value() ? Out{NonNullOperator{}(*input)} : Out{}; - } -}; - -// IS_NULL(null) is true, IS_NULL(valid) is false -template <> -struct operator_functor { - using NonNullOperator = operator_functor; - static constexpr auto arity = NonNullOperator::arity; - - template - __device__ inline auto operator()(LHS const lhs) const noexcept -> bool - { - return !lhs.has_value(); - } -}; - -// NULL_EQUAL(null, null) is true, NULL_EQUAL(null, valid) is false, and NULL_EQUAL(valid, valid) == -// EQUAL(valid, valid) -template <> -struct operator_functor { - using NonNullOperator = operator_functor; - static constexpr auto arity = NonNullOperator::arity; - - template - __device__ inline auto operator()(LHS const lhs, RHS const rhs) const noexcept - -> possibly_null_value_t - { - // Case 1: Neither is null, so the output is given by the operation. - if (lhs.has_value() && rhs.has_value()) { return {NonNullOperator{}(*lhs, *rhs)}; } - // Case 2: Two nulls compare equal. - if (!lhs.has_value() && !rhs.has_value()) { return {true}; } - // Case 3: One value is null, while the other is not, so we return false. - return {false}; - } -}; - -///< NULL_LOGICAL_AND(null, null) is null, NULL_LOGICAL_AND(null, true) is null, -///< NULL_LOGICAL_AND(null, false) is false, and NULL_LOGICAL_AND(valid, valid) == -///< LOGICAL_AND(valid, valid) -template <> -struct operator_functor { - using NonNullOperator = operator_functor; - static constexpr auto arity = NonNullOperator::arity; - - template - __device__ inline auto operator()(LHS const lhs, RHS const rhs) const noexcept - -> possibly_null_value_t - { - // Case 1: Neither is null, so the output is given by the operation. - if (lhs.has_value() && rhs.has_value()) { return {NonNullOperator{}(*lhs, *rhs)}; } - // Case 2: Two nulls return null. - if (!lhs.has_value() && !rhs.has_value()) { return {}; } - // Case 3: One value is null, while the other is not. If it's true we return null, otherwise we - // return false. - auto const& valid_element = lhs.has_value() ? lhs : rhs; - if (*valid_element) { return {}; } - return {false}; - } -}; - -///< NULL_LOGICAL_OR(null, null) is null, NULL_LOGICAL_OR(null, true) is true, NULL_LOGICAL_OR(null, -///< false) is null, and NULL_LOGICAL_OR(valid, valid) == LOGICAL_OR(valid, valid) -template <> -struct operator_functor { - using NonNullOperator = operator_functor; - static constexpr auto arity = NonNullOperator::arity; - - template - __device__ inline auto operator()(LHS const lhs, RHS const rhs) const noexcept - -> possibly_null_value_t - { - // Case 1: Neither is null, so the output is given by the operation. - if (lhs.has_value() && rhs.has_value()) { return {NonNullOperator{}(*lhs, *rhs)}; } - // Case 2: Two nulls return null. - if (!lhs.has_value() && !rhs.has_value()) { return {}; } - // Case 3: One value is null, while the other is not. If it's true we return true, otherwise we - // return null. 
- auto const& valid_element = lhs.has_value() ? lhs : rhs; - if (*valid_element) { return {true}; } - return {}; - } -}; - } // namespace ast::detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index 61bee5b479b..f12148c9ef8 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -47,6 +47,8 @@ cudf::data_type ast_operator_return_type(ast_operator op, */ cudf::size_type ast_operator_arity(ast_operator op); +std::string_view ast_operator_string(ast_operator op); + } // namespace ast::detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/ast/detail/possibly_null.cuh b/cpp/include/cudf/ast/detail/possibly_null.cuh new file mode 100644 index 00000000000..d30b914666f --- /dev/null +++ b/cpp/include/cudf/ast/detail/possibly_null.cuh @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace CUDF_EXPORT cudf { + +namespace ast::detail { + +// Type trait for wrapping nullable types in a cuda::std::optional. Non-nullable +// types are returned as is. +template +struct possibly_null_value; + +template +struct possibly_null_value { + using type = cuda::std::optional; +}; + +template +struct possibly_null_value { + using type = T; +}; + +template +using possibly_null_value_t = typename possibly_null_value::type; + +} // namespace ast::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp index 4c5601be856..9b273b87231 100644 --- a/cpp/include/cudf/ast/expressions.hpp +++ b/cpp/include/cudf/ast/expressions.hpp @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include #include @@ -26,6 +27,29 @@ #include namespace CUDF_EXPORT cudf { + +namespace detail { +namespace row_ir { + +/** + * @brief The base class for all IR nodes + * + * This class defines the interface for IR nodes, which can be instantiated and used to generate + * code. Each IR node represents a specific operation or value in the program. They represent a + * single-static-assignment (SSA) variable in the program IR. It is separate from the AST as it + * contains more detailed program information and analysis that would be needed to instantiate the + * program and generate correct and robust code. + */ +struct node; + +/** + * @brief A converter that converts AST expressions to IR nodes and CUDA UDFs. + */ +struct ast_converter; + +} // namespace row_ir +} // namespace detail + namespace ast { /** * @addtogroup expressions @@ -63,6 +87,15 @@ struct expression { virtual std::reference_wrapper accept( detail::expression_transformer& visitor) const = 0; + /** + * @brief Accepts an `row_ir::ast_converter` class. 
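Reviewer note: the new `possibly_null.cuh` header above carries the `possibly_null_value` trait out of `operators.cuh`. A minimal host-side analogue of what the trait does, using `std::optional` in place of `cuda::std::optional`; illustrative only, not the cudf header:

#include <optional>
#include <type_traits>

// Nullable values are wrapped in an optional, non-nullable values keep their type.
template <typename T, bool has_nulls>
struct possibly_null_value {
  using type = std::conditional_t<has_nulls, std::optional<T>, T>;
};

template <typename T, bool has_nulls>
using possibly_null_value_t = typename possibly_null_value<T, has_nulls>::type;

static_assert(std::is_same_v<possibly_null_value_t<int, true>, std::optional<int>>);
static_assert(std::is_same_v<possibly_null_value_t<int, false>, int>);

int main() { return 0; }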
+ * + * @param visitor The `row_ir::ast_converter` converting this expression tree + * @return The IR node representing this expression + */ + [[nodiscard]] virtual std::unique_ptr accept( + cudf::detail::row_ir::ast_converter& visitor) const = 0; + /** * @brief Returns true if the expression may evaluate to null. * @@ -90,73 +123,6 @@ struct expression { virtual ~expression() {} }; -/** - * @brief Enum of supported operators. - */ -enum class ast_operator : int32_t { - // Binary operators - ADD, ///< operator + - SUB, ///< operator - - MUL, ///< operator * - DIV, ///< operator / using common type of lhs and rhs - TRUE_DIV, ///< operator / after promoting type to floating point - FLOOR_DIV, ///< operator / after promoting to 64 bit floating point and then - ///< flooring the result - MOD, ///< operator % - PYMOD, ///< operator % using Python's sign rules for negatives - POW, ///< lhs ^ rhs - EQUAL, ///< operator == - NULL_EQUAL, ///< operator == with Spark rules: NULL_EQUAL(null, null) is true, NULL_EQUAL(null, - ///< valid) is false, and - ///< NULL_EQUAL(valid, valid) == EQUAL(valid, valid) - NOT_EQUAL, ///< operator != - LESS, ///< operator < - GREATER, ///< operator > - LESS_EQUAL, ///< operator <= - GREATER_EQUAL, ///< operator >= - BITWISE_AND, ///< operator & - BITWISE_OR, ///< operator | - BITWISE_XOR, ///< operator ^ - LOGICAL_AND, ///< operator && - NULL_LOGICAL_AND, ///< operator && with Spark rules: NULL_LOGICAL_AND(null, null) is null, - ///< NULL_LOGICAL_AND(null, true) is - ///< null, NULL_LOGICAL_AND(null, false) is false, and NULL_LOGICAL_AND(valid, - ///< valid) == LOGICAL_AND(valid, valid) - LOGICAL_OR, ///< operator || - NULL_LOGICAL_OR, ///< operator || with Spark rules: NULL_LOGICAL_OR(null, null) is null, - ///< NULL_LOGICAL_OR(null, true) is true, - ///< NULL_LOGICAL_OR(null, false) is null, and NULL_LOGICAL_OR(valid, valid) == - ///< LOGICAL_OR(valid, valid) - // Unary operators - IDENTITY, ///< Identity function - IS_NULL, ///< Check if operand is null - SIN, ///< Trigonometric sine - COS, ///< Trigonometric cosine - TAN, ///< Trigonometric tangent - ARCSIN, ///< Trigonometric sine inverse - ARCCOS, ///< Trigonometric cosine inverse - ARCTAN, ///< Trigonometric tangent inverse - SINH, ///< Hyperbolic sine - COSH, ///< Hyperbolic cosine - TANH, ///< Hyperbolic tangent - ARCSINH, ///< Hyperbolic sine inverse - ARCCOSH, ///< Hyperbolic cosine inverse - ARCTANH, ///< Hyperbolic tangent inverse - EXP, ///< Exponential (base e, Euler number) - LOG, ///< Natural Logarithm (base e) - SQRT, ///< Square-root (x^0.5) - CBRT, ///< Cube-root (x^(1.0/3)) - CEIL, ///< Smallest integer value not less than arg - FLOOR, ///< largest integer value not greater than arg - ABS, ///< Absolute value - RINT, ///< Rounds the floating-point argument arg to an integer value - BIT_INVERT, ///< Bitwise Not (~) - NOT, ///< Logical Not (!) - CAST_TO_INT64, ///< Cast value to int64_t - CAST_TO_UINT64, ///< Cast value to uint64_t - CAST_TO_FLOAT64 ///< Cast value to double -}; - /** * @brief Enum of table references. * @@ -317,6 +283,13 @@ class literal : public expression { */ [[nodiscard]] generic_scalar_device_view get_value() const { return value; } + /** + * @brief Get the scalar. 
+ * + * @return The scalar object + */ + [[nodiscard]] cudf::scalar const& get_scalar() const { return scalar; } + /** * @copydoc expression::accept */ @@ -328,6 +301,12 @@ class literal : public expression { std::reference_wrapper accept( detail::expression_transformer& visitor) const override; + /** + * @copydoc expression::accept + */ + [[nodiscard]] std::unique_ptr accept( + cudf::detail::row_ir::ast_converter& visitor) const override; + [[nodiscard]] bool may_evaluate_null(table_view const& left, table_view const& right, rmm::cuda_stream_view stream) const override @@ -434,6 +413,12 @@ class column_reference : public expression { return (table_source == table_reference::LEFT ? left : right).column(column_index).has_nulls(); } + /** + * @copydoc expression::accept + */ + [[nodiscard]] std::unique_ptr accept( + cudf::detail::row_ir::ast_converter& visitor) const override; + private: cudf::size_type column_index; table_reference table_source; @@ -500,6 +485,12 @@ class operation : public expression { table_view const& right, rmm::cuda_stream_view stream) const override; + /** + * @copydoc expression::accept + */ + [[nodiscard]] std::unique_ptr accept( + cudf::detail::row_ir::ast_converter& visitor) const override; + private: ast_operator op; std::vector> operands; @@ -543,6 +534,12 @@ class column_name_reference : public expression { return true; } + /** + * @copydoc expression::accept + */ + [[nodiscard]] std::unique_ptr accept( + cudf::detail::row_ir::ast_converter& visitor) const override; + private: std::string column_name; }; @@ -640,5 +637,4 @@ class tree { /** @} */ // end of group } // namespace ast - } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 11180e8c339..2c6a14739ea 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -17,7 +17,6 @@ #include #include -#include #include #include #include diff --git a/cpp/include/cudf/context.hpp b/cpp/include/cudf/context.hpp index f9a9011e79d..60062e8ab05 100644 --- a/cpp/include/cudf/context.hpp +++ b/cpp/include/cudf/context.hpp @@ -18,11 +18,44 @@ #include +#include +#include + namespace CUDF_EXPORT cudf { -/// @brief initialize the cudf global context +/// @brief Flags for controlling initialization steps +enum class init_flags : std::uint32_t { + /// @brief Load the nvCOMP library during initialization + LOAD_NVCOMP = 1 << 0, + /// @brief Initialize the JIT program cache during initialization + INIT_JIT_CACHE = 1 << 1, + /// @brief All initialization steps (default behavior) + ALL = LOAD_NVCOMP | INIT_JIT_CACHE +}; + +/// @brief Bitwise OR operator for init_flags +/// @param lhs The left-hand side of the operator +/// @param rhs The right-hand side of the operator +/// @return The result of the bitwise OR operation +constexpr init_flags operator|(init_flags lhs, init_flags rhs) noexcept +{ + using underlying_t = std::underlying_type_t; + return static_cast(static_cast(lhs) | static_cast(rhs)); +} + +/// @brief Check if a flag is set +/// @param flags The flags to check against +/// @param flag The specific flag to check for +/// @return true if all bits in `flag` are set in `flags`, false otherwise +constexpr bool has_flag(init_flags flags, init_flags flag) noexcept +{ + return (flags | flag) == flags; +} + +/// @brief Initialize the cudf global context +/// @param flags Optional flags to control which initialization steps to perform. 
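Reviewer note: `has_flag` above uses the identity that `(flags | flag) == flags` holds exactly when every bit of `flag` is already present in `flags`, so no `operator&` is needed. A self-contained usage sketch that mirrors the enum and helpers defined in this header (standalone copy for demonstration, not the installed header):

#include <cstdint>
#include <type_traits>
#include <cassert>

enum class init_flags : std::uint32_t {
  LOAD_NVCOMP    = 1u << 0,
  INIT_JIT_CACHE = 1u << 1,
  ALL            = LOAD_NVCOMP | INIT_JIT_CACHE
};

constexpr init_flags operator|(init_flags lhs, init_flags rhs) noexcept
{
  using u = std::underlying_type_t<init_flags>;
  return static_cast<init_flags>(static_cast<u>(lhs) | static_cast<u>(rhs));
}

// True when all bits of `flag` are set in `flags`: OR-ing them in changes nothing.
constexpr bool has_flag(init_flags flags, init_flags flag) noexcept
{
  return (flags | flag) == flags;
}

int main()
{
  auto const flags = init_flags::LOAD_NVCOMP | init_flags::INIT_JIT_CACHE;
  assert(has_flag(flags, init_flags::ALL));
  assert(!has_flag(init_flags::INIT_JIT_CACHE, init_flags::LOAD_NVCOMP));
  return 0;
}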
/// @throws std::runtime_error if the context is already initialized -void initialize(); +void initialize(init_flags flags = init_flags::INIT_JIT_CACHE); /// @brief de-initialize the cudf global context /// @throws std::runtime_error if the context is already de-initialized diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index 00bdb229391..92f1aeea572 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -78,6 +78,10 @@ struct corresponding_operator { using type = DeviceSum; }; template <> +struct corresponding_operator { + using type = DeviceSum; +}; +template <> struct corresponding_operator { using type = DeviceProduct; }; @@ -86,6 +90,10 @@ struct corresponding_operator { using type = DeviceSum; }; template <> +struct corresponding_operator { + using type = DeviceSum; +}; +template <> struct corresponding_operator { using type = DeviceSum; }; diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 49ad841cc33..f71e9ec3262 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -39,6 +40,8 @@ class simple_aggregations_collector { // Declares the interface for the simple aggregation const& agg); virtual std::vector> visit(data_type col_type, class sum_aggregation const& agg); + virtual std::vector> visit( + data_type col_type, class sum_with_overflow_aggregation const& agg); virtual std::vector> visit(data_type col_type, class product_aggregation const& agg); virtual std::vector> visit(data_type col_type, @@ -116,6 +119,7 @@ class aggregation_finalizer { // Declares the interface for the finalizer // Declare overloads for each kind of a agg to dispatch virtual void visit(aggregation const& agg); virtual void visit(class sum_aggregation const& agg); + virtual void visit(class sum_with_overflow_aggregation const& agg); virtual void visit(class product_aggregation const& agg); virtual void visit(class min_aggregation const& agg); virtual void visit(class max_aggregation const& agg); @@ -177,6 +181,28 @@ class sum_aggregation final : public rolling_aggregation, void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; +/** + * @brief Derived class for specifying a sum_with_overflow aggregation + */ +class sum_with_overflow_aggregation final : public groupby_aggregation, + public groupby_scan_aggregation, + public reduce_aggregation, + public segmented_reduce_aggregation { + public: + sum_with_overflow_aggregation() : aggregation(SUM_WITH_OVERFLOW) {} + + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + /** * @brief Derived class for specifying a product aggregation */ @@ -253,7 +279,8 @@ class max_aggregation final : public rolling_aggregation, */ class count_aggregation final : public rolling_aggregation, public groupby_aggregation, - public groupby_scan_aggregation { + public groupby_scan_aggregation, + public reduce_aggregation { public: count_aggregation(aggregation::Kind kind) : 
aggregation(kind) {} @@ -1352,11 +1379,12 @@ constexpr bool is_sum_product_agg(aggregation::Kind k) (k == aggregation::SUM_OF_SQUARES); } -// Summing/Multiplying integers of any type, always use int64_t accumulator +// Summing/Multiplying integers of any type, always use int64_t accumulator (except +// SUM_WITH_OVERFLOW which has its own template) template -struct target_type_impl && is_sum_product_agg(k)>> { + requires(std::is_integral_v && is_sum_product_agg(k) && + k != aggregation::SUM_WITH_OVERFLOW) +struct target_type_impl { using type = int64_t; }; @@ -1369,12 +1397,12 @@ struct target_type_impl< using type = Source; }; -// Summing/Multiplying float/doubles, use same type accumulator +// Summing/Multiplying float/doubles, use same type accumulator (except SUM_WITH_OVERFLOW which has +// its own template) template -struct target_type_impl< - Source, - k, - std::enable_if_t && is_sum_product_agg(k)>> { + requires(std::is_floating_point_v && is_sum_product_agg(k) && + k != aggregation::SUM_WITH_OVERFLOW) +struct target_type_impl { using type = Source; }; @@ -1386,6 +1414,12 @@ struct target_type_impl +struct target_type_impl { + using type = struct_view; // SUM_WITH_OVERFLOW outputs a struct with sum and overflow fields +}; + // Always use `double` for M2 template struct target_type_impl { @@ -1483,12 +1517,14 @@ struct target_type_impl { // Always use list for MERGE_LISTS template + requires cuda::std::is_same_v struct target_type_impl { using type = list_view; }; // Always use list for MERGE_SETS template + requires cuda::std::is_same_v struct target_type_impl { using type = list_view; }; @@ -1599,6 +1635,8 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind switch (k) { case aggregation::SUM: return f.template operator()(std::forward(args)...); + case aggregation::SUM_WITH_OVERFLOW: + return f.template operator()(std::forward(args)...); case aggregation::PRODUCT: return f.template operator()(std::forward(args)...); case aggregation::MIN: diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh index 3af5afd20cd..0c5b57c51a7 100644 --- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh +++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh @@ -25,6 +25,7 @@ #include #include +#include #include namespace cudf::detail { @@ -154,6 +155,45 @@ struct update_target_element { } }; +template + requires(cuda::std::is_same_v) +struct update_target_element { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + // For SUM_WITH_OVERFLOW, target is a struct with sum value at child(0) and overflow flag at + // child(1) + auto sum_column = target.child(0); + auto overflow_column = target.child(1); + + auto const source_value = source.element(source_index); + auto const old_sum = + cudf::detail::atomic_add(&sum_column.element(target_index), source_value); + + // Early exit if overflow is already set to avoid unnecessary overflow checking + auto bool_ref = cuda::atomic_ref{ + *(overflow_column.data() + target_index)}; + if (bool_ref.load(cuda::memory_order_relaxed)) { return; } + + // Check for overflow before performing the addition to avoid UB + // For positive overflow: old_sum > 0, source_value > 0, and old_sum > max - source_value + // For negative overflow: old_sum < 0, source_value < 0, and old_sum < min - source_value + // TODO: to be replaced by CCCL 
equivalents once https://github.com/NVIDIA/cccl/pull/3755 is + // ready + auto constexpr int64_max = cuda::std::numeric_limits::max(); + auto constexpr int64_min = cuda::std::numeric_limits::min(); + auto const overflow = + ((old_sum > 0 && source_value > 0 && old_sum > int64_max - source_value) || + (old_sum < 0 && source_value < 0 && old_sum < int64_min - source_value)); + if (overflow) { + // Atomically set overflow flag to true (use atomic_max since true > false) + cudf::detail::atomic_max(&overflow_column.element(target_index), true); + } + } +}; + /** * @brief Function object to update a single element in a target column using * the dictionary key addressed by the specific index. diff --git a/cpp/include/cudf/detail/join/distinct_filtered_join.cuh b/cpp/include/cudf/detail/join/distinct_filtered_join.cuh new file mode 100644 index 00000000000..087f5ce3816 --- /dev/null +++ b/cpp/include/cudf/detail/join/distinct_filtered_join.cuh @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace cudf { +namespace detail { + +/** + * @brief Implementation of filtered join using set hash tables + * + * This class extends the base filtered_join to implement join operations + * using set semantics, where duplicate keys are not allowed in the hash table. + * This implementation is more memory efficient when the same filter table (right table) + * is to be reused for multiple semi/anti join operations. + */ +class distinct_filtered_join : public filtered_join { + private: + /** + * @brief Performs either a semi or anti join based on the specified kind + * + * @param probe The table to probe the hash table with + * @param kind The kind of join to perform (SEMI or ANTI) + * @param stream CUDA stream on which to perform operations + * @param mr Memory resource for allocations + * @return Device vector of indices representing the join result + */ + std::unique_ptr> semi_anti_join( + cudf::table_view const& probe, + join_kind kind, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + + /** + * @brief Core implementation for querying the hash table + * + * Performs the actual hash table query operation for both semi and anti joins + * using set semantics. 
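Reviewer note: the SUM_WITH_OVERFLOW aggregator above checks the bounds before performing the addition so that the signed add itself can never overflow (which would be UB). A host-side sketch of that predicate; illustrative only, not libcudf code:

#include <cstdint>
#include <limits>
#include <cassert>

// True if sum + value would overflow a signed 64-bit accumulator.
bool add_would_overflow(std::int64_t sum, std::int64_t value)
{
  constexpr auto int64_max = std::numeric_limits<std::int64_t>::max();
  constexpr auto int64_min = std::numeric_limits<std::int64_t>::min();
  return (sum > 0 && value > 0 && sum > int64_max - value) ||   // positive overflow
         (sum < 0 && value < 0 && sum < int64_min - value);     // negative overflow
}

int main()
{
  constexpr auto int64_max = std::numeric_limits<std::int64_t>::max();
  assert(add_would_overflow(int64_max, 1));
  assert(!add_would_overflow(int64_max - 1, 1));
  assert(add_would_overflow(std::numeric_limits<std::int64_t>::min(), -1));
  return 0;
}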
+ * + * @tparam CGSize CUDA cooperative group size + * @tparam Ref Reference type for the hash table + * @param probe The table to probe the hash table with + * @param preprocessed_probe Preprocessed probe table for row operators + * @param kind The kind of join to perform + * @param query_ref Reference to the hash table for querying + * @param stream CUDA stream on which to perform operations + * @param mr Memory resource for allocations + * @return Device vector of indices representing the join result + */ + template + std::unique_ptr> query_build_table( + cudf::table_view const& probe, + std::shared_ptr preprocessed_probe, + join_kind kind, + Ref query_ref, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + + public: + /** + * @brief Constructor for filtered join with set + * + * @param build The table to build the hash table from + * @param compare_nulls How null values should be compared + * @param load_factor Target load factor for the hash table + * @param stream CUDA stream on which to perform operations + */ + distinct_filtered_join(cudf::table_view const& build, + cudf::null_equality compare_nulls, + double load_factor, + rmm::cuda_stream_view stream); + + /** + * @brief Implementation of semi join for set + * + * Returns indices of probe table rows that have matching keys in the build table. + * + * @param probe The table to probe the hash table with + * @param stream CUDA stream on which to perform operations + * @param mr Memory resource for allocations + * @return Device vector of indices representing the join result + */ + std::unique_ptr> semi_join( + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) override; + + /** + * @brief Implementation of anti join for set + * + * Returns indices of probe table rows that do not have matching keys in the build table. 
+ * + * @param probe The table to probe the hash table with + * @param stream CUDA stream on which to perform operations + * @param mr Memory resource for allocations + * @return Device vector of indices representing the join result + */ + std::unique_ptr> anti_join( + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) override; +}; + +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/detail/join/distinct_hash_join.cuh b/cpp/include/cudf/detail/join/distinct_hash_join.cuh index 3da903dc415..a50675c156e 100644 --- a/cpp/include/cudf/detail/join/distinct_hash_join.cuh +++ b/cpp/include/cudf/detail/join/distinct_hash_join.cuh @@ -15,8 +15,8 @@ */ #pragma once +#include #include -#include #include #include @@ -32,8 +32,8 @@ namespace cudf::detail { -using cudf::experimental::row::lhs_index_type; -using cudf::experimental::row::rhs_index_type; +using cudf::detail::row::lhs_index_type; +using cudf::detail::row::rhs_index_type; /** * @brief A custom comparator used for the build table insertion @@ -170,7 +170,7 @@ class distinct_hash_join { bool _has_nested_columns; ///< True if nested columns are present in build and probe tables cudf::null_equality _nulls_equal; ///< Whether to consider nulls as equal cudf::table_view _build; ///< Input table to build the hash map - std::shared_ptr + std::shared_ptr _preprocessed_build; ///< Input table preprocssed for row operators hash_table_type _hash_table; ///< Hash table built on `_build` }; diff --git a/cpp/include/cudf/detail/join/filtered_join.cuh b/cpp/include/cudf/detail/join/filtered_join.cuh new file mode 100644 index 00000000000..814763bf254 --- /dev/null +++ b/cpp/include/cudf/detail/join/filtered_join.cuh @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +namespace cudf { +namespace detail { + +using cudf::detail::row::lhs_index_type; +using cudf::detail::row::rhs_index_type; + +/** + * @brief Base class providing common functionality for filtered join operations. + * + * This abstract class implements the core components needed for hash-based semi + * and anti join operations. 
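Reviewer note: the semi/anti join semantics documented above boil down to keeping probe-row indices that do (semi) or do not (anti) find a key in the build table. A conceptual host-side sketch with a plain hash set, single integer keys, and no null handling; the real implementation uses GPU hash tables and row operators, so this is illustrative only:

#include <cstddef>
#include <unordered_set>
#include <vector>
#include <cassert>

std::vector<std::size_t> filtered_join_indices(std::vector<int> const& build,
                                               std::vector<int> const& probe,
                                               bool keep_matches /* true = semi, false = anti */)
{
  std::unordered_set<int> build_keys(build.begin(), build.end());
  std::vector<std::size_t> result;
  for (std::size_t i = 0; i < probe.size(); ++i) {
    bool const found = build_keys.count(probe[i]) > 0;
    if (found == keep_matches) { result.push_back(i); }  // keep matching rows for semi, non-matching for anti
  }
  return result;
}

int main()
{
  std::vector<int> const build{1, 2, 3};
  std::vector<int> const probe{0, 2, 4, 3};
  assert((filtered_join_indices(build, probe, true) == std::vector<std::size_t>{1, 3}));   // semi join
  assert((filtered_join_indices(build, probe, false) == std::vector<std::size_t>{0, 2}));  // anti join
  return 0;
}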
+ */ +class filtered_join { + public: + /** + * @brief Properties of the build table used in the join operation + */ + struct build_properties { + bool has_nested_columns; ///< True if the build table contains nested columns + }; + + /** + * @brief Adapter for insertion operations in the hash table + * + * Returns result of self comparator passed + */ + template + struct insertion_adapter { + insertion_adapter(T const& _c) : _comparator{_c} {} + __device__ constexpr bool operator()( + cuco::pair const& lhs, + cuco::pair const& rhs) const noexcept + { + if (lhs.first != rhs.first) { return false; } + auto const lhs_index = static_cast(lhs.second); + auto const rhs_index = static_cast(rhs.second); + return _comparator(lhs_index, rhs_index); + } + + private: + T _comparator; + }; + + /** + * @brief Adapter for extracting hash values from key-value pairs + */ + struct hash_extract_fn { + template + __device__ constexpr hash_value_type operator()( + cuco::pair const& key) const noexcept + { + return key.first; + } + }; + + /** + * @brief Adapter for generating key-value pairs from indices + * + * @tparam T Index type + * @tparam Hasher Hash function type + */ + template + struct key_pair_fn { + CUDF_HOST_DEVICE constexpr key_pair_fn(Hasher const& hasher) : _hasher{hasher} {} + + __device__ __forceinline__ auto operator()(size_type i) const noexcept + { + return cuco::pair{_hasher(i), T{i}}; + } + + private: + Hasher _hasher; + }; + + /** + * @brief Adapter for comparing key-value pairs + * + * Compares hash values first for performance, then uses the provided equality comparator + * + * @tparam Equal Equality comparator type + */ + template + struct comparator_adapter { + comparator_adapter(Equal const& d_equal) : _d_equal{d_equal} {} + + __device__ constexpr auto operator()( + cuco::pair const& rhs, + cuco::pair const& lhs) const noexcept + { + if (lhs.first != rhs.first) { return false; } + return _d_equal(lhs.second, rhs.second); + } + + private: + Equal _d_equal; + }; + + /** + * @brief Constructor for filtered_join base class + * + * Initializes the hash table with the build table and prepares it for join operations. 
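Reviewer note: `comparator_adapter` above stores (hash, index) pairs and rejects on a hash mismatch before paying for the full row comparison. A host-side sketch of the same "compare hashes first, then fall back to equality" pattern with strings standing in for rows; illustrative only, not libcudf code:

#include <cstdint>
#include <utility>
#include <string>
#include <functional>
#include <cassert>

using hash_value_type = std::uint32_t;

template <typename Equal>
struct comparator_adapter {
  explicit comparator_adapter(Equal equal) : _equal{std::move(equal)} {}

  bool operator()(std::pair<hash_value_type, std::string> const& lhs,
                  std::pair<hash_value_type, std::string> const& rhs) const
  {
    if (lhs.first != rhs.first) { return false; }  // cheap reject on hash mismatch
    return _equal(lhs.second, rhs.second);         // full comparison only when hashes agree
  }

 private:
  Equal _equal;
};

int main()
{
  auto hash = [](std::string const& s) {
    return static_cast<hash_value_type>(std::hash<std::string>{}(s));
  };
  comparator_adapter<std::equal_to<std::string>> cmp{{}};
  std::string const a = "alpha", b = "alpha", c = "beta";
  assert(cmp({hash(a), a}, {hash(b), b}));
  assert(!cmp({hash(a), a}, {hash(c), c}));
  return 0;
}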
+ * + * @param build The table to build the hash table from + * @param compare_nulls How null values should be compared + * @param load_factor Target load factor for the hash table + * @param stream CUDA stream on which to perform operations + */ + filtered_join(cudf::table_view const& build, + cudf::null_equality compare_nulls, + double load_factor, + rmm::cuda_stream_view stream); + + /** + * Virtual semi join function overridden in derived classes + */ + virtual std::unique_ptr> semi_join( + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) = 0; + + /** + * Virtual anti join function overridden in derived classes + */ + virtual std::unique_ptr> anti_join( + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) = 0; + + /** + * Virtual abstract base class destructor + */ + virtual ~filtered_join() = default; + + protected: + // Key type used in the hash table + using key = cuco::pair; + + // Storage type for the hash table buckets + using storage_type = + cuco::bucket_storage, + cudf::detail::cuco_allocator>; + + // Hasher for primitive row types + using primitive_row_hasher = + cudf::detail::row::primitive::row_hasher; + // Linear probing scheme with bucket size 1 for primitive types + using primitive_probing_scheme = cuco::linear_probing<1, hash_extract_fn>; + // Equality comparator for primitive rows + using primitive_row_comparator = cudf::detail::row::primitive::row_equality_comparator; + + // Hasher for complex row types with compile-time null handling + using row_hasher = + cudf::detail::row::hash::device_row_hasher; + // Linear probing scheme with bucket size 4 for nested data structures + using nested_probing_scheme = cuco::linear_probing<4, hash_extract_fn>; + // Linear probing scheme with bucket size 1 for simple data + using simple_probing_scheme = cuco::linear_probing<1, hash_extract_fn>; + // Equality comparator for complex rows with null handling and NaN comparison + using row_comparator = cudf::detail::row::equality::device_row_comparator< + true, + cudf::nullate::YES, + cudf::detail::row::equality::nan_equal_physical_equality_comparator>; + + storage_type _bucket_storage; ///< Storage for hash table buckets + + // Empty sentinel key used to mark empty slots in the hash table + static constexpr auto empty_sentinel_key = cuco::empty_key{ + cuco::pair{std::numeric_limits::max(), lhs_index_type{JoinNoneValue}}}; + build_properties _build_props; ///< Properties of the build table + cudf::table_view _build; ///< input table to build the hash map + cudf::null_equality const _nulls_equal; ///< whether to consider nulls as equal + std::shared_ptr + _preprocessed_build; ///< input table preprocssed for row operators + + /** + * @brief Populates the hash table with the build table + * + * @tparam CGSize CUDA cooperative group size + * @tparam Ref Reference type for the hash table + * @param insert_ref Reference to the hash table for insertion + * @param stream CUDA stream on which to perform operations + */ + template + void insert_build_table(Ref const& insert_ref, rmm::cuda_stream_view stream); + + private: + /** + * @brief Calculates the required storage size for the hash table + * + * Computes the appropriate size for the bucket storage based on the input + * table size and desired load factor. 
+ * + * @param tbl Table for which to calculate storage + * @param load_factor Target load factor for the hash table + * @return Calculated bucket storage size + */ + auto compute_bucket_storage_size(cudf::table_view tbl, double load_factor); +}; + +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/detail/join/hash_join.cuh b/cpp/include/cudf/detail/join/hash_join.cuh index c71e1548d3d..99f11126999 100644 --- a/cpp/include/cudf/detail/join/hash_join.cuh +++ b/cpp/include/cudf/detail/join/hash_join.cuh @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -35,7 +36,7 @@ #include // Forward declaration -namespace cudf::experimental::row::equality { +namespace cudf::detail::row::equality { class preprocessed_table; } @@ -107,7 +108,7 @@ struct hash_join { bool const _has_nulls; ///< true if nulls are present in either build table or any probe table cudf::null_equality const _nulls_equal; ///< whether to consider nulls as equal cudf::table_view _build; ///< input table to build the hash map - std::shared_ptr + std::shared_ptr _preprocessed_build; ///< input table preprocssed for row operators hash_table_t _hash_table; ///< hash table built on `_build` @@ -188,7 +189,36 @@ struct hash_join { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; + /** + * @copydoc cudf::hash_join::inner_join_match_context + */ + [[nodiscard]] cudf::join_match_context inner_join_match_context( + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; + + /** + * @copydoc cudf::hash_join::left_join_match_context + */ + [[nodiscard]] cudf::join_match_context left_join_match_context( + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; + + /** + * @copydoc cudf::hash_join::full_join_match_context + */ + [[nodiscard]] cudf::join_match_context full_join_match_context( + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; + private: + template + void compute_match_counts(cudf::table_view const& probe, + OutputIterator output_iter, + rmm::cuda_stream_view stream) const; + /** * @brief Probes the `_hash_table` built from `_build` for tuples in `probe_table`, * and returns the output indices of `build_table` and `probe_table` as a combined table, diff --git a/cpp/include/cudf/detail/row_operator/common_utils.cuh b/cpp/include/cudf/detail/row_operator/common_utils.cuh new file mode 100644 index 00000000000..794b2300bf7 --- /dev/null +++ b/cpp/include/cudf/detail/row_operator/common_utils.cuh @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cudf::detail { + +/** + * @brief Result type of comparison operations. + * + * Indicates how two elements `a` and `b` compare with one and another. 
diff --git a/cpp/include/cudf/detail/row_operator/common_utils.cuh b/cpp/include/cudf/detail/row_operator/common_utils.cuh
new file mode 100644
index 00000000000..794b2300bf7
--- /dev/null
+++ b/cpp/include/cudf/detail/row_operator/common_utils.cuh
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/types.hpp>
+
+#include <cuda/std/type_traits>
+
+namespace cudf::detail {
+
+/**
+ * @brief Result type of comparison operations.
+ *
+ * Indicates how two elements `a` and `b` compare with one another.
+ *
+ * Equivalence is defined as `not (a<b) and not (b<a)`. Elements that are
+ * EQUIVALENT may not necessarily be *equal*.
+ */
+enum class weak_ordering {
+  LESS,        ///< Indicates `a` is less than (ordered before) `b`
+  EQUIVALENT,  ///< Indicates `a` is ordered neither before nor after `b`
+  GREATER      ///< Indicates `a` is greater than (ordered after) `b`
+};
+
+/**
+ * @brief Compare the elements ordering with respect to `lhs`.
+ *
+ * @param lhs The first element
+ * @param rhs The second element
+ * @return Indicates the relationship between the elements
+ */
+template <typename Element>
+__device__ weak_ordering compare_elements(Element lhs, Element rhs)
+{
+  if (lhs < rhs) {
+    return weak_ordering::LESS;
+  } else if (rhs < lhs) {
+    return weak_ordering::GREATER;
+  }
+  return weak_ordering::EQUIVALENT;
+}
+
+/**
+ * @brief A specialization for floating-point `Element` type relational comparison
+ * to derive the order of the elements with respect to `lhs`.
+ *
+ * This specialization handles `nan` in the following order:
+ * `[-Inf, -ve, 0, -0, +ve, +Inf, NaN, NaN, null] (for null_order::AFTER)`
+ * `[null, -Inf, -ve, 0, -0, +ve, +Inf, NaN, NaN] (for null_order::BEFORE)`
+ *
+ * @param lhs The first element
+ * @param rhs The second element
+ * @return Indicates the relationship between the elements
+ */
+template <typename Element>
+__device__ weak_ordering relational_compare(Element lhs, Element rhs)
+  requires(cuda::std::is_floating_point_v<Element>)
+{
+  if (isnan(lhs) and isnan(rhs)) {
+    return weak_ordering::EQUIVALENT;
+  } else if (isnan(rhs)) {
+    return weak_ordering::LESS;
+  } else if (isnan(lhs)) {
+    return weak_ordering::GREATER;
+  }
+
+  return detail::compare_elements(lhs, rhs);
+}
+
+/**
+ * @brief Compare the nulls according to null order.
+ *
+ * @param lhs_is_null boolean representing if lhs is null
+ * @param rhs_is_null boolean representing if rhs is null
+ * @param null_precedence null order
+ * @return Indicates the relationship between the nulls in the lhs and rhs columns.
+ */
+inline __device__ auto null_compare(bool lhs_is_null, bool rhs_is_null, null_order null_precedence)
+{
+  if (lhs_is_null and rhs_is_null) {  // null <? null
+    return weak_ordering::EQUIVALENT;
+  } else if (lhs_is_null) {  // null <? x
+    return (null_precedence == null_order::BEFORE) ? weak_ordering::LESS : weak_ordering::GREATER;
+  } else if (rhs_is_null) {  // x <? null
+    return (null_precedence == null_order::AFTER) ? weak_ordering::LESS : weak_ordering::GREATER;
+  }
+  return weak_ordering::EQUIVALENT;
+}
+
+/**
+ * @brief A specialization for non-floating-point `Element` type relational comparison
+ * to derive the order of the elements with respect to `lhs`.
+ *
+ * @param lhs The first element
+ * @param rhs The second element
+ * @return Indicates the relationship between the elements
+ */
+template <typename Element>
+__device__ weak_ordering relational_compare(Element lhs, Element rhs)
+  requires(not cuda::std::is_floating_point_v<Element>)
+{
+  return detail::compare_elements(lhs, rhs);
+}
+
+/**
+ * @brief A specialization for floating-point `Element` type to check if
+ * `lhs` is equivalent to `rhs`. `nan == nan`.
+ *
+ * @param lhs first element
+ * @param rhs second element
+ * @return `true` if `lhs` == `rhs` else `false`.
+ */
+template <typename Element>
+__device__ bool equality_compare(Element lhs, Element rhs)
+  requires(cuda::std::is_floating_point_v<Element>)
+{
+  if (isnan(lhs) and isnan(rhs)) { return true; }
+  return lhs == rhs;
+}
+
+/**
+ * @brief A specialization for non-floating-point `Element` type to check if
+ * `lhs` is equivalent to `rhs`.
+ *
+ * @param lhs first element
+ * @param rhs second element
+ * @return `true` if `lhs` == `rhs` else `false`.
+ */
+template <typename Element>
+__device__ bool equality_compare(Element const lhs, Element const rhs)
+  requires(not cuda::std::is_floating_point_v<Element>)
+{
+  return lhs == rhs;
+}
+
+}  // namespace cudf::detail
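The comparators in `common_utils.cuh` give floating-point columns a total order in which NaNs compare equivalent to one another and sort after every non-NaN value, while `equality_compare` treats `NaN == NaN` as true. The snippet below is a small host-side analogue of those rules for illustration only; it mirrors, but is not, the device code above:

```cpp
#include <cassert>
#include <cmath>
#include <limits>

enum class weak_ordering { LESS, EQUIVALENT, GREATER };

// Host-side sketch of the NaN-aware ordering: NaNs are mutually equivalent and
// order after every other value.
weak_ordering relational_compare_sketch(double lhs, double rhs)
{
  if (std::isnan(lhs) && std::isnan(rhs)) { return weak_ordering::EQUIVALENT; }
  if (std::isnan(rhs)) { return weak_ordering::LESS; }     // non-NaN lhs orders first
  if (std::isnan(lhs)) { return weak_ordering::GREATER; }  // NaN lhs orders last
  if (lhs < rhs) { return weak_ordering::LESS; }
  if (rhs < lhs) { return weak_ordering::GREATER; }
  return weak_ordering::EQUIVALENT;
}

int main()
{
  auto const nan = std::numeric_limits<double>::quiet_NaN();
  auto const inf = std::numeric_limits<double>::infinity();
  assert(relational_compare_sketch(inf, nan) == weak_ordering::LESS);        // +Inf before NaN
  assert(relational_compare_sketch(nan, nan) == weak_ordering::EQUIVALENT);  // NaN ~ NaN
  assert(relational_compare_sketch(nan, 0.0) == weak_ordering::GREATER);     // NaN after values
  return 0;
}
```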
diff --git a/cpp/include/cudf/detail/row_operator/primitive_row_operators.cuh b/cpp/include/cudf/detail/row_operator/primitive_row_operators.cuh
new file mode 100644
index 00000000000..4743e9fc645
--- /dev/null
+++ b/cpp/include/cudf/detail/row_operator/primitive_row_operators.cuh
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+namespace CUDF_EXPORT cudf {
+
+namespace detail {
+
+/**
+ * @brief Checks if a table is compatible with primitive row operations
+ *
+ * A table is compatible with primitive row operations if it contains exactly one column
+ * and that column contains only numeric data types.
+ *
+ * @param table The table to check for compatibility
+ * @return Boolean indicating if the table is compatible with primitive row operations
+ */
+bool is_primitive_row_op_compatible(cudf::table_view const& table);
+
+namespace row::primitive {
+
+/**
+ * @brief Returns `void` if it's not a primitive type
+ */
+template <typename T>
+using primitive_type_t = cuda::std::conditional_t<cudf::is_numeric<T>(), T, void>;
+
+/**
+ * @brief Custom dispatcher for primitive types
+ */
+template <cudf::type_id Id>
+struct dispatch_primitive_type {
+  using type = primitive_type_t<id_to_type<Id>>;  ///< The underlying type
+};
+
+/**
+ * @brief Performs an equality comparison between two elements in two columns.
+ */
+class element_equality_comparator {
+ public:
+  /**
+   * @brief Compares the specified elements for equality.
+   *
+   * @param lhs The first column
+   * @param rhs The second column
+   * @param lhs_element_index The index of the first element
+   * @param rhs_element_index The index of the second element
+   * @return True if the lhs and rhs elements are equal
+   */
+  template <typename Element, CUDF_ENABLE_IF(cudf::is_equality_comparable<Element, Element>())>
+  __device__ bool operator()(column_device_view const& lhs,
+                             column_device_view const& rhs,
+                             size_type lhs_element_index,
+                             size_type rhs_element_index) const
+  {
+    return cudf::detail::equality_compare(lhs.element<Element>(lhs_element_index),
+                                          rhs.element<Element>(rhs_element_index));
+  }
+
+  // @cond
+  template <typename Element, CUDF_ENABLE_IF(not cudf::is_equality_comparable<Element, Element>())>
+  __device__ bool operator()(column_device_view const&,
+                             column_device_view const&,
+                             size_type,
+                             size_type) const
+  {
+    CUDF_UNREACHABLE("Attempted to compare elements of uncomparable types.");
+  }
+  // @endcond
+};
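Mapping every non-primitive type to `void` is what routes unsupported types to the unreachable overload above at compile time. A standalone sketch of the same idea, using plain standard C++ traits rather than the cudf traits, is shown here for illustration:

```cpp
#include <string>
#include <type_traits>

// Maps non-arithmetic types to void, mirroring the primitive_type_t idea above.
template <typename T>
using primitive_like_t = std::conditional_t<std::is_arithmetic_v<T>, T, void>;

static_assert(std::is_same_v<primitive_like_t<int>, int>);
static_assert(std::is_same_v<primitive_like_t<double>, double>);
static_assert(std::is_same_v<primitive_like_t<std::string>, void>);  // routed to the unreachable path
```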
+
+/**
+ * @brief Performs an equality comparison between two rows in two tables.
+ */
+class row_equality_comparator {
+ public:
+  /**
+   * @brief Construct a new row equality comparator object
+   *
+   * @param has_nulls Indicates if either input column contains nulls
+   * @param lhs Preprocessed table containing the first element
+   * @param rhs Preprocessed table containing the second element (may be the same as lhs)
+   * @param nulls_are_equal Indicates if two null elements are treated as equivalent
+   */
+  row_equality_comparator(cudf::nullate::DYNAMIC const& has_nulls,
+                          std::shared_ptr<cudf::detail::row::equality::preprocessed_table> lhs,
+                          std::shared_ptr<cudf::detail::row::equality::preprocessed_table> rhs,
+                          null_equality nulls_are_equal)
+    : _has_nulls{has_nulls}, _lhs{*lhs}, _rhs{*rhs}, _nulls_are_equal{nulls_are_equal}
+  {
+    CUDF_EXPECTS(_lhs.num_columns() == _rhs.num_columns(), "Mismatched number of columns.");
+  }
+
+  /**
+   * @brief Compares the specified rows for equality.
+   *
+   * @param lhs_row_index The index of the first row to compare (in the lhs table)
+   * @param rhs_row_index The index of the second row to compare (in the rhs table)
+   * @return true if both rows are equal, otherwise false
+   */
+  __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const
+  {
+    auto equal_elements = [this, lhs_row_index, rhs_row_index](column_device_view const& l,
+                                                               column_device_view const& r) {
+      // Handle null comparison for each element
+      if (_has_nulls) {
+        bool const lhs_is_null{l.is_null(lhs_row_index)};
+        bool const rhs_is_null{r.is_null(rhs_row_index)};
+        if (lhs_is_null and rhs_is_null) {
+          return _nulls_are_equal == null_equality::EQUAL;
+        } else if (lhs_is_null != rhs_is_null) {
+          return false;
+        }
+      }
+
+      // Neither element is null (or nulls are absent); compare the underlying values
+      element_equality_comparator comparator;
+      return cudf::type_dispatcher<dispatch_primitive_type>(
+        l.type(), comparator, l, r, lhs_row_index, rhs_row_index);
+    };
+
+    return thrust::equal(thrust::seq, _lhs.begin(), _lhs.end(), _rhs.begin(), equal_elements);
+  }
+
+  /**
+   * @brief Compares the specified rows for equality.
+   *
+   * @param lhs_index The index of the first row to compare (in the lhs table)
+   * @param rhs_index The index of the second row to compare (in the rhs table)
+   * @return Boolean indicating if both rows are equal
+   */
+  __device__ bool operator()(cudf::detail::row::lhs_index_type lhs_index,
+                             cudf::detail::row::rhs_index_type rhs_index) const
+  {
+    return (*this)(static_cast<size_type>(lhs_index), static_cast<size_type>(rhs_index));
+  }
+
+ private:
+  cudf::nullate::DYNAMIC _has_nulls;
+  table_device_view _lhs;
+  table_device_view _rhs;
+  null_equality _nulls_are_equal;
+};
+
+/**
+ * @brief Function object for computing the hash value of a row in a column.
+ *
+ * @tparam Hash Hash functor to use for hashing elements
+ */
+template