From e3f1bb7e61d8f5a17bd6c0e76f0fb988837e9159 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 29 Sep 2025 03:32:53 +0000 Subject: [PATCH 1/4] Add a workaround to run PDS-H using WebHDFS --- cpp/src/io/utilities/datasource.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 22d728dd8c6..8af0db3e1c7 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -437,8 +437,16 @@ std::unique_ptr datasource::create(std::string const& filepath, } else if (use_memory_mapping) { return std::make_unique(filepath.c_str(), offset, max_size_estimate); } else { - // `file_source` reads the file directly, without memory mapping - return std::make_unique(filepath.c_str()); + auto* local_dir_pattern = getenv("LIBCUDF_WEBHDFS_LOCAL_DIR_PATTERN"); + auto* remote_dir_pattern = getenv("LIBCUDF_WEBHDFS_REMOTE_DIR_PATTERN"); + if (local_dir_pattern != nullptr and remote_dir_pattern != nullptr) { + auto remote_file_path = + std::regex_replace(filepath, std::regex{local_dir_pattern}, remote_dir_pattern); + return std::make_unique(remote_file_path.c_str()); + } else { + // `file_source` reads the file directly, without memory mapping + return std::make_unique(filepath.c_str()); + } } } From f2d18482f498600da0d49e9536a19844cc1e5ba5 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 29 Sep 2025 05:02:18 +0000 Subject: [PATCH 2/4] Skip pdfs scale factor validation --- .../cudf_polars/experimental/benchmarks/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py index 50043582944..dac8a5d51eb 100644 --- a/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py @@ -245,7 +245,15 @@ def from_args(cls, args: argparse.Namespace) -> RunConfig: except ValueError: scale_factor = float(scale_factor) - if "pdsh" in name and args.scale is not None: + skip_scale_factor_inference = ( + "LIBCUDF_WEBHDFS_LOCAL_DIR_PATTERN" in os.environ + ) and ("LIBCUDF_WEBHDFS_REMOTE_DIR_PATTERN" in os.environ) + + if ( + "pdsh" in name + and args.scale is not None + and skip_scale_factor_inference is False + ): # Validate the user-supplied scale factor sf_inf = _infer_scale_factor(name, path, args.suffix) rel_error = abs((scale_factor - sf_inf) / sf_inf) From 43426fbb96bc7268d1d38429fabba56249830f3b Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 29 Sep 2025 13:54:59 +0000 Subject: [PATCH 3/4] Update --- cpp/src/io/utilities/datasource.cpp | 23 +++++++++++++++---- .../experimental/benchmarks/utils.py | 4 ++-- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 8af0db3e1c7..dc046152379 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -437,11 +437,26 @@ std::unique_ptr datasource::create(std::string const& filepath, } else if (use_memory_mapping) { return std::make_unique(filepath.c_str(), offset, max_size_estimate); } else { - auto* local_dir_pattern = getenv("LIBCUDF_WEBHDFS_LOCAL_DIR_PATTERN"); - auto* remote_dir_pattern = getenv("LIBCUDF_WEBHDFS_REMOTE_DIR_PATTERN"); + // Reroute I/O: If the following two env vars are specified, the filepath for an existing local + // file will be modified (only in-memory, not affecting the original file) such that the first + // occurrence of local_dir_pattern is replaced by remote_dir_pattern, and a remote file resource + // will be used instead of a local file resource. + // + // For example, let "LIBCUDF_IO_REROUTE_LOCAL_DIR_PATTERN" be "/mnt/nvme/tmp", and + // "LIBCUDF_IO_REROUTE_REMOTE_DIR_PATTERN" be + // "http://example.com:9870/webhdfs/v1/home/ubuntu/data". If a local file with the name + // "/mnt/nvme/tmp/test.bin" exists, libcudf will create a remote file resource with the URL + // "http://example.com:9870/webhdfs/v1/home/ubuntu/data/test.bin" + // + // This feature can be used as a workaround for PDS-H benchmark using WebHDFS without the need + // for upstream Polars change. + auto* local_dir_pattern = std::getenv("LIBCUDF_IO_REROUTE_LOCAL_DIR_PATTERN"); + auto* remote_dir_pattern = std::getenv("LIBCUDF_IO_REROUTE_REMOTE_DIR_PATTERN"); if (local_dir_pattern != nullptr and remote_dir_pattern != nullptr) { - auto remote_file_path = - std::regex_replace(filepath, std::regex{local_dir_pattern}, remote_dir_pattern); + auto remote_file_path = std::regex_replace(filepath, + std::regex{local_dir_pattern}, + remote_dir_pattern, + std::regex_constants::format_first_only); return std::make_unique(remote_file_path.c_str()); } else { // `file_source` reads the file directly, without memory mapping diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py index dac8a5d51eb..1494a645825 100644 --- a/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py @@ -246,8 +246,8 @@ def from_args(cls, args: argparse.Namespace) -> RunConfig: scale_factor = float(scale_factor) skip_scale_factor_inference = ( - "LIBCUDF_WEBHDFS_LOCAL_DIR_PATTERN" in os.environ - ) and ("LIBCUDF_WEBHDFS_REMOTE_DIR_PATTERN" in os.environ) + "LIBCUDF_IO_REROUTE_LOCAL_DIR_PATTERN" in os.environ + ) and ("LIBCUDF_IO_REROUTE_REMOTE_DIR_PATTERN" in os.environ) if ( "pdsh" in name From c8bfb9ef48c2b453db9c3aeee35da9f9e95ae68c Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 29 Sep 2025 14:50:30 +0000 Subject: [PATCH 4/4] Update --- cpp/src/io/utilities/datasource.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index dc046152379..ff9fe5ddb45 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -452,16 +452,22 @@ std::unique_ptr datasource::create(std::string const& filepath, // for upstream Polars change. auto* local_dir_pattern = std::getenv("LIBCUDF_IO_REROUTE_LOCAL_DIR_PATTERN"); auto* remote_dir_pattern = std::getenv("LIBCUDF_IO_REROUTE_REMOTE_DIR_PATTERN"); + if (local_dir_pattern != nullptr and remote_dir_pattern != nullptr) { auto remote_file_path = std::regex_replace(filepath, std::regex{local_dir_pattern}, remote_dir_pattern, std::regex_constants::format_first_only); - return std::make_unique(remote_file_path.c_str()); - } else { - // `file_source` reads the file directly, without memory mapping - return std::make_unique(filepath.c_str()); + + // Create a remote file resource only when the pattern is found and replaced; otherwise, still + // create a local file resource + if (filepath != remote_file_path) { + return std::make_unique(remote_file_path.c_str()); + } } + + // `file_source` reads the file directly, without memory mapping + return std::make_unique(filepath.c_str()); } }