diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh
index cc202fa9d56..c918fa93b73 100755
--- a/ci/run_cudf_examples.sh
+++ b/ci/run_cudf_examples.sh
@@ -9,21 +9,21 @@ trap "EXITCODE=1" ERR
 # Support customizing the examples' install location
 cd "${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/examples/libcudf/" || exit
 
-cd basic || exit
+pushd basic || exit
 compute-sanitizer --tool memcheck basic_example
-cd ..
+popd || exit
 
-cd nested_types || exit
+pushd nested_types || exit
 compute-sanitizer --tool memcheck deduplication
-cd ..
+popd || exit
 
-cd strings || exit
+pushd strings || exit
 compute-sanitizer --tool memcheck custom_optimized names.csv
 compute-sanitizer --tool memcheck custom_prealloc names.csv
 compute-sanitizer --tool memcheck custom_with_malloc names.csv
-cd ..
+popd || exit
 
-cd string_transformers || exit
+pushd string_transformers || exit
 compute-sanitizer --tool memcheck compute_checksum_jit info.csv output.csv
 compute-sanitizer --tool memcheck extract_email_jit info.csv output.csv
 compute-sanitizer --tool memcheck extract_email_precompiled info.csv output.csv
@@ -31,14 +31,18 @@ compute-sanitizer --tool memcheck format_phone_jit info.csv output.csv
 compute-sanitizer --tool memcheck format_phone_precompiled info.csv output.csv
 compute-sanitizer --tool memcheck localize_phone_jit info.csv output.csv
 compute-sanitizer --tool memcheck localize_phone_precompiled info.csv output.csv
-cd ..
+popd || exit
 
-cd parquet_io || exit
+pushd parquet_io || exit
 compute-sanitizer --tool memcheck parquet_io example.parquet
 compute-sanitizer --tool memcheck parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD TRUE
 compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet
 compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet 4 DEVICE_BUFFER 2 2
-cd ..
+popd || exit
+
+pushd parquet_inspect || exit
+compute-sanitizer --tool memcheck parquet_inspect example.parquet
+popd || exit
 
 exit ${EXITCODE}
diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh
index 7296a1afd04..713500fc46d 100755
--- a/cpp/examples/build.sh
+++ b/cpp/examples/build.sh
@@ -61,5 +61,6 @@ build_example basic
 build_example strings
 build_example string_transforms
 build_example nested_types
+build_example parquet_inspect
 build_example parquet_io
 build_example billion_rows
diff --git a/cpp/examples/parquet_inspect/CMakeLists.txt b/cpp/examples/parquet_inspect/CMakeLists.txt
new file mode 100644
index 00000000000..f450470a835
--- /dev/null
+++ b/cpp/examples/parquet_inspect/CMakeLists.txt
@@ -0,0 +1,42 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
+
+cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR)
+
+include(../set_cuda_architecture.cmake)
+
+# initialize cuda architecture
+rapids_cuda_init_architectures(parquet_inspect)
+
+project(
+  parquet_inspect
+  VERSION 0.0.1
+  LANGUAGES CXX CUDA
+)
+
+include(../fetch_dependencies.cmake)
+
+include(rapids-cmake)
+rapids_cmake_build_type("Release")
+
+# For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the
+# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with
+# gcc>=14. We can remove this once we upgrade to a newer sccache version.
+set(CMAKE_CXX_SCAN_FOR_MODULES OFF)
+
+add_library(parquet_inspect_utils OBJECT parquet_inspect_utils.cpp)
+target_compile_features(parquet_inspect_utils PRIVATE cxx_std_20)
+target_link_libraries(parquet_inspect_utils PRIVATE cudf::cudf)
+
+# Build and install parquet_inspect
+add_executable(parquet_inspect parquet_inspect.cpp)
+target_link_libraries(
+  parquet_inspect PRIVATE cudf::cudf $<TARGET_OBJECTS:parquet_inspect_utils>
+                          $<BUILD_LOCAL_INTERFACE:nvtx3::nvtx3-cpp>
+)
+target_compile_features(parquet_inspect PRIVATE cxx_std_20)
+install(TARGETS parquet_inspect DESTINATION bin/examples/libcudf/parquet_inspect)
+
+# Install the example.parquet file
+install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.parquet
+        DESTINATION bin/examples/libcudf/parquet_inspect
+)
diff --git a/cpp/examples/parquet_inspect/example.parquet b/cpp/examples/parquet_inspect/example.parquet
new file mode 100644
index 00000000000..5632b144275
Binary files /dev/null and b/cpp/examples/parquet_inspect/example.parquet differ
diff --git a/cpp/examples/parquet_inspect/parquet_inspect.cpp b/cpp/examples/parquet_inspect/parquet_inspect.cpp
new file mode 100644
index 00000000000..7a7fceedc19
--- /dev/null
+++ b/cpp/examples/parquet_inspect/parquet_inspect.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "parquet_inspect_utils.hpp"
+
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+
+/**
+ * @file parquet_inspect.cpp
+ * @brief Inspects a parquet file and writes two parquet files containing row group and page
+ * metadata respectively.
+ */
+
+namespace {
+
+/**
+ * @brief Function to print example usage and argument information.
+ */
+void print_usage()
+{
+  std::cout << "\nUsage: parquet_inspect <input parquet file> <output path>\n\n";
+}
+
+}  // namespace
+
+/**
+ * @brief Main for the parquet_inspect example
+ *
+ * Command line parameters:
+ * 1. parquet input file name (default: "example.parquet")
+ * 2. parquet output path (default: "$pwd")
+ *
+ * Example invocation from directory `cudf/cpp/examples/parquet_inspect`:
+ * ./build/parquet_inspect example.parquet ./
+ */
+int main(int argc, char const** argv)
+{
+  std::string input_filepath = "example.parquet";
+  std::string output_path    = std::filesystem::current_path().string();
+  std::optional page_stats   = std::nullopt;
+
+  switch (argc) {
+    case 3: output_path = argv[2]; [[fallthrough]];
+    case 2:  // Check if, instead of the input file path, the first argument is `-h` or `--help`
+    {
+      auto const arg = std::string{argv[1]};
+      if (arg == "-h" or arg == "--help") {
+        print_usage();
+        return 0;
+      } else if (arg != "-h" and arg != "--help") {
+        input_filepath = std::filesystem::absolute(arg).string();
+        break;
+      }
+      [[fallthrough]];
+    }
+    default: print_usage(); throw std::runtime_error("Invalid arguments");
+  }
+
+  CUDF_EXPECTS(
+    std::filesystem::exists(input_filepath) and std::filesystem::is_regular_file(input_filepath),
+    "Input file '" + input_filepath + "' does not exist or is not a regular file.",
+    std::invalid_argument);
+
+  auto const filename = std::filesystem::path(input_filepath).stem().string();
+
+  auto const stream = cudf::get_default_stream();
+  auto const mr     = create_memory_resource(true);
+  cudf::set_current_device_resource(mr.get());
+
+  // Read parquet footer metadata
+  auto [metadata, has_page_index] = read_parquet_file_metadata(input_filepath);
+
+  // Write row group metadata
+  auto output_filepath = output_path + "/" + filename + ".rowgroups.parquet";
+  write_rowgroup_metadata(metadata, output_filepath, stream);
+
+  // Write page metadata
+  if (has_page_index) {
+    auto output_filepath = output_path + "/" + filename + ".pages.parquet";
+    write_page_metadata(metadata, output_filepath, stream);
+  }
+
+  stream.synchronize();
+
+  return 0;
+}
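For reference (not part of the patch), the two tables that `main` writes can be sanity-checked with libcudf's public Parquet reader. A minimal sketch, assuming the example was run as `parquet_inspect example.parquet ./` so that `example.rowgroups.parquet` exists in the current directory:

```cpp
#include <cudf/io/parquet.hpp>
#include <cudf/table/table.hpp>

#include <iostream>
#include <string>

int main()
{
  // Hypothetical path: the row-group output written by the example for example.parquet
  auto const path = std::string{"example.rowgroups.parquet"};

  auto const options =
    cudf::io::parquet_reader_options::builder(cudf::io::source_info{path}).build();
  auto result = cudf::io::read_parquet(options);

  // Print the column names ("row group index", "row offsets", ...); the table has one
  // row per row group of the inspected file.
  for (auto const& col : result.metadata.schema_info) {
    std::cout << col.name << "\n";
  }
  std::cout << result.tbl->num_rows() << " row groups\n";
  return 0;
}
```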
diff --git a/cpp/examples/parquet_inspect/parquet_inspect_utils.cpp b/cpp/examples/parquet_inspect/parquet_inspect_utils.cpp
new file mode 100644
index 00000000000..e61b60047ee
--- /dev/null
+++ b/cpp/examples/parquet_inspect/parquet_inspect_utils.cpp
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "parquet_inspect_utils.hpp"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+/**
+ * @file parquet_inspect_utils.cpp
+ * @brief Definitions for utilities for the `parquet_inspect` example
+ */
+
+namespace {
+
+/**
+ * @brief Compute page row counts, page row offsets, page byte offsets, and column chunk page
+ * (count) offsets for a given column index
+ */
+[[nodiscard]] auto compute_page_row_counts_and_offsets(
+  cudf::io::parquet::FileMetaData const& metadata,
+  cudf::size_type col_idx,
+  rmm::cuda_stream_view stream)
+{
+  auto const num_colchunks = metadata.row_groups.front().columns.size();
+
+  // Row counts per page per column chunk
+  auto page_row_counts = thrust::host_vector<int64_t>{};
+  // Row offsets per page per column chunk
+  auto page_row_offsets = thrust::host_vector<int64_t>{};
+  // Byte offsets per page per column chunk
+  auto page_byte_offsets = thrust::host_vector<int64_t>{};
+  // Pages (count) offsets per column chunk
+  auto col_page_offsets = thrust::host_vector<cudf::size_type>{};
+
+  page_row_offsets.push_back(0);
+  col_page_offsets.push_back(0);
+
+  // For all column chunks
+  std::for_each(
+    metadata.row_groups.cbegin(), metadata.row_groups.cend(), [&](auto const& row_group) {
+      // Find the column chunk with the given schema index
+      auto const& colchunk = row_group.columns[col_idx];
+
+      // Compute page row counts and offsets if this column chunk has column and offset indexes
+      if (colchunk.offset_index.has_value()) {
+        CUDF_EXPECTS(colchunk.column_index.has_value(),
+                     "Both offset and column indexes must be present");
+        // Get the offset and column indexes of the column chunk
+        auto const& offset_index = colchunk.offset_index.value();
+        auto const& column_index = colchunk.column_index.value();
+
+        // Number of pages in this column chunk
+        auto const row_group_num_pages = offset_index.page_locations.size();
+
+        CUDF_EXPECTS(column_index.min_values.size() == column_index.max_values.size(),
+                     "page min and max values should be of same size");
+        CUDF_EXPECTS(column_index.min_values.size() == row_group_num_pages,
+                     "mismatch between size of min/max page values and the size of page "
+                     "locations");
+        // Update the cumulative number of pages in this column chunk
+        col_page_offsets.push_back(col_page_offsets.back() + row_group_num_pages);
+
+        // For all pages in this column chunk, update page row counts and offsets.
+        std::for_each(
+          thrust::counting_iterator<size_t>(0),
+          thrust::counting_iterator<size_t>(row_group_num_pages),
+          [&](auto const page_idx) {
+            int64_t const first_row_idx = offset_index.page_locations[page_idx].first_row_index;
+            // For the last page, this is simply the total number of rows in the
+            // column chunk
+            int64_t const last_row_idx =
+              (page_idx < row_group_num_pages - 1)
+                ? offset_index.page_locations[page_idx + 1].first_row_index
+                : row_group.num_rows;
+
+            // Update the page row counts and offsets
+            page_row_counts.push_back(last_row_idx - first_row_idx);
+            page_byte_offsets.push_back(offset_index.page_locations[page_idx].offset);
+            page_row_offsets.push_back(page_row_offsets.back() + page_row_counts.back());
+          });
+      }
+    });
+  return std::tuple{std::move(page_row_counts),
+                    std::move(page_row_offsets),
+                    std::move(page_byte_offsets),
+                    std::move(col_page_offsets)};
+}
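The per-page row counts computed above are adjacent differences of the OffsetIndex `first_row_index` values, with the row group's total row count closing the last page. A standalone sketch of just that arithmetic, using plain integers instead of the cudf thrift metadata types (the sample values in the comment are hypothetical):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Derive page row counts for one column chunk from its OffsetIndex page locations.
// first_row_indexes: first_row_index of each page; rowgroup_num_rows: rows in the row group.
std::vector<int64_t> page_row_counts(std::vector<int64_t> const& first_row_indexes,
                                     int64_t rowgroup_num_rows)
{
  std::vector<int64_t> counts;
  for (std::size_t i = 0; i < first_row_indexes.size(); ++i) {
    // Rows in page i end where page i+1 begins, or at the end of the row group for the last page
    auto const last_row = (i + 1 < first_row_indexes.size()) ? first_row_indexes[i + 1]
                                                             : rowgroup_num_rows;
    counts.push_back(last_row - first_row_indexes[i]);
  }
  return counts;
}

// e.g. first_row_indexes {0, 20000, 40000} with 50000 rows -> counts {20000, 20000, 10000}
```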
+
+/**
+ * @brief Makes an INT64 index column containing elements: [0, num_rows)
+ *
+ * @param num_rows Number of rows
+ * @param stream CUDA stream
+ *
+ * @return A unique pointer to a column
+ */
+auto make_index_column(cudf::size_type num_rows, rmm::cuda_stream_view stream)
+{
+  std::vector<int64_t> data(num_rows);
+  std::iota(data.begin(), data.end(), 0);
+  auto buffer = rmm::device_buffer(data.data(), num_rows * sizeof(int64_t), stream);
+  return std::make_unique<cudf::column>(cudf::data_type{cudf::type_to_id<int64_t>()},
+                                        num_rows,
+                                        std::move(buffer),
+                                        rmm::device_buffer{},
+                                        0);
+}
+
+/**
+ * @brief Constructs a cuDF column from the host data
+ *
+ * @tparam T Data type
+ * @param host_data Span of host data
+ * @param stream CUDA stream
+ *
+ * @return A unique pointer to a column
+ */
+template <typename T>
+auto make_column(cudf::host_span<T const> host_data, rmm::cuda_stream_view stream)
+{
+  auto device_buffer = rmm::device_buffer(host_data.data(), host_data.size() * sizeof(T), stream);
+  return std::make_unique<cudf::column>(cudf::data_type{cudf::type_to_id<T>()},
+                                        host_data.size(),
+                                        std::move(device_buffer),
+                                        rmm::device_buffer{},
+                                        0);
+}
+
+/**
+ * @brief Constructs a list column (one list per row group) for the given page-level data
+ *
+ * @tparam T Data type
+ * @param data Span of host data
+ * @param col_page_offsets Span of column page (count) offsets per row group
+ * @param num_row_groups Number of row groups
+ * @param num_pages_this_column Total number of pages in this column
+ * @param stream CUDA stream
+ *
+ * @return A unique pointer to a column
+ */
+template <typename T>
+auto make_page_data_list_column(cudf::host_span<T const> data,
+                                cudf::host_span<cudf::size_type const> col_page_offsets,
+                                cudf::size_type num_row_groups,
+                                cudf::size_type num_pages_this_column,
+                                rmm::cuda_stream_view stream)
+{
+  CUDF_EXPECTS(col_page_offsets.size() == num_row_groups + 1,
+               "Mismatch between offsets and number of row groups");
+
+  auto offsets_column = make_column(col_page_offsets, stream);
+
+  auto page_data_buffer =
+    rmm::device_buffer(data.data(), num_pages_this_column * sizeof(int64_t), stream);
+
+  auto page_data_column =
+    std::make_unique<cudf::column>(cudf::data_type{cudf::type_to_id<int64_t>()},
+                                   num_pages_this_column,
+                                   std::move(page_data_buffer),
+                                   rmm::device_buffer{},
+                                   0);
+
+  return cudf::make_lists_column(num_row_groups,
+                                 std::move(offsets_column),
+                                 std::move(page_data_column),
+                                 0,
+                                 rmm::device_buffer{},
+                                 stream);
+}
+
+}  // namespace
+
+std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(bool is_pool_used)
+{
+  auto cuda_mr = std::make_shared<rmm::mr::cuda_memory_resource>();
+  if (is_pool_used) {
+    return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
+      cuda_mr, rmm::percent_of_free_device_memory(50));
+  }
+  return cuda_mr;
+}
+
+cudf::host_span<uint8_t const> fetch_footer_bytes(cudf::host_span<uint8_t const> buffer)
+{
+  CUDF_FUNC_RANGE();
+
+  using namespace cudf::io::parquet;
+
+  constexpr auto header_len = sizeof(file_header_s);
+  constexpr auto ender_len  = sizeof(file_ender_s);
+  size_t const len          = buffer.size();
+
+  auto const header_buffer = cudf::host_span<uint8_t const>(buffer.data(), header_len);
+  auto const header        = reinterpret_cast<file_header_s const*>(header_buffer.data());
+  auto const ender_buffer =
+    cudf::host_span<uint8_t const>(buffer.data() + len - ender_len, ender_len);
+  auto const ender = reinterpret_cast<file_ender_s const*>(ender_buffer.data());
+  CUDF_EXPECTS(len > header_len + ender_len, "Incorrect data source");
+  constexpr uint32_t parquet_magic = (('P' << 0) | ('A' << 8) | ('R' << 16) | ('1' << 24));
+  CUDF_EXPECTS(header->magic == parquet_magic && ender->magic == parquet_magic,
+               "Corrupted header or footer");
+  CUDF_EXPECTS(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len),
+               "Incorrect footer length");
+
+  return cudf::host_span<uint8_t const>(buffer.data() + len - ender->footer_len - ender_len,
+                                        ender->footer_len);
+}
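`fetch_footer_bytes` relies on the standard Parquet trailer layout: a file ends with a 4-byte little-endian footer length followed by the 4-byte magic `PAR1`. A minimal sketch of the same bounds check without the cudf-internal `file_header_s`/`file_ender_s` structs (assumes a little-endian host, as libcudf itself does):

```cpp
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <utility>
#include <vector>

// Returns the byte offset and length of the thrift-encoded footer within `file_bytes`.
std::pair<std::size_t, uint32_t> locate_footer(std::vector<uint8_t> const& file_bytes)
{
  constexpr std::size_t trailer_len = 8;  // 4-byte footer length + 4-byte "PAR1" magic
  if (file_bytes.size() < trailer_len + 4) { throw std::runtime_error("File too small"); }

  auto const* trailer = file_bytes.data() + file_bytes.size() - trailer_len;
  if (std::memcmp(trailer + 4, "PAR1", 4) != 0) { throw std::runtime_error("Bad magic"); }

  uint32_t footer_len = 0;
  std::memcpy(&footer_len, trailer, sizeof(footer_len));  // little-endian footer length
  return {file_bytes.size() - trailer_len - footer_len, footer_len};
}
```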
+
+cudf::host_span<uint8_t const> fetch_page_index_bytes(
+  cudf::host_span<uint8_t const> buffer, cudf::io::text::byte_range_info const page_index_bytes)
+{
+  return cudf::host_span<uint8_t const>(
+    reinterpret_cast<uint8_t const*>(buffer.data()) + page_index_bytes.offset(),
+    page_index_bytes.size());
+}
+
+std::tuple<cudf::io::parquet::FileMetaData, bool> read_parquet_file_metadata(
+  std::string_view input_filepath)
+{
+  CUDF_FUNC_RANGE();
+
+  auto file_buffer = std::vector<uint8_t>(std::filesystem::file_size(input_filepath));
+  std::ifstream file(input_filepath.data(), std::ios::binary);
+  file.read(reinterpret_cast<char*>(file_buffer.data()), file_buffer.size());
+  file.close();
+
+  auto options = cudf::io::parquet_reader_options::builder().build();
+
+  // Fetch footer bytes and setup reader
+  auto const footer_buffer = fetch_footer_bytes(file_buffer);
+  auto const reader =
+    std::make_unique(footer_buffer, options);
+
+  // Get page index byte range from the reader
+  auto const page_index_byte_range = reader->page_index_byte_range();
+
+  // Check and setup page index if the file contains one
+  auto const has_page_index = not page_index_byte_range.is_empty();
+  if (has_page_index) {
+    auto const page_index_buffer = fetch_page_index_bytes(file_buffer, page_index_byte_range);
+    reader->setup_page_index(page_index_buffer);
+  } else {
+    std::cout << "The input parquet file does not contain a page index\n";
+  }
+
+  return std::tuple{reader->parquet_metadata(), has_page_index};
+}
+
+void write_rowgroup_metadata(cudf::io::parquet::FileMetaData const& metadata,
+                             std::string const& output_filepath,
+                             rmm::cuda_stream_view stream)
+{
+  CUDF_FUNC_RANGE();
+
+  auto const num_row_groups = metadata.row_groups.size();
+
+  // Compute row group row offsets
+  auto row_group_row_offsets = std::vector<int64_t>();
+  row_group_row_offsets.reserve(num_row_groups + 1);
+  row_group_row_offsets.emplace_back(0);
+
+  // Compute row group row counts
+  auto row_group_row_counts = std::vector<int64_t>();
+  row_group_row_counts.reserve(num_row_groups);
+
+  // Compute row group byte offsets
+  auto row_group_byte_offsets = std::vector<int64_t>();
+  row_group_byte_offsets.reserve(num_row_groups);
+
+  std::for_each(metadata.row_groups.begin(), metadata.row_groups.end(), [&](auto const& rg) {
+    row_group_row_counts.emplace_back(rg.num_rows);
+    row_group_row_offsets.emplace_back(row_group_row_offsets.back() + rg.num_rows);
+    // Get the file offset of this row group
+    auto const row_group_file_offset = [&]() {
+      if (rg.file_offset.has_value()) {
+        return rg.file_offset.value();
+      } else if (rg.columns.front().file_offset != 0) {
+        return rg.columns.front().file_offset;
+      } else {
+        auto const& col_meta = rg.columns.front().meta_data;
+        return col_meta.dictionary_page_offset != 0
+                 ? std::min(col_meta.dictionary_page_offset, col_meta.data_page_offset)
+                 : col_meta.data_page_offset;
+      }
+    }();
+    row_group_byte_offsets.emplace_back(row_group_file_offset);
+  });
+
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.emplace_back(make_index_column(num_row_groups, stream));
+
+  auto row_offsets_buffer =
+    rmm::device_buffer(row_group_row_offsets.data(), num_row_groups * sizeof(int64_t), stream);
+  auto row_counts_buffer =
+    rmm::device_buffer(row_group_row_counts.data(), num_row_groups * sizeof(int64_t), stream);
+  auto byte_offsets_buffer =
+    rmm::device_buffer(row_group_byte_offsets.data(), num_row_groups * sizeof(int64_t), stream);
+
+  columns.emplace_back(std::make_unique<cudf::column>(cudf::data_type{cudf::type_to_id<int64_t>()},
+                                                      num_row_groups,
+                                                      std::move(row_offsets_buffer),
+                                                      rmm::device_buffer{},
+                                                      0));
+  columns.emplace_back(std::make_unique<cudf::column>(cudf::data_type{cudf::type_to_id<int64_t>()},
+                                                      num_row_groups,
+                                                      std::move(row_counts_buffer),
+                                                      rmm::device_buffer{},
+                                                      0));
+  columns.emplace_back(std::make_unique<cudf::column>(cudf::data_type{cudf::type_to_id<int64_t>()},
+                                                      num_row_groups,
+                                                      std::move(byte_offsets_buffer),
+                                                      rmm::device_buffer{},
+                                                      0));
+
+  auto table = std::make_unique<cudf::table>(std::move(columns));
+
+  cudf::io::table_input_metadata out_metadata(table->view());
+  out_metadata.column_metadata[0].set_name("row group index");
+  out_metadata.column_metadata[1].set_name("row offsets");
+  out_metadata.column_metadata[2].set_name("row counts");
+  out_metadata.column_metadata[3].set_name("byte offsets");
+
+  auto const out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info(output_filepath), table->view())
+      .metadata(out_metadata)
+      .build();
+  cudf::io::write_parquet(out_opts, stream);
+}
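The byte-offset fallback in the lambda above follows the Parquet convention that, when the optional `RowGroup.file_offset` and column-chunk `file_offset` fields are absent, a chunk starts at its dictionary page if one exists (dictionary pages precede data pages), otherwise at its first data page. The same rule in isolation, as a small sketch over plain integers:

```cpp
#include <algorithm>
#include <cstdint>

// Start offset of a column chunk given its (possibly zero, i.e. absent) dictionary page
// offset and its data page offset, mirroring the fallback in write_rowgroup_metadata.
int64_t chunk_start_offset(int64_t dictionary_page_offset, int64_t data_page_offset)
{
  return dictionary_page_offset != 0 ? std::min(dictionary_page_offset, data_page_offset)
                                     : data_page_offset;
}
```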
+
+void write_page_metadata(cudf::io::parquet::FileMetaData const& metadata,
+                         std::string const& output_filepath,
+                         rmm::cuda_stream_view stream)
+{
+  CUDF_FUNC_RANGE();
+
+  auto const num_columns    = metadata.row_groups.front().columns.size();
+  auto const num_row_groups = metadata.row_groups.size();
+
+  auto constexpr output_cols_per_column = 3;
+
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.emplace_back(make_index_column(num_row_groups, stream));
+
+  std::for_each(
+    thrust::counting_iterator<size_t>(0),
+    thrust::counting_iterator<size_t>(num_columns),
+    [&](auto const col_idx) {
+      auto const [page_row_counts, page_row_offsets, page_byte_offsets, col_page_offsets] =
+        compute_page_row_counts_and_offsets(metadata, col_idx, stream);
+
+      auto const num_pages_this_column = page_row_counts.size();
+
+      CUDF_EXPECTS(num_pages_this_column == col_page_offsets.back(),
+                   "Mismatch between the number of pages and page offsets");
+
+      columns.emplace_back(make_page_data_list_column<int64_t>(
+        page_row_counts, col_page_offsets, num_row_groups, num_pages_this_column, stream));
+      columns.emplace_back(make_page_data_list_column<int64_t>(
+        page_row_offsets, col_page_offsets, num_row_groups, num_pages_this_column, stream));
+      columns.emplace_back(make_page_data_list_column<int64_t>(
+        page_byte_offsets, col_page_offsets, num_row_groups, num_pages_this_column, stream));
+
+      stream.synchronize();
+    });
+
+  CUDF_EXPECTS(columns.size() == (num_columns * output_cols_per_column) + 1,
+               "Mismatch between number of columns and number of columns in the table");
+
+  auto table = std::make_unique<cudf::table>(std::move(columns));
+  cudf::io::table_input_metadata out_metadata(table->view());
+  out_metadata.column_metadata[0].set_name("row group index");
+
+  std::for_each(thrust::counting_iterator<size_t>(0),
+                thrust::counting_iterator<size_t>(num_columns),
+                [&](auto const col_idx) {
+                  std::string const col_name = "col" + std::to_string(col_idx);
+                  out_metadata.column_metadata[1 + col_idx * output_cols_per_column].set_name(
+                    col_name + " page row counts");
+                  out_metadata.column_metadata[1 + col_idx * output_cols_per_column + 1].set_name(
+                    col_name + " page row offsets");
+                  out_metadata.column_metadata[1 + col_idx * output_cols_per_column + 2].set_name(
+                    col_name + " page byte offsets");
+                });
+
+  auto const out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info(output_filepath), table->view())
+      .metadata(out_metadata)
+      .build();
+  cudf::io::write_parquet(out_opts, stream);
+}
diff --git a/cpp/examples/parquet_inspect/parquet_inspect_utils.hpp b/cpp/examples/parquet_inspect/parquet_inspect_utils.hpp
new file mode 100644
index 00000000000..8cd2e9136e1
--- /dev/null
+++ b/cpp/examples/parquet_inspect/parquet_inspect_utils.hpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+#include
+#include
+
+/**
+ * @file parquet_inspect_utils.hpp
+ * @brief Utilities for the `parquet_inspect` example
+ */
+
+/**
+ * @brief Create memory resource for libcudf functions
+ *
+ * @param is_pool_used Whether to use a pool memory resource
+ * @return Memory resource instance
+ */
+std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(bool is_pool_used);
+
+/**
+ * @brief Fetches a host span of Parquet footer bytes from the input buffer span
+ *
+ * @param buffer Input buffer span
+ * @return A host span of the footer bytes
+ */
+cudf::host_span<uint8_t const> fetch_footer_bytes(cudf::host_span<uint8_t const> buffer);
+
+/**
+ * @brief Fetches a host span of Parquet PageIndex bytes from the input buffer span
+ *
+ * @param buffer Input buffer span
+ * @param page_index_bytes Byte range of the `PageIndex` to fetch
+ * @return A host span of the PageIndex bytes
+ */
+cudf::host_span<uint8_t const> fetch_page_index_bytes(
+  cudf::host_span<uint8_t const> buffer, cudf::io::text::byte_range_info const page_index_bytes);
+
+/**
+ * @brief Reads parquet metadata (FileMetaData struct) from a file
+ *
+ * @param input_filepath Path to the input parquet file
+ *
+ * @return A tuple containing the parquet metadata and a boolean indicating if the file contains a
+ * page index
+ */
+std::tuple<cudf::io::parquet::FileMetaData, bool> read_parquet_file_metadata(
+  std::string_view input_filepath);
+
+/**
+ * @brief Writes row group metadata to a parquet file
+ *
+ * @param metadata Parquet file metadata
+ * @param output_filepath Path to the output file
+ * @param stream CUDA stream
+ */
+void write_rowgroup_metadata(cudf::io::parquet::FileMetaData const& metadata,
+                             std::string const& output_filepath,
+                             rmm::cuda_stream_view stream);
+
+/**
+ * @brief Writes page metadata to a parquet file
+ *
+ * @param metadata Parquet file metadata
+ * @param output_filepath Path to the output file
+ * @param stream CUDA stream
+ */
+void write_page_metadata(cudf::io::parquet::FileMetaData const& metadata,
+                         std::string const& output_filepath,
+                         rmm::cuda_stream_view stream);
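Separately from the footer/PageIndex parsing done in this example, libcudf also exposes a public metadata API that reports row and row-group counts without decoding any data. A sketch using that API as I recall it from `<cudf/io/parquet_metadata.hpp>` (worth double-checking against the installed headers); it summarizes the file but does not expose the per-page detail this example extracts:

```cpp
#include <cudf/io/parquet_metadata.hpp>

#include <iostream>
#include <string>

int main()
{
  // Hypothetical input; the same file the example ships with.
  auto const meta =
    cudf::io::read_parquet_metadata(cudf::io::source_info{std::string{"example.parquet"}});

  std::cout << "rows: " << meta.num_rows() << ", row groups: " << meta.num_rowgroups() << "\n";
  return 0;
}
```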