From fca95e636d5798c808401a143207a1913c5fcc7f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 11 Sep 2025 13:50:53 -0400 Subject: [PATCH 1/8] Update parquet-testing pin to `master` --- parquet-testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-testing b/parquet-testing index 5cbfc43d488c..a3d96a65e11e 160000 --- a/parquet-testing +++ b/parquet-testing @@ -1 +1 @@ -Subproject commit 5cbfc43d488c9c8404a1a7088cca400ae095b831 +Subproject commit a3d96a65e11e2bbca7d22a894e8313ede90a33a3 From a2ce848e660885585aa4b8f92eec119ffecdab22 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 11 Sep 2025 14:23:49 -0400 Subject: [PATCH 2/8] Update variant_integration tests to read and use the new shredding support --- parquet-variant/src/variant/metadata.rs | 8 +- parquet/Cargo.toml | 5 + parquet/tests/variant_integration.rs | 1580 ++++++----------------- 3 files changed, 421 insertions(+), 1172 deletions(-) diff --git a/parquet-variant/src/variant/metadata.rs b/parquet-variant/src/variant/metadata.rs index 1c9da6bcc0cf..14d048631de7 100644 --- a/parquet-variant/src/variant/metadata.rs +++ b/parquet-variant/src/variant/metadata.rs @@ -130,6 +130,7 @@ impl VariantMetadataHeader { /// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding #[derive(Debug, Clone, PartialEq)] pub struct VariantMetadata<'m> { + /// (Only) the bytes that make up this metadata instance. pub(crate) bytes: &'m [u8], header: VariantMetadataHeader, dictionary_size: u32, @@ -299,7 +300,7 @@ impl<'m> VariantMetadata<'m> { self.header.version } - /// Gets an offset array entry by index. + /// Gets an offset into the dictionary entry by index. /// /// This offset is an index into the dictionary, at the boundary between string `i-1` and string /// `i`. See [`Self::get`] to retrieve a specific dictionary entry. @@ -309,6 +310,11 @@ impl<'m> VariantMetadata<'m> { self.header.offset_size.unpack_u32(bytes, i) } + /// Returns the last byte offset of the metadata dictionary, which is also the total size of + pub fn size(&self) -> usize { + self.bytes.len() + } + /// Attempts to retrieve a dictionary entry by index, failing if out of bounds or if the /// underlying bytes are [invalid]. /// diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index a39275fb254e..5dbd4b5b39dd 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -171,6 +171,11 @@ name = "encryption" required-features = ["arrow"] path = "./tests/encryption/mod.rs" +[[test]] +name = "variant_integration" +required-features = ["arrow", "variant_experimental", "serde"] +path = "./tests/variant_integration.rs" + [[bin]] name = "parquet-read" required-features = ["cli"] diff --git a/parquet/tests/variant_integration.rs b/parquet/tests/variant_integration.rs index e379b820f29f..fe863462a660 100644 --- a/parquet/tests/variant_integration.rs +++ b/parquet/tests/variant_integration.rs @@ -21,1233 +21,471 @@ //! Variant values from .variant.bin files, reads Parquet files, converts StructArray //! to VariantArray, and verifies that extracted values match expected results. //! -//! Based on the parquet-testing PR: https://github.com/apache/parquet-testing/pull/90/files -//! Inspired by the arrow-go implementation: https://github.com/apache/arrow-go/pull/455/files +//! Inspired by the arrow-go implementation: -// These tests require the arrow feature -#![cfg(feature = "arrow")] - -use arrow_array::{Array, StructArray}; +use arrow::util::test_util::parquet_test_data; +use arrow_array::{Array, ArrayRef}; +use arrow_cast::cast; +use arrow_schema::{DataType, Fields}; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; -use std::{ - env, - error::Error, - fs, - path::{Path, PathBuf}, -}; +use parquet_variant::{Variant, VariantMetadata}; +use parquet_variant_compute::VariantArray; +use serde::Deserialize; +use std::path::Path; +use std::sync::{Arc, LazyLock}; +use std::{fs, path::PathBuf}; + +type Result = std::result::Result; + +/// Creates a test function for a given case number +/// +/// Note the index is zero-based, while the case number is one-based +macro_rules! variant_test_case { + ($case_num:literal) => { + paste::paste! { + #[test] + fn []() { + all_cases()[$case_num - 1].run() + } + } + }; + + // Generates an error test case, where the expected result is an error message + ($case_num:literal, $expected_error:literal) => { + paste::paste! { + #[test] + #[should_panic(expected = $expected_error)] + fn []() { + all_cases()[$case_num - 1].run() + } + } + }; +} -/// Test case definition structure matching the format from cases.json -#[derive(Debug, Clone)] +// Generate test functions for each case +// Note that case 3 is empty in cases.json, so we skip it here +// Only cases 40, 42, 87, 127 and 128 are expected to fail, the rest should pass + +variant_test_case!(1, "Unsupported typed_value type: List("); +variant_test_case!(2, "Unsupported typed_value type: List("); +// case 3 is empty in cases.json 🤷 +// ```json +// { +// "case_number" : 3 +// }, +// ``` +variant_test_case!(3, "parquet_file must be set"); +variant_test_case!(4, "Unsupported typed_value type: Boolean"); +variant_test_case!(5, "Unsupported typed_value type: Boolean"); +variant_test_case!(6); +variant_test_case!(7); +variant_test_case!(8); +variant_test_case!(9); +variant_test_case!(10); +variant_test_case!(11); +variant_test_case!(12); +variant_test_case!(13); +variant_test_case!(14); +variant_test_case!(15); +variant_test_case!(16); +variant_test_case!(17); +variant_test_case!(18, "Unsupported typed_value type: Date32"); +variant_test_case!(19, "Unsupported typed_value type: Date32"); +variant_test_case!( + 20, + "Unsupported typed_value type: Timestamp(Microsecond, Some(\"UTC\"))" +); +variant_test_case!( + 21, + "Unsupported typed_value type: Timestamp(Microsecond, Some(\"UTC\"))" +); +variant_test_case!( + 22, + "Unsupported typed_value type: Timestamp(Microsecond, None)" +); +variant_test_case!( + 23, + "Unsupported typed_value type: Timestamp(Microsecond, None)" +); +variant_test_case!(24, "Unsupported typed_value type: Decimal128(9, 4)"); +variant_test_case!(25, "Unsupported typed_value type: Decimal128(9, 4)"); +variant_test_case!(26, "Unsupported typed_value type: Decimal128(18, 9)"); +variant_test_case!(27, "Unsupported typed_value type: Decimal128(18, 9)"); +variant_test_case!(28, "Unsupported typed_value type: Decimal128(38, 9)"); +variant_test_case!(29, "Unsupported typed_value type: Decimal128(38, 9)"); +variant_test_case!(30, "Unsupported typed_value type: BinaryView"); +variant_test_case!(31, "Unsupported typed_value type: Utf8"); +variant_test_case!(32, "Unsupported typed_value type: Time64(Microsecond)"); +variant_test_case!( + 33, + "Unsupported typed_value type: Timestamp(Nanosecond, Some(\"UTC\"))" +); +variant_test_case!( + 34, + "Unsupported typed_value type: Timestamp(Nanosecond, Some(\"UTC\"))" +); +variant_test_case!( + 35, + "Unsupported typed_value type: Timestamp(Nanosecond, None)" +); +variant_test_case!( + 36, + "Unsupported typed_value type: Timestamp(Nanosecond, None)" +); +variant_test_case!(37, "Unsupported typed_value type: FixedSizeBinary(16)"); +variant_test_case!(38, "Unsupported typed_value type: Struct("); +variant_test_case!(39); +// Is an error case (should be failing as the expected error message indicates) +variant_test_case!(40, "Unsupported typed_value type: List("); +variant_test_case!(41, "Unsupported typed_value type: List(Field"); +// Is an error case (should be failing as the expected error message indicates) +variant_test_case!( + 42, + "Expected an error 'Invalid variant, conflicting value and typed_value`, but got no error" +); +variant_test_case!(43, "Unsupported typed_value type: Struct([Field"); +variant_test_case!(44, "Unsupported typed_value type: Struct([Field"); +variant_test_case!(45, "Unsupported typed_value type: List(Field"); +variant_test_case!(46, "Unsupported typed_value type: Struct([Field"); +variant_test_case!(47); +variant_test_case!(48); +variant_test_case!(49); +variant_test_case!(50); +variant_test_case!(51); +variant_test_case!(52); +variant_test_case!(53); +variant_test_case!(54); +variant_test_case!(55); +variant_test_case!(56); +variant_test_case!(57); +variant_test_case!(58); +variant_test_case!(59); +variant_test_case!(60); +variant_test_case!(61); +variant_test_case!(62); +variant_test_case!(63); +variant_test_case!(64); +variant_test_case!(65); +variant_test_case!(66); +variant_test_case!(67); +variant_test_case!(68); +variant_test_case!(69); +variant_test_case!(70); +variant_test_case!(71); +variant_test_case!(72); +variant_test_case!(73); +variant_test_case!(74); +variant_test_case!(75); +variant_test_case!(76); +variant_test_case!(77); +variant_test_case!(78); +variant_test_case!(79); +variant_test_case!(80); +variant_test_case!(81); +variant_test_case!(82); +variant_test_case!(83, "Unsupported typed_value type: Struct([Field"); +variant_test_case!(84, "Unsupported typed_value type: Struct([Field"); +variant_test_case!(85, "Unsupported typed_value type: List(Field"); +variant_test_case!(86, "Unsupported typed_value type: List(Field"); +// Is an error case (should be failing as the expected error message indicates) +variant_test_case!(87, "Unsupported typed_value type: Struct([Field"); +variant_test_case!(88, "Unsupported typed_value type: List(Field"); +variant_test_case!(89); +variant_test_case!(90); +variant_test_case!(91); +variant_test_case!(92); +variant_test_case!(93); +variant_test_case!(94); +variant_test_case!(95); +variant_test_case!(96); +variant_test_case!(97); +variant_test_case!(98); +variant_test_case!(99); +variant_test_case!(100); +variant_test_case!(101); +variant_test_case!(102); +variant_test_case!(103); +variant_test_case!(104); +variant_test_case!(105); +variant_test_case!(106); +variant_test_case!(107); +variant_test_case!(108); +variant_test_case!(109); +variant_test_case!(110); +variant_test_case!(111); +variant_test_case!(112); +variant_test_case!(113); +variant_test_case!(114); +variant_test_case!(115); +variant_test_case!(116); +variant_test_case!(117); +variant_test_case!(118); +variant_test_case!(119); +variant_test_case!(120); +variant_test_case!(121); +variant_test_case!(122); +variant_test_case!(123); +variant_test_case!(124); +variant_test_case!(125, "Unsupported typed_value type: Struct"); +variant_test_case!(126, "Unsupported typed_value type: List("); +// Is an error case (should be failing as the expected error message indicates) +variant_test_case!( + 127, + "Invalid variant data: InvalidArgumentError(\"Received empty bytes\")" +); +// Is an error case (should be failing as the expected error message indicates) +variant_test_case!(128, "Unsupported typed_value type: Struct([Field"); +variant_test_case!(129, "Invalid variant data: InvalidArgumentError("); +variant_test_case!(130, "Unsupported typed_value type: Struct([Field"); +variant_test_case!(131); +variant_test_case!(132, "Unsupported typed_value type: Struct([Field"); +variant_test_case!(133, "Unsupported typed_value type: Struct([Field"); +variant_test_case!(134, "Unsupported typed_value type: Struct([Field"); +variant_test_case!(135); +variant_test_case!(136, "Unsupported typed_value type: List(Field "); +variant_test_case!(137, "Invalid variant data: InvalidArgumentError("); +variant_test_case!(138, "Unsupported typed_value type: Struct([Field"); + +/// Test case definition structure matching the format from +/// `parquet-testing/parquet_shredded/cases.json` +/// +/// See [README] for details. +/// +/// [README]: https://github.com/apache/parquet-testing/blob/master/shredded_variant/README.md +/// +/// Example JSON +/// ```json +/// { +/// "case_number" : 5, +/// "test" : "testShreddedVariantPrimitives", +/// "parquet_file" : "case-005.parquet", +/// "variant_file" : "case-005_row-0.variant.bin", +/// "variant" : "Variant(metadata=VariantMetadata(dict={}), value=Variant(type=BOOLEAN_FALSE, value=false))" +/// }, +/// ``` +#[allow(dead_code)] // some fields are not used except when printing the struct +#[derive(Debug, Clone, Deserialize)] struct VariantTestCase { - /// Case number (e.g., 1, 2, 4, etc. - note: case 3 is missing) + /// Case number (e.g., 1, 2, 4, etc. - note: case 3 is missing any data) pub case_number: u32, /// Test method name (e.g., "testSimpleArray") pub test: Option, /// Name of the parquet file (e.g., "case-001.parquet") - pub parquet_file: String, + pub parquet_file: Option, + /// Expected variant binary file (e.g., "case-001_row-0.variant.bin") - None for error cases pub variant_file: Option, + /// Multiple expected variant binary files, for multi row inputs. If there + /// is no variant, there is no file + pub variant_files: Option>>, /// Expected error message for negative test cases + /// + /// (this is the message from the cases.json file, which is from the Iceberg + /// implementation, so it is not guaranteed to match the actual Rust error message) pub error_message: Option, /// Description of the variant value (for debugging) pub variant_description: Option, - /// Whether this test is currently expected to pass - pub enabled: bool, - /// Test category for grouping and analysis - pub test_category: TestCategory, -} - -/// Categories of variant tests for organized validation -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -enum TestCategory { - /// Basic primitive type tests - Primitives, - /// Array-related tests (simple, nested, with errors) - Arrays, - /// Object-related tests (shredded, partial, with errors) - Objects, - /// Tests expecting specific error conditions - ErrorHandling, - /// Schema validation and unshredded variants - SchemaValidation, - /// Mixed and complex scenarios - Complex, -} - -/// Comprehensive test harness for Parquet Variant integration -struct VariantIntegrationHarness { - /// Directory containing shredded_variant test data - test_data_dir: PathBuf, - /// Parsed test cases from cases.json - test_cases: Vec, } -impl VariantIntegrationHarness { - /// Create a new integration test harness - fn new() -> Result> { - let test_data_dir = find_shredded_variant_test_data()?; - let test_cases = load_test_cases(&test_data_dir)?; - - println!( - "Loaded {} test cases from {}", - test_cases.len(), - test_data_dir.display() - ); - - Ok(Self { - test_data_dir, - test_cases, - }) - } - - /// Run all integration tests - fn run_all_tests(&self) -> Result<(), Box> { - println!("Running Parquet Variant Integration Tests"); - println!("=========================================="); +/// Run a single test case +impl VariantTestCase { + /// Run a test case. Panics on unexpected error + fn run(&self) { + println!("{self:#?}"); - let mut passed = 0; - let mut failed = 0; - let mut ignored = 0; + let variant_data = self.load_variants(); + let variant_array = self.load_parquet(); - for test_case in &self.test_cases { - if !test_case.enabled { - println!( - "IGNORED: case-{:03} - {}", - test_case.case_number, - test_case.test.as_deref().unwrap_or("unknown test") - ); - ignored += 1; - continue; - } - - match self.run_single_test(test_case) { - Ok(()) => { - println!( - "PASSED: case-{:03} - {}", - test_case.case_number, - test_case.test.as_deref().unwrap_or("unknown test") - ); - passed += 1; - } - Err(e) => { - println!( - "FAILED: case-{:03} - {} - Error: {}", - test_case.case_number, - test_case.test.as_deref().unwrap_or("unknown test"), - e - ); - failed += 1; - } - } - } - - println!("\nTest Results:"); - println!(" Passed: {}", passed); - println!(" Failed: {}", failed); - println!(" Ignored: {}", ignored); - println!(" Total: {}", passed + failed + ignored); - - if failed > 0 { - Err(format!("{} tests failed", failed).into()) - } else { - Ok(()) - } - } - - /// Run a single test case - fn run_single_test(&self, test_case: &VariantTestCase) -> Result<(), Box> { - match &test_case.test_category { - TestCategory::ErrorHandling => { - // For error cases, we expect the parsing/validation to fail - self.run_error_test(test_case) - } - _ => { - // For normal cases, run standard validation - self.run_success_test(test_case) + // if this is an error case, the expected error message should be set + if let Some(expected_error) = &self.error_message { + // just accessing the variant_array should trigger the error + for i in 0..variant_array.len() { + let _ = variant_array.value(i); } + panic!("Expected an error '{expected_error}`, but got no error"); } - } - - /// Run a test case that should succeed - fn run_success_test(&self, test_case: &VariantTestCase) -> Result<(), Box> { - // Step 1: Load expected Variant data from .variant.bin file (if present) - let expected_variant_data = if let Some(variant_file) = &test_case.variant_file { - Some(self.load_expected_variant_data_by_file(variant_file)?) - } else { - None - }; - // Step 2: Read Parquet file and extract StructArray - let struct_arrays = self.read_parquet_file(test_case)?; - - // Step 3: For now, just verify the structure and basic validation - // TODO: Convert StructArray to VariantArray using cast_to_variant (requires variant crates) - // TODO: Extract values using both VariantArray::value() and variant_get kernel - // TODO: Compare extracted values with expected values - - self.verify_variant_structure(&struct_arrays)?; - - println!( - " {} validation passed for case-{:03}", - match test_case.test_category { - TestCategory::Primitives => "Primitive type", - TestCategory::Arrays => "Array structure", - TestCategory::Objects => "Object structure", - TestCategory::SchemaValidation => "Schema", - TestCategory::Complex => "Complex structure", - _ => "Basic structure", - }, - test_case.case_number + assert_eq!( + variant_array.len(), + variant_data.len(), + "Number of variants in parquet file does not match expected number" ); - - if let Some(data) = expected_variant_data { - println!(" Expected variant data: {} bytes", data.len()); - } - println!( - " Found {} StructArray(s) with variant structure", - struct_arrays.len() - ); - - Ok(()) - } - - /// Run a test case that should produce an error - fn run_error_test(&self, test_case: &VariantTestCase) -> Result<(), Box> { - println!(" Testing error case for case-{:03}", test_case.case_number); - - // Try to read the parquet file - this might fail as expected - match self.read_parquet_file(test_case) { - Ok(struct_arrays) => { - // If file reading succeeds, the error should come during variant processing - println!( - " Parquet file read successfully, expecting error during variant processing" + for (i, expected) in variant_data.iter().enumerate() { + if variant_array.is_null(i) { + assert!( + expected.is_none(), + "Expected null variant at index {i}, but got {:?}", + variant_array.value(i) ); - println!(" Found {} StructArray(s)", struct_arrays.len()); - - // TODO: When variant processing is implemented, capture and validate the error - if let Some(expected_error) = &test_case.error_message { - println!(" Expected error: {}", expected_error); - } - } - Err(e) => { - // File reading failed - check if this matches expected error - println!(" Parquet file reading failed: {}", e); - if let Some(expected_error) = &test_case.error_message { - println!(" Expected error: {}", expected_error); - // TODO: Match actual error against expected error pattern - } + continue; } - } + let actual = variant_array.value(i); + let expected = variant_data[i] + .as_ref() + .expect("Expected non-null variant data"); - Ok(()) - } + let expected = expected.as_variant(); - /// Load expected Variant binary data from .variant.bin file - #[allow(dead_code)] - fn load_expected_variant_data( - &self, - test_case: &VariantTestCase, - ) -> Result, Box> { - if let Some(variant_file) = &test_case.variant_file { - self.load_expected_variant_data_by_file(variant_file) - } else { - Err("No variant file specified for this test case".into()) + // compare the variants (is this the right way to compare?) + assert_eq!(actual, expected, "Variant data mismatch at index {}\n\nactual\n{actual:#?}\n\nexpected\n{expected:#?}", i); } } - /// Load expected Variant binary data by file name - fn load_expected_variant_data_by_file( - &self, - variant_file: &str, - ) -> Result, Box> { - let variant_path = self.test_data_dir.join(variant_file); - - if !variant_path.exists() { - return Err(format!("Variant file not found: {}", variant_path.display()).into()); - } - - let data = fs::read(&variant_path)?; - Ok(data) + /// Parses the expected variant files, returning a vector of `ExpectedVariant` or None + /// if the corresponding entry in `variant_files` is null + fn load_variants(&self) -> Vec> { + let variant_files: Box>> = + match (&self.variant_files, &self.variant_file) { + (Some(files), None) => Box::new(files.iter().map(|f| f.as_ref())), + (None, Some(file)) => Box::new(std::iter::once(Some(file))), + // error cases may not have any variant files + _ => Box::new(std::iter::empty()), + }; + + // load each file + variant_files + .map(|f| { + let v = ExpectedVariant::try_load(&TEST_CASE_DIR.join(f?)) + .expect("Failed to load expected variant"); + Some(v) + }) + .collect() } - /// Read Parquet file and extract StructArray columns - fn read_parquet_file( - &self, - test_case: &VariantTestCase, - ) -> Result, Box> { - let parquet_path = self.test_data_dir.join(&test_case.parquet_file); - - if !parquet_path.exists() { - return Err(format!("Parquet file not found: {}", parquet_path.display()).into()); + /// Load the parquet file, extract the Variant column, and return as a VariantArray + fn load_parquet(&self) -> VariantArray { + let parquet_file = self + .parquet_file + .as_ref() + .expect("parquet_file must be set"); + let path = TEST_CASE_DIR.join(parquet_file); + let file = fs::File::open(&path) + .unwrap_or_else(|e| panic!("cannot open parquet file {path:?}: {e}")); + + let reader = ParquetRecordBatchReaderBuilder::try_new(file) + .and_then(|b| b.build()) + .unwrap_or_else(|e| panic!("Error reading parquet reader for {path:?}: {e}")); + + let mut batches: Vec<_> = reader + .collect::>() + .unwrap_or_else(|e| panic!("Error reading parquet batches for {path:?}: {e}")); + + if batches.is_empty() { + panic!("No parquet batches were found in file {path:?}"); } - - let file = fs::File::open(&parquet_path)?; - let builder = ParquetRecordBatchReaderBuilder::try_new(file)?; - let reader = builder.build()?; - - let mut struct_arrays = Vec::new(); - - for batch_result in reader { - let batch = batch_result?; - - // Look for StructArray columns that could contain Variant data - for column in batch.columns() { - if let Some(struct_array) = column.as_any().downcast_ref::() { - // Check if this StructArray has the expected Variant structure - if self.is_variant_struct_array(struct_array)? { - struct_arrays.push(struct_array.clone()); - } - } - } - } - - if struct_arrays.is_empty() { - return Err("No valid Variant StructArray columns found in Parquet file".into()); - } - - Ok(struct_arrays) - } - - /// Check if a StructArray has the expected Variant structure (metadata, value fields) - fn is_variant_struct_array(&self, struct_array: &StructArray) -> Result> { - let column_names = struct_array.column_names(); - let field_names: Vec<&str> = column_names.to_vec(); - - // Check for required Variant fields - let has_metadata = field_names.contains(&"metadata"); - let has_value = field_names.contains(&"value"); - - Ok(has_metadata && has_value) - } - - /// Verify that StructArrays have the expected Variant structure - fn verify_variant_structure( - &self, - struct_arrays: &[StructArray], - ) -> Result<(), Box> { - for (i, struct_array) in struct_arrays.iter().enumerate() { - if !self.is_variant_struct_array(struct_array)? { - return Err( - format!("StructArray {} does not have expected Variant structure", i).into(), - ); - } - - println!( - " StructArray {} has {} rows and valid Variant structure", - i, - struct_array.len() + if batches.len() > 1 { + panic!( + "Multiple parquet batches were found in file {path:?}, only single batch supported" ); } - - Ok(()) - } -} - -/// Find the shredded_variant test data directory -fn find_shredded_variant_test_data() -> Result> { - // Try environment variable first - if let Ok(dir) = env::var("PARQUET_TEST_DATA") { - let shredded_variant_dir = PathBuf::from(dir).join("shredded_variant"); - if shredded_variant_dir.is_dir() { - return Ok(shredded_variant_dir); - } - } - - // Try relative paths from CARGO_MANIFEST_DIR - let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap_or_else(|_| ".".to_string()); - let candidates = vec![ - PathBuf::from(&manifest_dir).join("../parquet-testing/shredded_variant"), - PathBuf::from(&manifest_dir).join("parquet-testing/shredded_variant"), - PathBuf::from("parquet-testing/shredded_variant"), - ]; - - for candidate in candidates { - if candidate.is_dir() { - return Ok(candidate); - } - } - - Err("Could not find shredded_variant test data directory. Ensure parquet-testing submodule is initialized with PR #90 data.".into()) -} - -/// Load test cases from cases.json -fn load_test_cases(test_data_dir: &Path) -> Result, Box> { - let cases_file = test_data_dir.join("cases.json"); - - if !cases_file.exists() { - return Err(format!("cases.json not found at {}", cases_file.display()).into()); - } - - let content = fs::read_to_string(&cases_file)?; - - // Parse JSON manually since serde is not available as a dependency - parse_cases_json(&content) -} - -/// Parse cases.json manually without serde -fn parse_cases_json(content: &str) -> Result, Box> { - let mut test_cases = Vec::new(); - - // Simple JSON parsing for the specific format we expect - // Format: [{"case_number": 1, "test": "...", "parquet_file": "...", "variant_file": "...", "variant": "..."}, ...] - - let lines: Vec<&str> = content.lines().collect(); - let mut current_case: Option = None; - - for line in lines { - let trimmed = line.trim(); - - if trimmed.contains("\"case_number\"") { - // Extract case number - if let Some(colon_pos) = trimmed.find(':') { - let number_part = &trimmed[colon_pos + 1..]; - if let Some(comma_pos) = number_part.find(',') { - let number_str = number_part[..comma_pos].trim(); - if let Ok(case_number) = number_str.parse::() { - current_case = Some(VariantTestCase { - case_number, - test: None, - parquet_file: String::new(), - variant_file: None, - error_message: None, - variant_description: None, - enabled: false, // Start disabled, enable progressively - test_category: TestCategory::Primitives, // Default, will be updated - }); - } - } - } - } else if trimmed.contains("\"test\"") && current_case.is_some() { - // Extract test name - if let Some(case) = current_case.as_mut() { - if let Some(start) = trimmed.find("\"test\"") { - let after_test = &trimmed[start + 6..]; - if let Some(colon_pos) = after_test.find(':') { - let value_part = &after_test[colon_pos + 1..].trim(); - if let Some(start_quote) = value_part.find('"') { - let after_quote = &value_part[start_quote + 1..]; - if let Some(end_quote) = after_quote.find('"') { - case.test = Some(after_quote[..end_quote].to_string()); - } - } - } - } - } - } else if trimmed.contains("\"parquet_file\"") && current_case.is_some() { - // Extract parquet file name - if let Some(case) = current_case.as_mut() { - if let Some(start_quote) = trimmed.rfind('"') { - let before_quote = &trimmed[..start_quote]; - if let Some(second_quote) = before_quote.rfind('"') { - case.parquet_file = before_quote[second_quote + 1..].to_string(); - } - } - } - } else if trimmed.contains("\"variant_file\"") && current_case.is_some() { - // Extract variant file name - if let Some(case) = current_case.as_mut() { - if let Some(start_quote) = trimmed.rfind('"') { - let before_quote = &trimmed[..start_quote]; - if let Some(second_quote) = before_quote.rfind('"') { - case.variant_file = Some(before_quote[second_quote + 1..].to_string()); - } - } - } - } else if trimmed.contains("\"error_message\"") && current_case.is_some() { - // Extract error message for negative test cases - if let Some(case) = current_case.as_mut() { - if let Some(start_quote) = trimmed.rfind('"') { - let before_quote = &trimmed[..start_quote]; - if let Some(second_quote) = before_quote.rfind('"') { - case.error_message = Some(before_quote[second_quote + 1..].to_string()); - case.test_category = TestCategory::ErrorHandling; - } - } - } - } else if trimmed.contains("\"variant\"") && current_case.is_some() { - // Extract variant description - if let Some(case) = current_case.as_mut() { - if let Some(start_quote) = trimmed.rfind('"') { - let before_quote = &trimmed[..start_quote]; - if let Some(second_quote) = before_quote.rfind('"') { - case.variant_description = - Some(before_quote[second_quote + 1..].to_string()); - } - } - } - } else if trimmed == "}, {" || trimmed == "}" { - // End of current case - if let Some(mut case) = current_case.take() { - if !case.parquet_file.is_empty() - && (case.variant_file.is_some() || case.error_message.is_some()) - { - // Categorize the test based on its name if not already categorized - if case.test_category == TestCategory::Primitives - && case.error_message.is_none() - { - case.test_category = categorize_test(&case.test); - } - test_cases.push(case); - } - } - } - } - - // Handle the last case if the JSON doesn't end with }, { - if let Some(mut case) = current_case { - if !case.parquet_file.is_empty() - && (case.variant_file.is_some() || case.error_message.is_some()) - { - // Categorize the test based on its name if not already categorized - if case.test_category == TestCategory::Primitives && case.error_message.is_none() { - case.test_category = categorize_test(&case.test); - } - test_cases.push(case); - } - } - - Ok(test_cases) -} - -/// Categorize a test based on its test method name -fn categorize_test(test_name: &Option) -> TestCategory { - match test_name.as_ref().map(|s| s.as_str()) { - Some(name) if name.contains("Array") => TestCategory::Arrays, - Some(name) if name.contains("Object") => TestCategory::Objects, - Some(name) if name.contains("Unshredded") => TestCategory::SchemaValidation, - Some(name) if name.contains("Mixed") || name.contains("Nested") => TestCategory::Complex, - Some(name) if name.contains("Primitives") => TestCategory::Primitives, - _ => TestCategory::Primitives, // Default fallback - } -} - -// Individual test functions with #[ignore] for progressive enablement -// Following the exact pattern from the PR description - -#[test] -#[ignore] // Enable once parquet-variant dependencies are added -fn test_variant_integration_case_001() { - let harness = VariantIntegrationHarness::new().expect("Failed to create test harness"); - - let test_case = harness - .test_cases - .iter() - .find(|case| case.case_number == 1) - .expect("case-001 not found"); - - harness - .run_single_test(test_case) - .expect("case-001 should pass"); -} - -#[test] -#[ignore] // Enable once parquet-variant dependencies are added -fn test_variant_integration_case_002() { - let harness = VariantIntegrationHarness::new().expect("Failed to create test harness"); - - let test_case = harness - .test_cases - .iter() - .find(|case| case.case_number == 2) - .expect("case-002 not found"); - - harness - .run_single_test(test_case) - .expect("case-002 should pass"); -} - -#[test] -#[ignore] // Enable once parquet-variant dependencies are added -fn test_variant_integration_case_004() { - let harness = VariantIntegrationHarness::new().expect("Failed to create test harness"); - - let test_case = harness - .test_cases - .iter() - .find(|case| case.case_number == 4) - .expect("case-004 not found"); - - harness - .run_single_test(test_case) - .expect("case-004 should pass"); -} - -#[test] -#[ignore] // Enable once parquet-variant dependencies are added -fn test_variant_integration_case_005() { - let harness = VariantIntegrationHarness::new().expect("Failed to create test harness"); - - let test_case = harness - .test_cases - .iter() - .find(|case| case.case_number == 5) - .expect("case-005 not found"); - - harness - .run_single_test(test_case) - .expect("case-005 should pass"); -} - -#[test] -#[ignore] // Enable once parquet-variant dependencies are added -fn test_variant_integration_case_006() { - let harness = VariantIntegrationHarness::new().expect("Failed to create test harness"); - - let test_case = harness - .test_cases - .iter() - .find(|case| case.case_number == 6) - .expect("case-006 not found"); - - harness - .run_single_test(test_case) - .expect("case-006 should pass"); -} - -// Add more individual test cases for key scenarios -#[test] -#[ignore] // Enable once parquet-variant dependencies are added -fn test_variant_integration_case_007() { - let harness = VariantIntegrationHarness::new().expect("Failed to create test harness"); - - let test_case = harness - .test_cases - .iter() - .find(|case| case.case_number == 7) - .expect("case-007 not found"); - - harness - .run_single_test(test_case) - .expect("case-007 should pass"); -} - -#[test] -#[ignore] // Enable once parquet-variant dependencies are added -fn test_variant_integration_case_008() { - let harness = VariantIntegrationHarness::new().expect("Failed to create test harness"); - - let test_case = harness - .test_cases - .iter() - .find(|case| case.case_number == 8) - .expect("case-008 not found"); - - harness - .run_single_test(test_case) - .expect("case-008 should pass"); -} - -#[test] -#[ignore] // Enable once parquet-variant dependencies are added -fn test_variant_integration_case_009() { - let harness = VariantIntegrationHarness::new().expect("Failed to create test harness"); - - let test_case = harness - .test_cases - .iter() - .find(|case| case.case_number == 9) - .expect("case-009 not found"); - - harness - .run_single_test(test_case) - .expect("case-009 should pass"); -} - -#[test] -#[ignore] // Enable once parquet-variant dependencies are added -fn test_variant_integration_case_010() { - let harness = VariantIntegrationHarness::new().expect("Failed to create test harness"); - - let test_case = harness - .test_cases - .iter() - .find(|case| case.case_number == 10) - .expect("case-010 not found"); - - harness - .run_single_test(test_case) - .expect("case-010 should pass"); -} - -// Specific tests for error cases that should be enabled to test error handling -#[test] -#[ignore] // Enable to test error handling - case with conflicting value and typed_value -fn test_variant_integration_error_case_040() { - let harness = VariantIntegrationHarness::new().expect("Failed to create test harness"); - - let test_case = harness - .test_cases - .iter() - .find(|case| case.case_number == 40) - .expect("case-040 not found"); - - // This should handle the error gracefully - harness - .run_single_test(test_case) - .expect("Error case should be handled gracefully"); -} - -#[test] -#[ignore] // Enable to test error handling - case with value and typed_value conflict -fn test_variant_integration_error_case_042() { - let harness = VariantIntegrationHarness::new().expect("Failed to create test harness"); - - let test_case = harness - .test_cases - .iter() - .find(|case| case.case_number == 42) - .expect("case-042 not found"); - - harness - .run_single_test(test_case) - .expect("Error case should be handled gracefully"); -} - -// Test that runs all cases by category -#[test] -#[ignore] // Enable when ready to run all tests -fn test_variant_integration_all_cases() { - let harness = VariantIntegrationHarness::new().expect("Failed to create test harness"); - harness - .run_all_tests() - .expect("Integration tests should pass"); -} - -#[test] -#[ignore] // Enable to test primitive type cases -fn test_variant_integration_primitives_only() { - let harness = VariantIntegrationHarness::new().expect("Failed to create test harness"); - - let primitive_cases: Vec<_> = harness - .test_cases - .iter() - .filter(|case| case.test_category == TestCategory::Primitives) - .collect(); - - println!("Testing {} primitive cases", primitive_cases.len()); - - let mut passed = 0; - let mut failed = 0; - - for test_case in primitive_cases { - match harness.run_single_test(test_case) { - Ok(()) => { - println!("PASSED: case-{:03}", test_case.case_number); - passed += 1; - } - Err(e) => { - println!("FAILED: case-{:03} - {}", test_case.case_number, e); - failed += 1; - } - } - } - - println!("Primitive tests: {} passed, {} failed", passed, failed); - assert!(failed == 0, "All primitive tests should pass"); -} - -#[test] -#[ignore] // Enable to test array cases -fn test_variant_integration_arrays_only() { - let harness = VariantIntegrationHarness::new().expect("Failed to create test harness"); - - let array_cases: Vec<_> = harness - .test_cases - .iter() - .filter(|case| case.test_category == TestCategory::Arrays) - .collect(); - - println!("Testing {} array cases", array_cases.len()); - - for test_case in array_cases { - println!( - "Testing case-{:03}: {}", - test_case.case_number, - test_case.test.as_deref().unwrap_or("unknown") - ); - match harness.run_single_test(test_case) { - Ok(()) => println!(" PASSED"), - Err(e) => println!(" FAILED: {}", e), - } - } -} - -#[test] -#[ignore] // Enable to test object cases -fn test_variant_integration_objects_only() { - let harness = VariantIntegrationHarness::new().expect("Failed to create test harness"); - - let object_cases: Vec<_> = harness - .test_cases - .iter() - .filter(|case| case.test_category == TestCategory::Objects) - .collect(); - - println!("Testing {} object cases", object_cases.len()); - - for test_case in object_cases { - println!( - "Testing case-{:03}: {}", - test_case.case_number, - test_case.test.as_deref().unwrap_or("unknown") - ); - match harness.run_single_test(test_case) { - Ok(()) => println!(" PASSED"), - Err(e) => println!(" FAILED: {}", e), - } + let batch = batches.swap_remove(0); + + // The schema is "id", "var" for the id and variant columns + // TODO: support the actual parquet logical type annotation somehow + let var = batch + .column_by_name("var") + .unwrap_or_else(|| panic!("No 'var' column found in parquet file {path:?}")); + + // the values are read as + // * StructArray + // but VariantArray needs them as + // * StructArray + // + // So cast them to get the right type. Hack Alert: the parquet reader + // should read them directly as BinaryView + let var = cast_to_binary_view_arrays(var); + + VariantArray::try_new(var).unwrap_or_else(|e| { + panic!("Error converting StructArray to VariantArray for {path:?}: {e}") + }) } } -#[test] -#[ignore] // Enable to test error handling cases -fn test_variant_integration_error_cases_only() { - let harness = VariantIntegrationHarness::new().expect("Failed to create test harness"); - - let error_cases: Vec<_> = harness - .test_cases - .iter() - .filter(|case| case.test_category == TestCategory::ErrorHandling) - .collect(); - - println!("Testing {} error cases", error_cases.len()); - - for test_case in error_cases { - println!( - "Testing error case-{:03}: {}", - test_case.case_number, - test_case.test.as_deref().unwrap_or("unknown") - ); - println!( - " Expected error: {}", - test_case.error_message.as_deref().unwrap_or("none") - ); - match harness.run_single_test(test_case) { - Ok(()) => println!(" Error case handled gracefully"), - Err(e) => println!(" Error case processing failed: {}", e), - } - } +fn cast_to_binary_view_arrays(array: &ArrayRef) -> ArrayRef { + let new_type = map_type(array.data_type()); + cast(array, &new_type).unwrap_or_else(|e| { + panic!( + "Error casting array from {:?} to {:?}: {e}", + array.data_type(), + new_type + ) + }) } -// Test that actually reads and validates parquet file structure -#[test] -fn test_variant_structure_validation() { - // This test attempts to read actual parquet files and validate their structure - println!("Testing parquet file structure validation"); - - match VariantIntegrationHarness::new() { - Ok(harness) => { - println!( - "Successfully loaded test harness with {} test cases", - harness.test_cases.len() - ); - - // Test structural validation on a few test cases - let test_cases_to_validate = [1, 2, 4, 5]; - let mut validated_cases = 0; - - for case_number in test_cases_to_validate { - if let Some(test_case) = harness - .test_cases - .iter() - .find(|c| c.case_number == case_number) - { - println!( - "\nValidating case-{:03}: {}", - case_number, test_case.parquet_file - ); - - match harness.run_single_test(test_case) { - Ok(()) => { - println!(" Structure validation PASSED for case-{:03}", case_number); - validated_cases += 1; - } - Err(e) => { - println!( - " Structure validation FAILED for case-{:03}: {}", - case_number, e - ); - // Don't fail the test for structural issues during development - } - } - } - } - - println!( - "\nValidated {} test case structures successfully", - validated_cases - ); - } - Err(e) => { - println!("Could not find shredded_variant test data: {}", e); - println!("This is expected if parquet-testing submodule is not at PR #90 branch"); - } - } -} - -// Comprehensive test that shows test coverage and categorization -#[test] -fn test_variant_integration_comprehensive_analysis() { - // This test analyzes the comprehensive shredded_variant test data from PR #90 - println!("Running comprehensive analysis of variant integration test data"); - - match VariantIntegrationHarness::new() { - Ok(harness) => { - println!( - "Successfully loaded test harness with {} test cases", - harness.test_cases.len() - ); - - // Analyze test breakdown by category - let mut category_counts = std::collections::HashMap::new(); - let mut error_cases = Vec::new(); - let mut success_cases = Vec::new(); - - for test_case in &harness.test_cases { - *category_counts - .entry(test_case.test_category.clone()) - .or_insert(0) += 1; - - if test_case.error_message.is_some() { - error_cases.push(test_case); - } else { - success_cases.push(test_case); - } - } - - println!("\nTest Coverage Analysis:"); - println!( - " Primitives: {}", - category_counts.get(&TestCategory::Primitives).unwrap_or(&0) - ); - println!( - " Arrays: {}", - category_counts.get(&TestCategory::Arrays).unwrap_or(&0) - ); - println!( - " Objects: {}", - category_counts.get(&TestCategory::Objects).unwrap_or(&0) - ); - println!( - " Error Handling: {}", - category_counts - .get(&TestCategory::ErrorHandling) - .unwrap_or(&0) - ); - println!( - " Schema Validation: {}", - category_counts - .get(&TestCategory::SchemaValidation) - .unwrap_or(&0) - ); - println!( - " Complex: {}", - category_counts.get(&TestCategory::Complex).unwrap_or(&0) - ); - println!(" Total Success Cases: {}", success_cases.len()); - println!(" Total Error Cases: {}", error_cases.len()); - - // Test a representative sample from each category - let test_cases_to_check = [1, 2, 4, 5, 6]; - let mut validated_cases = 0; - - println!("\nValidating representative test cases:"); - for case_number in test_cases_to_check { - if let Some(test_case) = harness - .test_cases - .iter() - .find(|c| c.case_number == case_number) - { - println!( - "Case-{:03} ({:?}): {} -> {}", - case_number, - test_case.test_category, - test_case.parquet_file, - test_case - .variant_file - .as_deref() - .unwrap_or("no variant file") - ); - - // Verify files exist - let parquet_path = harness.test_data_dir.join(&test_case.parquet_file); - assert!( - parquet_path.exists(), - "Parquet file should exist: {}", - parquet_path.display() - ); - - if let Some(variant_file) = &test_case.variant_file { - let variant_path = harness.test_data_dir.join(variant_file); - assert!( - variant_path.exists(), - "Variant file should exist: {}", - variant_path.display() - ); - - if let Ok(variant_data) = fs::read(&variant_path) { - println!(" Variant data: {} bytes", variant_data.len()); - } - } - - validated_cases += 1; - } - } - - println!("\nError test cases found:"); - for error_case in error_cases.iter().take(3) { - println!( - " Case-{:03}: {} - {}", - error_case.case_number, - error_case.test.as_deref().unwrap_or("unknown"), - error_case - .error_message - .as_deref() - .unwrap_or("no error message") - ); - } - - assert!( - validated_cases >= 3, - "Should validate at least 3 test cases" - ); - assert!( - !harness.test_cases.is_empty(), - "Should have loaded test cases" - ); - println!("\nComprehensive analysis completed successfully!"); - } - Err(e) => { - println!("Could not find shredded_variant test data: {}", e); - println!("This is expected if parquet-testing submodule is not at PR #90 branch"); - - // Don't fail the test if data isn't available, just report it - // This allows the test to work in different environments +/// replaces all instances of Binary with BinaryView in a DataType +fn map_type(data_type: &DataType) -> DataType { + match data_type { + DataType::Binary => DataType::BinaryView, + DataType::List(field) => { + let new_field = field + .as_ref() + .clone() + .with_data_type(map_type(field.data_type())); + DataType::List(Arc::new(new_field)) } - } -} - -// Test to verify error case handling works -#[test] -fn test_variant_integration_error_case_handling() { - // This test demonstrates that error cases are properly detected and handled - println!("Testing error case handling with actual error files"); - - match VariantIntegrationHarness::new() { - Ok(harness) => { - println!( - "Successfully loaded test harness with {} test cases", - harness.test_cases.len() - ); - - // Find and test a few error cases - let error_cases: Vec<_> = harness - .test_cases + DataType::Struct(fields) => { + let new_fields: Fields = fields .iter() - .filter(|case| case.test_category == TestCategory::ErrorHandling) - .take(3) + .map(|f| { + let new_field = f.as_ref().clone().with_data_type(map_type(f.data_type())); + Arc::new(new_field) + }) .collect(); - - println!("Found {} error cases for testing", error_cases.len()); - - for error_case in &error_cases { - println!( - "\nTesting error case-{:03}: {}", - error_case.case_number, - error_case.test.as_deref().unwrap_or("unknown") - ); - println!( - " Expected error: {}", - error_case - .error_message - .as_deref() - .unwrap_or("no error message") - ); - - // Verify the parquet file exists (error cases should still have readable files) - let parquet_path = harness.test_data_dir.join(&error_case.parquet_file); - assert!( - parquet_path.exists(), - "Error case parquet file should exist: {}", - parquet_path.display() - ); - - // Run the error case test (should handle gracefully) - match harness.run_single_test(error_case) { - Ok(()) => println!(" Error case handled gracefully"), - Err(e) => println!(" Error case processing issue: {}", e), - } - } - - assert!(!error_cases.is_empty(), "Should have found error cases"); - println!("\nError case handling test completed successfully!"); - } - Err(e) => { - println!("Could not find shredded_variant test data: {}", e); - println!("This is expected if parquet-testing submodule is not at PR #90 branch"); - } - } -} - -// Working test that demonstrates the harness functionality -#[test] -fn test_variant_integration_with_shredded_variant_data() { - // This test uses the comprehensive shredded_variant test data from PR #90 - println!("Running basic integration test with shredded variant test data"); - - match VariantIntegrationHarness::new() { - Ok(harness) => { - println!( - "Successfully loaded test harness with {} test cases", - harness.test_cases.len() - ); - - // Test a few basic cases to verify the framework works - let test_cases_to_check = [1, 2, 4, 5, 6]; - let mut found_cases = 0; - - for case_number in test_cases_to_check { - if let Some(test_case) = harness - .test_cases - .iter() - .find(|c| c.case_number == case_number) - { - println!( - "Found case-{:03}: {} -> {}", - case_number, - test_case.parquet_file, - test_case - .variant_file - .as_deref() - .unwrap_or("no variant file") - ); - found_cases += 1; - - // Verify files exist - let parquet_path = harness.test_data_dir.join(&test_case.parquet_file); - assert!( - parquet_path.exists(), - "Parquet file should exist: {}", - parquet_path.display() - ); - - if let Some(variant_file) = &test_case.variant_file { - let variant_path = harness.test_data_dir.join(variant_file); - assert!( - variant_path.exists(), - "Variant file should exist: {}", - variant_path.display() - ); - - if let Ok(variant_data) = fs::read(&variant_path) { - println!(" Variant data: {} bytes", variant_data.len()); - } - } - } - } - - assert!(found_cases >= 3, "Should find at least 3 test cases"); - println!("Successfully validated {} test cases", found_cases); - } - Err(e) => { - println!("Could not find shredded_variant test data: {}", e); - println!("This is expected if parquet-testing submodule is not at PR #90 branch"); - - // Don't fail the test if data isn't available, just report it - // This allows the test to work in different environments + DataType::Struct(new_fields) } + _ => data_type.clone(), } } -// Fallback test using existing variant test data if shredded_variant is not available -#[test] -fn test_variant_integration_with_existing_data() { - // This test uses the existing variant test data in parquet-testing/variant/ - // as a fallback until the shredded_variant data from PR #90 is available - - println!("Running fallback test with existing variant test data"); - - // Try to find existing variant test data - let variant_dir = find_existing_variant_test_data(); - - match variant_dir { - Ok(dir) => { - println!("Found existing variant test data at: {}", dir.display()); - - // List available test files - if let Ok(entries) = fs::read_dir(&dir) { - let mut metadata_files = Vec::new(); - for entry in entries.flatten() { - if let Some(name) = entry.file_name().to_str() { - if name.ends_with(".metadata") { - metadata_files.push(name.to_string()); - } - } - } - - println!("Found {} metadata files for testing", metadata_files.len()); - assert!( - !metadata_files.is_empty(), - "Should find at least some metadata files" - ); - - // Test loading a few basic cases - for metadata_file in metadata_files.iter().take(3) { - let case_name = metadata_file.strip_suffix(".metadata").unwrap(); - match test_load_existing_variant_case(&dir, case_name) { - Ok(()) => println!("Successfully loaded variant case: {}", case_name), - Err(e) => println!("Failed to load variant case {}: {}", case_name, e), - } - } - } - } - Err(e) => { - println!("Could not find variant test data: {}", e); - println!("This is expected if parquet-testing submodule is not initialized"); - } - } +/// Variant value loaded from .variant.bin file +#[derive(Debug, Clone)] +struct ExpectedVariant { + data: Vec, + data_offset: usize, } -/// Find existing variant test data directory -fn find_existing_variant_test_data() -> Result> { - if let Ok(dir) = env::var("PARQUET_TEST_DATA") { - let variant_dir = PathBuf::from(dir).join("../variant"); - if variant_dir.is_dir() { - return Ok(variant_dir); - } +impl ExpectedVariant { + fn try_load(path: &Path) -> Result { + // "Each `*.variant.bin` file contains a single variant serialized + // by concatenating the serialized bytes of the variant metadata + // followed by the serialized bytes of the variant value." + let data = fs::read(path).map_err(|e| format!("cannot read variant file {path:?}: {e}"))?; + let metadata = VariantMetadata::try_new(&data) + .map_err(|e| format!("cannot parse variant metadata from {path:?}: {e}"))?; + + let data_offset = metadata.size(); + Ok(Self { data, data_offset }) } - let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap_or_else(|_| ".".to_string()); - let candidates = vec![ - PathBuf::from(&manifest_dir).join("../parquet-testing/variant"), - PathBuf::from(&manifest_dir).join("parquet-testing/variant"), - ]; - - for candidate in candidates { - if candidate.is_dir() { - return Ok(candidate); - } + fn as_variant(&self) -> Variant<'_, '_> { + let metadata = &self.data[0..self.data_offset]; + let value = &self.data[self.data_offset..]; + Variant::try_new(metadata, value).expect("Invalid variant data") } - - Err("Could not find existing variant test data directory".into()) } -/// Test loading a single variant case from existing test data -fn test_load_existing_variant_case( - variant_dir: &Path, - case_name: &str, -) -> Result<(), Box> { - let metadata_path = variant_dir.join(format!("{}.metadata", case_name)); - let value_path = variant_dir.join(format!("{}.value", case_name)); +static TEST_CASE_DIR: LazyLock = LazyLock::new(|| { + PathBuf::from(parquet_test_data()) + .join("..") + .join("shredded_variant") +}); - if !metadata_path.exists() || !value_path.exists() { - return Err(format!("Missing files for case: {}", case_name).into()); +/// All tests +static ALL_CASES: LazyLock>> = LazyLock::new(|| { + let cases_file = TEST_CASE_DIR.join("cases.json"); + + if !cases_file.exists() { + return Err(format!("cases.json not found at {}", cases_file.display())); } - let _metadata = fs::read(&metadata_path)?; - let _value = fs::read(&value_path)?; + let content = fs::read_to_string(&cases_file) + .map_err(|e| format!("cannot read cases file {cases_file:?}: {e}"))?; - // TODO: Parse variant when parquet_variant crate is available - // let _variant = Variant::try_new(&metadata, &value)?; + serde_json::from_str::>(content.as_str()) + .map_err(|e| format!("cannot parse json from {cases_file:?}: {e}")) +}); - Ok(()) +// return a reference to the static ALL_CASES, or panic if loading failed +fn all_cases() -> &'static [VariantTestCase] { + ALL_CASES.as_ref().unwrap() } From 136c2d857924e7d76c570095bcef98fef73f908c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 12 Sep 2025 06:50:34 -0400 Subject: [PATCH 3/8] fix docs --- parquet-variant/src/variant/metadata.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/parquet-variant/src/variant/metadata.rs b/parquet-variant/src/variant/metadata.rs index 14d048631de7..7944e45a4df5 100644 --- a/parquet-variant/src/variant/metadata.rs +++ b/parquet-variant/src/variant/metadata.rs @@ -310,7 +310,11 @@ impl<'m> VariantMetadata<'m> { self.header.offset_size.unpack_u32(bytes, i) } - /// Returns the last byte offset of the metadata dictionary, which is also the total size of + /// Returns the total size, in bytes, of the metadata. + /// + /// Note this value may be smaller than what was passed to [`Self::new`] or + /// [`Self::try_new`] if the input was larger than necessary to encode the + /// metadata dictionary. pub fn size(&self) -> usize { self.bytes.len() } From 675f88b3a11605a98aba284d9555d75bb06ab715 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 12 Sep 2025 06:51:54 -0400 Subject: [PATCH 4/8] clarify comments --- parquet/tests/variant_integration.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/parquet/tests/variant_integration.rs b/parquet/tests/variant_integration.rs index fe863462a660..0c48eb7839b1 100644 --- a/parquet/tests/variant_integration.rs +++ b/parquet/tests/variant_integration.rs @@ -63,8 +63,10 @@ macro_rules! variant_test_case { } // Generate test functions for each case -// Note that case 3 is empty in cases.json, so we skip it here -// Only cases 40, 42, 87, 127 and 128 are expected to fail, the rest should pass +// Notes +// - case 3 is empty in cases.json for some reason +// - cases 40, 42, 87, 127 and 128 are expected to fail always (they include invalid variants) +// - the remaining cases are expected to (eventually) pass variant_test_case!(1, "Unsupported typed_value type: List("); variant_test_case!(2, "Unsupported typed_value type: List("); From 60d9425c0ecd427b836170514ce0adb60052f12a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 12 Sep 2025 07:17:56 -0400 Subject: [PATCH 5/8] Add issue links --- parquet/tests/variant_integration.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/parquet/tests/variant_integration.rs b/parquet/tests/variant_integration.rs index 0c48eb7839b1..653b1173d969 100644 --- a/parquet/tests/variant_integration.rs +++ b/parquet/tests/variant_integration.rs @@ -77,6 +77,7 @@ variant_test_case!(2, "Unsupported typed_value type: List("); // }, // ``` variant_test_case!(3, "parquet_file must be set"); +// https://github.com/apache/arrow-rs/issues/8329 variant_test_case!(4, "Unsupported typed_value type: Boolean"); variant_test_case!(5, "Unsupported typed_value type: Boolean"); variant_test_case!(6); @@ -91,8 +92,10 @@ variant_test_case!(14); variant_test_case!(15); variant_test_case!(16); variant_test_case!(17); +// https://github.com/apache/arrow-rs/issues/8330 variant_test_case!(18, "Unsupported typed_value type: Date32"); variant_test_case!(19, "Unsupported typed_value type: Date32"); +// https://github.com/apache/arrow-rs/issues/8331 variant_test_case!( 20, "Unsupported typed_value type: Timestamp(Microsecond, Some(\"UTC\"))" @@ -109,15 +112,19 @@ variant_test_case!( 23, "Unsupported typed_value type: Timestamp(Microsecond, None)" ); +// https://github.com/apache/arrow-rs/issues/8332 variant_test_case!(24, "Unsupported typed_value type: Decimal128(9, 4)"); variant_test_case!(25, "Unsupported typed_value type: Decimal128(9, 4)"); variant_test_case!(26, "Unsupported typed_value type: Decimal128(18, 9)"); variant_test_case!(27, "Unsupported typed_value type: Decimal128(18, 9)"); variant_test_case!(28, "Unsupported typed_value type: Decimal128(38, 9)"); variant_test_case!(29, "Unsupported typed_value type: Decimal128(38, 9)"); +// https://github.com/apache/arrow-rs/issues/8333 variant_test_case!(30, "Unsupported typed_value type: BinaryView"); variant_test_case!(31, "Unsupported typed_value type: Utf8"); +// https://github.com/apache/arrow-rs/issues/8334 variant_test_case!(32, "Unsupported typed_value type: Time64(Microsecond)"); +// https://github.com/apache/arrow-rs/issues/8331 variant_test_case!( 33, "Unsupported typed_value type: Timestamp(Nanosecond, Some(\"UTC\"))" @@ -134,7 +141,9 @@ variant_test_case!( 36, "Unsupported typed_value type: Timestamp(Nanosecond, None)" ); +// https://github.com/apache/arrow-rs/issues/8335 variant_test_case!(37, "Unsupported typed_value type: FixedSizeBinary(16)"); +// https://github.com/apache/arrow-rs/issues/8336 variant_test_case!(38, "Unsupported typed_value type: Struct("); variant_test_case!(39); // Is an error case (should be failing as the expected error message indicates) @@ -145,8 +154,10 @@ variant_test_case!( 42, "Expected an error 'Invalid variant, conflicting value and typed_value`, but got no error" ); +// https://github.com/apache/arrow-rs/issues/8336 variant_test_case!(43, "Unsupported typed_value type: Struct([Field"); variant_test_case!(44, "Unsupported typed_value type: Struct([Field"); +// https://github.com/apache/arrow-rs/issues/8337 variant_test_case!(45, "Unsupported typed_value type: List(Field"); variant_test_case!(46, "Unsupported typed_value type: Struct([Field"); variant_test_case!(47); @@ -185,8 +196,10 @@ variant_test_case!(79); variant_test_case!(80); variant_test_case!(81); variant_test_case!(82); +// https://github.com/apache/arrow-rs/issues/8336 variant_test_case!(83, "Unsupported typed_value type: Struct([Field"); variant_test_case!(84, "Unsupported typed_value type: Struct([Field"); +// https://github.com/apache/arrow-rs/issues/8337 variant_test_case!(85, "Unsupported typed_value type: List(Field"); variant_test_case!(86, "Unsupported typed_value type: List(Field"); // Is an error case (should be failing as the expected error message indicates) From 94a9f2dc3287c4aa6ced99468dadfc3e7346b389 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 15 Sep 2025 16:24:36 -0400 Subject: [PATCH 6/8] Update tests --- parquet/tests/variant_integration.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/tests/variant_integration.rs b/parquet/tests/variant_integration.rs index 653b1173d969..7c8dffc11d9c 100644 --- a/parquet/tests/variant_integration.rs +++ b/parquet/tests/variant_integration.rs @@ -78,8 +78,8 @@ variant_test_case!(2, "Unsupported typed_value type: List("); // ``` variant_test_case!(3, "parquet_file must be set"); // https://github.com/apache/arrow-rs/issues/8329 -variant_test_case!(4, "Unsupported typed_value type: Boolean"); -variant_test_case!(5, "Unsupported typed_value type: Boolean"); +variant_test_case!(4); +variant_test_case!(5); variant_test_case!(6); variant_test_case!(7); variant_test_case!(8); From 2f4aacf0c7d4bbbb39228117af885a61a59beebd Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 16 Sep 2025 14:38:30 -0400 Subject: [PATCH 7/8] Return Variant::Uuid for FixedSizeBianry of 16 when possible --- parquet-variant-compute/src/variant_array.rs | 13 ++++++++++--- parquet-variant/src/variant.rs | 7 +++++-- parquet/tests/variant_integration.rs | 3 +-- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index c3f3ad54131c..9ecc631da627 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -17,6 +17,7 @@ //! [`VariantArray`] implementation +use crate::type_conversion::primitive_conversion_single_value; use arrow::array::{Array, ArrayData, ArrayRef, AsArray, BinaryViewArray, StructArray}; use arrow::buffer::NullBuffer; use arrow::datatypes::{ @@ -24,12 +25,11 @@ use arrow::datatypes::{ UInt32Type, UInt64Type, UInt8Type, }; use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields}; +use parquet_variant::Uuid; use parquet_variant::Variant; use std::any::Any; use std::sync::Arc; -use crate::type_conversion::primitive_conversion_single_value; - /// An array of Parquet [`Variant`] values /// /// A [`VariantArray`] wraps an Arrow [`StructArray`] that stores the underlying @@ -556,8 +556,15 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, ' let value = boolean_array.value(index); Variant::from(value) } - DataType::FixedSizeBinary(_) => { + DataType::FixedSizeBinary(binary_len) => { let array = typed_value.as_fixed_size_binary(); + // Try to treat 16 byte FixedSizeBinary as UUID + let value = array.value(index); + if *binary_len == 16 { + if let Ok(uuid) = Uuid::from_slice(&value) { + return Variant::from(uuid); + } + } let value = array.value(index); Variant::from(value) } diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index cc4c3bcadd66..faaab94bc3fd 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -19,6 +19,11 @@ pub use self::decimal::{VariantDecimal16, VariantDecimal4, VariantDecimal8}; pub use self::list::VariantList; pub use self::metadata::{VariantMetadata, EMPTY_VARIANT_METADATA, EMPTY_VARIANT_METADATA_BYTES}; pub use self::object::VariantObject; + +// Publically export types used in the API +pub use half::f16; +pub use uuid::Uuid; + use crate::decoder::{ self, get_basic_type, get_primitive_type, VariantBasicType, VariantPrimitiveType, }; @@ -28,8 +33,6 @@ use std::ops::Deref; use arrow_schema::ArrowError; use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Timelike, Utc}; -use half::f16; -use uuid::Uuid; mod decimal; mod list; diff --git a/parquet/tests/variant_integration.rs b/parquet/tests/variant_integration.rs index 7c8dffc11d9c..6a586e013ef5 100644 --- a/parquet/tests/variant_integration.rs +++ b/parquet/tests/variant_integration.rs @@ -141,8 +141,7 @@ variant_test_case!( 36, "Unsupported typed_value type: Timestamp(Nanosecond, None)" ); -// https://github.com/apache/arrow-rs/issues/8335 -variant_test_case!(37, "Unsupported typed_value type: FixedSizeBinary(16)"); +variant_test_case!(37); // https://github.com/apache/arrow-rs/issues/8336 variant_test_case!(38, "Unsupported typed_value type: Struct("); variant_test_case!(39); From cdb8943f0846068ac684306085bccf090493b688 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 16 Sep 2025 14:38:51 -0400 Subject: [PATCH 8/8] clippy --- parquet-variant-compute/src/variant_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 9ecc631da627..e87d03f88c5b 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -561,7 +561,7 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, ' // Try to treat 16 byte FixedSizeBinary as UUID let value = array.value(index); if *binary_len == 16 { - if let Ok(uuid) = Uuid::from_slice(&value) { + if let Ok(uuid) = Uuid::from_slice(value) { return Variant::from(uuid); } }