From e3a0b508876bc303f806b12810f13227463ef65d Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Wed, 20 Aug 2025 12:28:59 -0700
Subject: [PATCH 01/46] custom PageLocation decoder for speed

---
 parquet/src/file/page_index/offset_index.rs | 64 ++++++++++++++++++++-
 parquet/src/parquet_thrift.rs | 13 +++++
 2 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs
index d4c196a3ae8b..791f61d37eae 100644
--- a/parquet/src/file/page_index/offset_index.rs
+++ b/parquet/src/file/page_index/offset_index.rs
@@ -25,7 +25,7 @@ use crate::{
     thrift_struct,
 };
 
-thrift_struct!(
+/*thrift_struct!(
 /// Page location information for [`OffsetIndexMetaData`]
 pub struct PageLocation {
   /// Offset of the page in the file
@@ -37,7 +37,67 @@ pub struct PageLocation {
   /// (repetition_level = 0).
   3: required i64 first_row_index
 }
-);
+);*/
+
+// hand coding this one because it is very time critical
+
+/// Page location information for [`OffsetIndexMetaData`]
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct PageLocation {
+    /// Offset of the page in the file
+    pub offset: i64,
+    /// Size of the page, including header. Sum of compressed_page_size and header
+    pub compressed_page_size: i32,
+    /// Index within the RowGroup of the first row of the page. When an
+    /// OffsetIndex is present, pages must begin on row boundaries
+    /// (repetition_level = 0).
+    pub first_row_index: i64,
+}
+
+// Note: this will fail if the fields are either out of order, or if a suboptimal
+// encoder doesn't use field deltas. If that ever occurs, remove this code and
+// revert to the commented out thrift_struct!() implementation above.
+impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for PageLocation {
+    type Error = ParquetError;
+    fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result<Self, Self::Error> {
+        // there are 3 fields, all mandatory, so all field deltas should be 1
+        let (field_type, delta) = prot.read_field_header()?;
+        if delta != 1 || field_type != FieldType::I64 as u8 {
+            return Err(general_err!("error reading PageLocation::offset"));
+        }
+        let offset = prot.read_i64()?;
+
+        let (field_type, delta) = prot.read_field_header()?;
+        if delta != 1 || field_type != FieldType::I32 as u8 {
+            return Err(general_err!(
+                "error reading PageLocation::compressed_page_size"
+            ));
+        }
+        let compressed_page_size = prot.read_i32()?;
+
+        let (field_type, delta) = prot.read_field_header()?;
+        if delta != 1 || field_type != FieldType::I64 as u8 {
+            return Err(general_err!("error reading PageLocation::first_row_index"));
+        }
+        let first_row_index = prot.read_i64()?;
+
+        // This loop slows things down a bit, but it's an acceptable price to allow
+        // forwards compatibility. We could instead assert the next field is Stop.
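+        // For reference: a Thrift compact-protocol short-form field header is a single
+        // byte holding the field-id delta in the high nibble and the element type in the
+        // low nibble, so the three headers read above are expected to be 0x16, 0x15 and
+        // 0x16 on the wire (delta 1 with types I64, I32 and I64). The loop below only has
+        // to skip any trailing fields a newer writer might append before the Stop byte.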
+ loop { + let (field_type, _) = prot.read_field_header()?; + if field_type == FieldType::Stop as u8 { + break; + } + prot.skip(FieldType::try_from(field_type)?)?; + } + + Ok(Self { + offset, + compressed_page_size, + first_row_index, + }) + } +} impl From<&crate::format::PageLocation> for PageLocation { fn from(value: &crate::format::PageLocation) -> Self { diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index 7f5fe475217f..2dff498372f0 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -244,6 +244,19 @@ impl<'b, 'a: 'b> ThriftCompactInputProtocol<'a> { Ok(()) } + // This is a specialized version of read_field_begin, solely for use in parsing + // PageLocation structs in the offset index. This function assumes that the delta + // field will always be less than 0xf, fields will be in order, and no boolean fields + // will be read. This also skips validation of the field type. + // + // Returns a tuple of (field_type, field_delta) + pub(crate) fn read_field_header(&mut self) -> Result<(u8, u8)> { + let field_type = self.read_byte()?; + let field_delta = (field_type & 0xf0) >> 4; + let field_type = field_type & 0xf; + Ok((field_type, field_delta)) + } + pub(crate) fn read_field_begin(&mut self) -> Result { // we can read at least one byte, which is: // - the type From 71d3859642701c3f90f3a16f5ae34582f5c00b85 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 20 Aug 2025 12:55:50 -0700 Subject: [PATCH 02/46] fix recently added test --- parquet/tests/arrow_reader/io/mod.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/parquet/tests/arrow_reader/io/mod.rs b/parquet/tests/arrow_reader/io/mod.rs index b31f295755b0..3a09181c72cf 100644 --- a/parquet/tests/arrow_reader/io/mod.rs +++ b/parquet/tests/arrow_reader/io/mod.rs @@ -49,7 +49,6 @@ use parquet::data_type::AsBytes; use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader, ParquetOffsetIndex}; use parquet::file::properties::WriterProperties; use parquet::file::FOOTER_SIZE; -use parquet::format::PageLocation; use parquet::schema::types::SchemaDescriptor; use std::collections::BTreeMap; use std::fmt::Display; @@ -257,7 +256,7 @@ struct TestColumnChunk { dictionary_page_location: Option, /// The location of the data pages in the file - page_locations: Vec, + page_locations: Vec, } /// Information about the pages in a single row group @@ -296,6 +295,11 @@ impl TestRowGroups { let start_offset = start_offset as usize; let end_offset = start_offset + length as usize; + let page_locations = page_locations + .iter() + .map(|loc| parquet::format::PageLocation::from(loc)) + .collect(); + TestColumnChunk { name: column_name.clone(), location: start_offset..end_offset, From ff42e5a86bce951c287794748b55fce7f74dad51 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 20 Aug 2025 13:12:51 -0700 Subject: [PATCH 03/46] clippy --- parquet/tests/arrow_reader/io/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/tests/arrow_reader/io/mod.rs b/parquet/tests/arrow_reader/io/mod.rs index 3a09181c72cf..65a014967b4a 100644 --- a/parquet/tests/arrow_reader/io/mod.rs +++ b/parquet/tests/arrow_reader/io/mod.rs @@ -297,7 +297,7 @@ impl TestRowGroups { let page_locations = page_locations .iter() - .map(|loc| parquet::format::PageLocation::from(loc)) + .map(parquet::format::PageLocation::from) .collect(); TestColumnChunk { From 1f2c2161c4f554ffb56385627dde5a4af2abcbcf Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 20 Aug 2025 13:38:46 -0700 
Subject: [PATCH 04/46] experimental new form for column index --- parquet/src/file/page_index/index_reader.rs | 235 +++++++++++++++++++- 1 file changed, 224 insertions(+), 11 deletions(-) diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index fbe6d3984596..fe56d4880d55 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -18,14 +18,17 @@ //! Support for reading [`Index`] and [`OffsetIndexMetaData`] from parquet metadata. use crate::basic::{BoundaryOrder, Type}; -use crate::data_type::Int96; +use crate::data_type::private::ParquetValueType; +use crate::data_type::{ByteArray, FixedLenByteArray, Int96}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::ColumnChunkMetaData; -use crate::file::page_index::index::{Index, NativeIndex}; +use crate::file::page_index::index::Index; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::reader::ChunkReader; use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol}; use crate::thrift_struct; +use crate::util::bit_util::*; +use std::marker::PhantomData; use std::ops::Range; /// Computes the covering range of two optional ranges @@ -146,22 +149,232 @@ pub(crate) struct ColumnIndex<'a> { } ); +/// column index +pub struct NativeColumnIndex { + phantom_data: PhantomData, + null_pages: Vec, + boundary_order: BoundaryOrder, + null_counts: Option>, + repetition_level_histograms: Option>, + definition_level_histograms: Option>, + // raw bytes for min and max values + min_bytes: Vec, + min_offsets: Vec, // offsets are really only needed for BYTE_ARRAY + max_bytes: Vec, + max_offsets: Vec, +} + +impl NativeColumnIndex { + fn try_new(index: ColumnIndex) -> Result { + let len = index.null_pages.len(); + + let min_len = index.min_values.iter().map(|&v| v.len()).sum(); + let max_len = index.max_values.iter().map(|&v| v.len()).sum(); + let mut min_bytes = vec![0u8; min_len]; + let mut max_bytes = vec![0u8; max_len]; + + let mut min_offsets = vec![0usize; len + 1]; + let mut max_offsets = vec![0usize; len + 1]; + + let mut min_pos = 0; + let mut max_pos = 0; + + for (i, is_null) in index.null_pages.iter().enumerate().take(len) { + if !is_null { + let min = index.min_values[i]; + let dst = &mut min_bytes[min_pos..min_pos + min.len()]; + dst.copy_from_slice(min); + min_offsets[i] = min_pos; + min_pos += min.len(); + + let max = index.max_values[i]; + let dst = &mut max_bytes[max_pos..max_pos + min.len()]; + dst.copy_from_slice(max); + max_offsets[i] = max_pos; + max_pos += max.len(); + } else { + min_offsets[i] = min_pos; + max_offsets[i] = max_pos; + } + } + + min_offsets[len] = min_pos; + max_offsets[len] = max_pos; + + Ok(Self { + phantom_data: PhantomData, + null_pages: index.null_pages, + boundary_order: index.boundary_order, + null_counts: index.null_counts, + repetition_level_histograms: index.repetition_level_histograms, + definition_level_histograms: index.definition_level_histograms, + min_bytes, + min_offsets, + max_bytes, + max_offsets, + }) + } + + /// Returns the number of pages + pub fn num_pages(&self) -> u64 { + self.null_pages.len() as u64 + } + + /// Returns the number of null values in the page indexed by `idx` + pub fn null_count(&self, idx: usize) -> Option { + self.null_counts.as_ref().map(|nc| nc[idx]) + } + + /// Returns the repetition level histogram for the page indexed by `idx` + pub fn repetition_level_histogram(&self, idx: usize) -> Option<&[i64]> { + if let Some(rep_hists) = 
self.repetition_level_histograms.as_ref() { + let num_lvls = rep_hists.len() / self.num_pages() as usize; + let start = num_lvls * idx; + Some(&rep_hists[start..start + num_lvls]) + } else { + None + } + } + + /// Returns the definition level histogram for the page indexed by `idx` + pub fn definition_level_histogram(&self, idx: usize) -> Option<&[i64]> { + if let Some(def_hists) = self.definition_level_histograms.as_ref() { + let num_lvls = def_hists.len() / self.num_pages() as usize; + let start = num_lvls * idx; + Some(&def_hists[start..start + num_lvls]) + } else { + None + } + } + + /// Returns whether this is an all null page + pub fn is_null_page(&self, idx: usize) -> bool { + self.null_pages[idx] + } + + /// Returns the minimum value in the page indexed by `idx` as raw bytes + /// + /// It is `None` when all values are null + pub fn min_value_bytes(&self, idx: usize) -> Option<&[u8]> { + if self.null_pages[idx] { + None + } else { + let start = self.min_offsets[idx]; + let end = self.min_offsets[idx + 1]; + Some(&self.min_bytes[start..end]) + } + } + + /// Returns the maximum value in the page indexed by `idx` as raw bytes + /// + /// It is `None` when all values are null + pub fn max_value_bytes(&self, idx: usize) -> Option<&[u8]> { + if self.null_pages[idx] { + None + } else { + let start = self.max_offsets[idx]; + let end = self.max_offsets[idx + 1]; + Some(&self.max_bytes[start..end]) + } + } +} + +macro_rules! min_max_values { + ($ty: ty) => { + impl NativeColumnIndex<$ty> { + /// Returns the minimum value in the page indexed by `idx` + /// + /// It is `None` when all values are null + pub fn min_value(&self, idx: usize) -> Option<$ty> { + <$ty>::try_from_le_slice(self.min_value_bytes(idx)?).ok() + } + + /// Returns the maximum value in the page indexed by `idx` + /// + /// It is `None` when all values are null + pub fn max_value(&self, idx: usize) -> Option<$ty> { + <$ty>::try_from_le_slice(self.max_value_bytes(idx)?).ok() + } + } + }; +} + +min_max_values!(bool); +min_max_values!(i32); +min_max_values!(i64); +min_max_values!(f32); +min_max_values!(f64); +min_max_values!(Int96); + +/// index +#[allow(non_camel_case_types)] +pub enum ColumnIndexMetaData { + /// Sometimes reading page index from parquet file + /// will only return pageLocations without min_max index, + /// `NONE` represents this lack of index information + NONE, + /// Boolean type index + BOOLEAN(NativeColumnIndex), + /// 32-bit integer type index + INT32(NativeColumnIndex), + /// 64-bit integer type index + INT64(NativeColumnIndex), + /// 96-bit integer type (timestamp) index + INT96(NativeColumnIndex), + /// 32-bit floating point type index + FLOAT(NativeColumnIndex), + /// 64-bit floating point type index + DOUBLE(NativeColumnIndex), + /// Byte array type index + BYTE_ARRAY(NativeColumnIndex), + /// Fixed length byte array type index + FIXED_LEN_BYTE_ARRAY(NativeColumnIndex), +} + +impl ColumnIndexMetaData { + /// Return min/max elements inside ColumnIndex are ordered or not. + pub fn is_sorted(&self) -> bool { + // 0:UNORDERED, 1:ASCENDING ,2:DESCENDING, + if let Some(order) = self.get_boundary_order() { + order != BoundaryOrder::UNORDERED + } else { + false + } + } + + /// Get boundary_order of this page index. 
+ pub fn get_boundary_order(&self) -> Option { + match self { + ColumnIndexMetaData::NONE => None, + ColumnIndexMetaData::BOOLEAN(index) => Some(index.boundary_order), + ColumnIndexMetaData::INT32(index) => Some(index.boundary_order), + ColumnIndexMetaData::INT64(index) => Some(index.boundary_order), + ColumnIndexMetaData::INT96(index) => Some(index.boundary_order), + ColumnIndexMetaData::FLOAT(index) => Some(index.boundary_order), + ColumnIndexMetaData::DOUBLE(index) => Some(index.boundary_order), + ColumnIndexMetaData::BYTE_ARRAY(index) => Some(index.boundary_order), + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index) => Some(index.boundary_order), + } + } +} + pub(crate) fn decode_column_index(data: &[u8], column_type: Type) -> Result { let mut prot = ThriftCompactInputProtocol::new(data); let index = ColumnIndex::try_from(&mut prot)?; let index = match column_type { - Type::BOOLEAN => Index::BOOLEAN(NativeIndex::::try_new_local(index)?), - Type::INT32 => Index::INT32(NativeIndex::::try_new_local(index)?), - Type::INT64 => Index::INT64(NativeIndex::::try_new_local(index)?), - Type::INT96 => Index::INT96(NativeIndex::::try_new_local(index)?), - Type::FLOAT => Index::FLOAT(NativeIndex::::try_new_local(index)?), - Type::DOUBLE => Index::DOUBLE(NativeIndex::::try_new_local(index)?), - Type::BYTE_ARRAY => Index::BYTE_ARRAY(NativeIndex::try_new_local(index)?), + Type::BOOLEAN => ColumnIndexMetaData::BOOLEAN(NativeColumnIndex::::try_new(index)?), + Type::INT32 => ColumnIndexMetaData::INT32(NativeColumnIndex::::try_new(index)?), + Type::INT64 => ColumnIndexMetaData::INT64(NativeColumnIndex::::try_new(index)?), + Type::INT96 => ColumnIndexMetaData::INT96(NativeColumnIndex::::try_new(index)?), + Type::FLOAT => ColumnIndexMetaData::FLOAT(NativeColumnIndex::::try_new(index)?), + Type::DOUBLE => ColumnIndexMetaData::DOUBLE(NativeColumnIndex::::try_new(index)?), + Type::BYTE_ARRAY => ColumnIndexMetaData::BYTE_ARRAY(NativeColumnIndex::try_new(index)?), Type::FIXED_LEN_BYTE_ARRAY => { - Index::FIXED_LEN_BYTE_ARRAY(NativeIndex::try_new_local(index)?) + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(NativeColumnIndex::try_new(index)?) 
} }; - Ok(index) + //Ok(index) + Ok(Index::NONE) } From 37f3b2086b3108097d55831ae577f6251103bc75 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 21 Aug 2025 11:27:42 -0700 Subject: [PATCH 05/46] fix for test added in main --- parquet/tests/arrow_reader/io/mod.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/parquet/tests/arrow_reader/io/mod.rs b/parquet/tests/arrow_reader/io/mod.rs index b31f295755b0..9cafcd714e89 100644 --- a/parquet/tests/arrow_reader/io/mod.rs +++ b/parquet/tests/arrow_reader/io/mod.rs @@ -49,7 +49,6 @@ use parquet::data_type::AsBytes; use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader, ParquetOffsetIndex}; use parquet::file::properties::WriterProperties; use parquet::file::FOOTER_SIZE; -use parquet::format::PageLocation; use parquet::schema::types::SchemaDescriptor; use std::collections::BTreeMap; use std::fmt::Display; @@ -257,7 +256,7 @@ struct TestColumnChunk { dictionary_page_location: Option, /// The location of the data pages in the file - page_locations: Vec, + page_locations: Vec, } /// Information about the pages in a single row group @@ -287,8 +286,11 @@ impl TestRowGroups { .enumerate() .map(|(col_idx, col_meta)| { let column_name = col_meta.column_descr().name().to_string(); - let page_locations = - offset_index[rg_index][col_idx].page_locations().to_vec(); + let page_locations = offset_index[rg_index][col_idx] + .page_locations() + .iter() + .map(parquet::format::PageLocation::from) + .collect(); let dictionary_page_location = col_meta.dictionary_page_offset(); // We can find the byte range of the entire column chunk From 3d4e28eade9e5ab4066c0cd4e91311f068d12572 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 21 Aug 2025 11:29:53 -0700 Subject: [PATCH 06/46] refactor new column index --- parquet/src/file/page_index/index.rs | 5 +- parquet/src/file/page_index/index_reader.rs | 353 ++++++++++++++------ 2 files changed, 247 insertions(+), 111 deletions(-) diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index ed586bcd33d0..22d6e92666db 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -24,7 +24,7 @@ use crate::data_type::private::ParquetValueType; use crate::data_type::{AsBytes, ByteArray, FixedLenByteArray, Int96}; use crate::errors::ParquetError; use crate::file::metadata::LevelHistogram; -use crate::file::page_index::index_reader::ColumnIndex; +use crate::file::page_index::index_reader::ThriftColumnIndex; use std::fmt::Debug; /// Typed statistics for one data page @@ -310,7 +310,8 @@ impl NativeIndex { } /// Creates a new [`NativeIndex`] - pub(crate) fn try_new_local(index: ColumnIndex) -> Result { + #[allow(dead_code)] + pub(crate) fn try_new_local(index: ThriftColumnIndex) -> Result { let len = index.min_values.len(); // turn Option> into Vec> diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index fe56d4880d55..1680f9ddc0ea 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -19,7 +19,7 @@ use crate::basic::{BoundaryOrder, Type}; use crate::data_type::private::ParquetValueType; -use crate::data_type::{ByteArray, FixedLenByteArray, Int96}; +use crate::data_type::Int96; use crate::errors::{ParquetError, Result}; use crate::file::metadata::ColumnChunkMetaData; use crate::file::page_index::index::Index; @@ -27,9 +27,7 @@ use crate::file::page_index::offset_index::OffsetIndexMetaData; use 
crate::file::reader::ChunkReader; use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol}; use crate::thrift_struct; -use crate::util::bit_util::*; -use std::marker::PhantomData; -use std::ops::Range; +use std::ops::{Deref, Range}; /// Computes the covering range of two optional ranges /// @@ -138,7 +136,7 @@ pub(crate) fn decode_offset_index(data: &[u8]) -> Result { +pub(crate) struct ThriftColumnIndex<'a> { 1: required list null_pages 2: required list<'a> min_values 3: required list<'a> max_values @@ -149,23 +147,149 @@ pub(crate) struct ColumnIndex<'a> { } ); -/// column index -pub struct NativeColumnIndex { - phantom_data: PhantomData, +// TODO: the following should move to its own module + +/// Common bits of the column index +pub struct ColumnIndex { null_pages: Vec, boundary_order: BoundaryOrder, null_counts: Option>, repetition_level_histograms: Option>, definition_level_histograms: Option>, +} + +impl ColumnIndex { + /// Returns the number of pages + pub fn num_pages(&self) -> u64 { + self.null_pages.len() as u64 + } + + /// Returns the number of null values in the page indexed by `idx` + pub fn null_count(&self, idx: usize) -> Option { + self.null_counts.as_ref().map(|nc| nc[idx]) + } + + /// Returns the repetition level histogram for the page indexed by `idx` + pub fn repetition_level_histogram(&self, idx: usize) -> Option<&[i64]> { + if let Some(rep_hists) = self.repetition_level_histograms.as_ref() { + let num_lvls = rep_hists.len() / self.num_pages() as usize; + let start = num_lvls * idx; + Some(&rep_hists[start..start + num_lvls]) + } else { + None + } + } + + /// Returns the definition level histogram for the page indexed by `idx` + pub fn definition_level_histogram(&self, idx: usize) -> Option<&[i64]> { + if let Some(def_hists) = self.definition_level_histograms.as_ref() { + let num_lvls = def_hists.len() / self.num_pages() as usize; + let start = num_lvls * idx; + Some(&def_hists[start..start + num_lvls]) + } else { + None + } + } + + /// Returns whether the page indexed by `idx` consists of all null values + pub fn is_null_page(&self, idx: usize) -> bool { + self.null_pages[idx] + } +} + +/// Column index for primitive types +pub struct PrimitiveColumnIndex { + column_index: ColumnIndex, + min_values: Vec, + max_values: Vec, +} + +impl PrimitiveColumnIndex { + fn try_new(index: ThriftColumnIndex) -> Result { + let len = index.null_pages.len(); + + let mut min_values = Vec::with_capacity(len); + let mut max_values = Vec::with_capacity(len); + + for (i, is_null) in index.null_pages.iter().enumerate().take(len) { + if !is_null { + let min = index.min_values[i]; + min_values.push(T::try_from_le_slice(min)?); + + let max = index.max_values[i]; + max_values.push(T::try_from_le_slice(max)?); + } else { + min_values.push(Default::default()); + max_values.push(Default::default()); + } + } + + Ok(Self { + column_index: ColumnIndex { + null_pages: index.null_pages, + boundary_order: index.boundary_order, + null_counts: index.null_counts, + repetition_level_histograms: index.repetition_level_histograms, + definition_level_histograms: index.definition_level_histograms, + }, + min_values, + max_values, + }) + } + + /// Returns an array containing the min values for each page + pub fn min_values(&self) -> &[T] { + &self.min_values + } + + /// Returns an array containing the max values for each page + pub fn max_values(&self) -> &[T] { + &self.max_values + } + + /// Returns the min value for the page indexed by `idx` + /// + /// It is `None` when all values are null + pub fn 
min_value(&self, idx: usize) -> Option<&T> { + if self.null_pages[idx] { + None + } else { + Some(&self.min_values[idx]) + } + } + + /// Returns the max value for the page indexed by `idx` + /// + /// It is `None` when all values are null + pub fn max_value(&self, idx: usize) -> Option<&T> { + if self.null_pages[idx] { + None + } else { + Some(&self.max_values[idx]) + } + } +} + +impl Deref for PrimitiveColumnIndex { + type Target = ColumnIndex; + + fn deref(&self) -> &Self::Target { + &self.column_index + } +} + +/// Column index for byte arrays (fixed length and variable) +pub struct ByteArrayColumnIndex { + column_index: ColumnIndex, // raw bytes for min and max values min_bytes: Vec, - min_offsets: Vec, // offsets are really only needed for BYTE_ARRAY + min_offsets: Vec, max_bytes: Vec, max_offsets: Vec, } -impl NativeColumnIndex { - fn try_new(index: ColumnIndex) -> Result { +impl ByteArrayColumnIndex { + fn try_new(index: ThriftColumnIndex) -> Result { let len = index.null_pages.len(); let min_len = index.min_values.iter().map(|&v| v.len()).sum(); @@ -202,12 +326,14 @@ impl NativeColumnIndex { max_offsets[len] = max_pos; Ok(Self { - phantom_data: PhantomData, - null_pages: index.null_pages, - boundary_order: index.boundary_order, - null_counts: index.null_counts, - repetition_level_histograms: index.repetition_level_histograms, - definition_level_histograms: index.definition_level_histograms, + column_index: ColumnIndex { + null_pages: index.null_pages, + boundary_order: index.boundary_order, + null_counts: index.null_counts, + repetition_level_histograms: index.repetition_level_histograms, + definition_level_histograms: index.definition_level_histograms, + }, + min_bytes, min_offsets, max_bytes, @@ -215,47 +341,10 @@ impl NativeColumnIndex { }) } - /// Returns the number of pages - pub fn num_pages(&self) -> u64 { - self.null_pages.len() as u64 - } - - /// Returns the number of null values in the page indexed by `idx` - pub fn null_count(&self, idx: usize) -> Option { - self.null_counts.as_ref().map(|nc| nc[idx]) - } - - /// Returns the repetition level histogram for the page indexed by `idx` - pub fn repetition_level_histogram(&self, idx: usize) -> Option<&[i64]> { - if let Some(rep_hists) = self.repetition_level_histograms.as_ref() { - let num_lvls = rep_hists.len() / self.num_pages() as usize; - let start = num_lvls * idx; - Some(&rep_hists[start..start + num_lvls]) - } else { - None - } - } - - /// Returns the definition level histogram for the page indexed by `idx` - pub fn definition_level_histogram(&self, idx: usize) -> Option<&[i64]> { - if let Some(def_hists) = self.definition_level_histograms.as_ref() { - let num_lvls = def_hists.len() / self.num_pages() as usize; - let start = num_lvls * idx; - Some(&def_hists[start..start + num_lvls]) - } else { - None - } - } - - /// Returns whether this is an all null page - pub fn is_null_page(&self, idx: usize) -> bool { - self.null_pages[idx] - } - - /// Returns the minimum value in the page indexed by `idx` as raw bytes + /// Returns the min value for the page indexed by `idx` /// /// It is `None` when all values are null - pub fn min_value_bytes(&self, idx: usize) -> Option<&[u8]> { + pub fn min_value(&self, idx: usize) -> Option<&[u8]> { if self.null_pages[idx] { None } else { @@ -265,10 +354,10 @@ impl NativeColumnIndex { } } - /// Returns the maximum value in the page indexed by `idx` as raw bytes + /// Returns the max value for the page indexed by `idx` /// /// It is `None` when all values are null - pub fn 
max_value_bytes(&self, idx: usize) -> Option<&[u8]> { + pub fn max_value(&self, idx: usize) -> Option<&[u8]> { if self.null_pages[idx] { None } else { @@ -279,32 +368,51 @@ impl NativeColumnIndex { } } -macro_rules! min_max_values { - ($ty: ty) => { - impl NativeColumnIndex<$ty> { - /// Returns the minimum value in the page indexed by `idx` - /// - /// It is `None` when all values are null - pub fn min_value(&self, idx: usize) -> Option<$ty> { - <$ty>::try_from_le_slice(self.min_value_bytes(idx)?).ok() - } +impl Deref for ByteArrayColumnIndex { + type Target = ColumnIndex; - /// Returns the maximum value in the page indexed by `idx` - /// - /// It is `None` when all values are null - pub fn max_value(&self, idx: usize) -> Option<$ty> { - <$ty>::try_from_le_slice(self.max_value_bytes(idx)?).ok() - } - } - }; + fn deref(&self) -> &Self::Target { + &self.column_index + } } -min_max_values!(bool); -min_max_values!(i32); -min_max_values!(i64); -min_max_values!(f32); -min_max_values!(f64); -min_max_values!(Int96); +// Macro to generate getter functions for ColumnIndexMetaData. +macro_rules! colidx_enum_func { + ($self:ident, $func:ident, $arg:ident) => {{ + match *$self { + Self::BOOLEAN(ref typed) => typed.$func($arg), + Self::INT32(ref typed) => typed.$func($arg), + Self::INT64(ref typed) => typed.$func($arg), + Self::INT96(ref typed) => typed.$func($arg), + Self::FLOAT(ref typed) => typed.$func($arg), + Self::DOUBLE(ref typed) => typed.$func($arg), + Self::BYTE_ARRAY(ref typed) => typed.$func($arg), + Self::FIXED_LEN_BYTE_ARRAY(ref typed) => typed.$func($arg), + _ => panic!(concat!( + "Cannot call ", + stringify!($func), + " on ColumnIndexMetaData::NONE" + )), + } + }}; + ($self:ident, $func:ident) => {{ + match *$self { + Self::BOOLEAN(ref typed) => typed.$func(), + Self::INT32(ref typed) => typed.$func(), + Self::INT64(ref typed) => typed.$func(), + Self::INT96(ref typed) => typed.$func(), + Self::FLOAT(ref typed) => typed.$func(), + Self::DOUBLE(ref typed) => typed.$func(), + Self::BYTE_ARRAY(ref typed) => typed.$func(), + Self::FIXED_LEN_BYTE_ARRAY(ref typed) => typed.$func(), + _ => panic!(concat!( + "Cannot call ", + stringify!($func), + " on ColumnIndexMetaData::NONE" + )), + } + }}; +} /// index #[allow(non_camel_case_types)] @@ -314,21 +422,21 @@ pub enum ColumnIndexMetaData { /// `NONE` represents this lack of index information NONE, /// Boolean type index - BOOLEAN(NativeColumnIndex), + BOOLEAN(PrimitiveColumnIndex), /// 32-bit integer type index - INT32(NativeColumnIndex), + INT32(PrimitiveColumnIndex), /// 64-bit integer type index - INT64(NativeColumnIndex), + INT64(PrimitiveColumnIndex), /// 96-bit integer type (timestamp) index - INT96(NativeColumnIndex), + INT96(PrimitiveColumnIndex), /// 32-bit floating point type index - FLOAT(NativeColumnIndex), + FLOAT(PrimitiveColumnIndex), /// 64-bit floating point type index - DOUBLE(NativeColumnIndex), + DOUBLE(PrimitiveColumnIndex), /// Byte array type index - BYTE_ARRAY(NativeColumnIndex), + BYTE_ARRAY(ByteArrayColumnIndex), /// Fixed length byte array type index - FIXED_LEN_BYTE_ARRAY(NativeColumnIndex), + FIXED_LEN_BYTE_ARRAY(ByteArrayColumnIndex), } impl ColumnIndexMetaData { @@ -345,33 +453,60 @@ impl ColumnIndexMetaData { /// Get boundary_order of this page index. 
pub fn get_boundary_order(&self) -> Option { match self { - ColumnIndexMetaData::NONE => None, - ColumnIndexMetaData::BOOLEAN(index) => Some(index.boundary_order), - ColumnIndexMetaData::INT32(index) => Some(index.boundary_order), - ColumnIndexMetaData::INT64(index) => Some(index.boundary_order), - ColumnIndexMetaData::INT96(index) => Some(index.boundary_order), - ColumnIndexMetaData::FLOAT(index) => Some(index.boundary_order), - ColumnIndexMetaData::DOUBLE(index) => Some(index.boundary_order), - ColumnIndexMetaData::BYTE_ARRAY(index) => Some(index.boundary_order), - ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index) => Some(index.boundary_order), + Self::NONE => None, + Self::BOOLEAN(index) => Some(index.boundary_order), + Self::INT32(index) => Some(index.boundary_order), + Self::INT64(index) => Some(index.boundary_order), + Self::INT96(index) => Some(index.boundary_order), + Self::FLOAT(index) => Some(index.boundary_order), + Self::DOUBLE(index) => Some(index.boundary_order), + Self::BYTE_ARRAY(index) => Some(index.boundary_order), + Self::FIXED_LEN_BYTE_ARRAY(index) => Some(index.boundary_order), } } + + /// Returns the number of pages + pub fn num_pages(&self) -> u64 { + colidx_enum_func!(self, num_pages) + } + + /// Returns the number of null values in the page indexed by `idx` + pub fn null_count(&self, idx: usize) -> Option { + colidx_enum_func!(self, null_count, idx) + } + + /// Returns the repetition level histogram for the page indexed by `idx` + pub fn repetition_level_histogram(&self, idx: usize) -> Option<&[i64]> { + colidx_enum_func!(self, repetition_level_histogram, idx) + } + + /// Returns the definition level histogram for the page indexed by `idx` + pub fn definition_level_histogram(&self, idx: usize) -> Option<&[i64]> { + colidx_enum_func!(self, definition_level_histogram, idx) + } + + /// Returns whether the page indexed by `idx` consists of all null values + pub fn is_null_page(&self, idx: usize) -> bool { + colidx_enum_func!(self, is_null_page, idx) + } } pub(crate) fn decode_column_index(data: &[u8], column_type: Type) -> Result { let mut prot = ThriftCompactInputProtocol::new(data); - let index = ColumnIndex::try_from(&mut prot)?; - - let index = match column_type { - Type::BOOLEAN => ColumnIndexMetaData::BOOLEAN(NativeColumnIndex::::try_new(index)?), - Type::INT32 => ColumnIndexMetaData::INT32(NativeColumnIndex::::try_new(index)?), - Type::INT64 => ColumnIndexMetaData::INT64(NativeColumnIndex::::try_new(index)?), - Type::INT96 => ColumnIndexMetaData::INT96(NativeColumnIndex::::try_new(index)?), - Type::FLOAT => ColumnIndexMetaData::FLOAT(NativeColumnIndex::::try_new(index)?), - Type::DOUBLE => ColumnIndexMetaData::DOUBLE(NativeColumnIndex::::try_new(index)?), - Type::BYTE_ARRAY => ColumnIndexMetaData::BYTE_ARRAY(NativeColumnIndex::try_new(index)?), + let index = ThriftColumnIndex::try_from(&mut prot)?; + + let _index = match column_type { + Type::BOOLEAN => { + ColumnIndexMetaData::BOOLEAN(PrimitiveColumnIndex::::try_new(index)?) 
+ } + Type::INT32 => ColumnIndexMetaData::INT32(PrimitiveColumnIndex::::try_new(index)?), + Type::INT64 => ColumnIndexMetaData::INT64(PrimitiveColumnIndex::::try_new(index)?), + Type::INT96 => ColumnIndexMetaData::INT96(PrimitiveColumnIndex::::try_new(index)?), + Type::FLOAT => ColumnIndexMetaData::FLOAT(PrimitiveColumnIndex::::try_new(index)?), + Type::DOUBLE => ColumnIndexMetaData::DOUBLE(PrimitiveColumnIndex::::try_new(index)?), + Type::BYTE_ARRAY => ColumnIndexMetaData::BYTE_ARRAY(ByteArrayColumnIndex::try_new(index)?), Type::FIXED_LEN_BYTE_ARRAY => { - ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(NativeColumnIndex::try_new(index)?) + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(ByteArrayColumnIndex::try_new(index)?) } }; From 2b85b89733fafa586287359968ba18e8acb0cef4 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 21 Aug 2025 14:54:46 -0700 Subject: [PATCH 07/46] checkpoint...everything but stats converter --- parquet/src/arrow/arrow_writer/mod.rs | 13 ++- parquet/src/file/metadata/memory.rs | 50 ++++++++++ parquet/src/file/metadata/mod.rs | 24 +++-- parquet/src/file/metadata/reader.rs | 8 +- parquet/src/file/metadata/writer.rs | 35 ++++--- parquet/src/file/page_index/index_reader.rs | 103 ++++++++++++++++---- parquet/src/file/serialized_reader.rs | 95 ++++++++++-------- parquet/src/file/writer.rs | 27 ++--- parquet/tests/encryption/encryption_util.rs | 10 +- 9 files changed, 258 insertions(+), 107 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index c6b0b426f9dd..1041a1af1f77 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1488,6 +1488,7 @@ mod tests { use crate::arrow::ARROW_SCHEMA_META_KEY; use crate::column::page::{Page, PageReader}; use crate::file::page_encoding_stats::PageEncodingStats; + use crate::file::page_index::index_reader::ColumnIndexMetaData; use crate::file::reader::SerializedPageReader; use crate::format::PageHeader; use crate::schema::types::ColumnPath; @@ -1507,7 +1508,6 @@ mod tests { use crate::basic::Encoding; use crate::data_type::AsBytes; use crate::file::metadata::{ColumnChunkMetaData, ParquetMetaData, ParquetMetaDataReader}; - use crate::file::page_index::index::Index; use crate::file::properties::{ BloomFilterPosition, EnabledStatistics, ReaderProperties, WriterVersion, }; @@ -4002,9 +4002,12 @@ mod tests { assert_eq!(column_index[0].len(), 2); // 2 columns let a_idx = &column_index[0][0]; - assert!(matches!(a_idx, Index::BYTE_ARRAY(_)), "{a_idx:?}"); + assert!( + matches!(a_idx, ColumnIndexMetaData::BYTE_ARRAY(_)), + "{a_idx:?}" + ); let b_idx = &column_index[0][1]; - assert!(matches!(b_idx, Index::NONE), "{b_idx:?}"); + assert!(matches!(b_idx, ColumnIndexMetaData::NONE), "{b_idx:?}"); } #[test] @@ -4070,9 +4073,9 @@ mod tests { assert_eq!(column_index[0].len(), 2); // 2 columns let a_idx = &column_index[0][0]; - assert!(matches!(a_idx, Index::NONE), "{a_idx:?}"); + assert!(matches!(a_idx, ColumnIndexMetaData::NONE), "{a_idx:?}"); let b_idx = &column_index[0][1]; - assert!(matches!(b_idx, Index::NONE), "{b_idx:?}"); + assert!(matches!(b_idx, ColumnIndexMetaData::NONE), "{b_idx:?}"); } #[test] diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index 0b8d3b336fc0..5c1477e2cb14 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -25,6 +25,9 @@ use crate::file::metadata::{ }; use crate::file::page_encoding_stats::PageEncodingStats; use crate::file::page_index::index::{Index, 
NativeIndex, PageIndex}; +use crate::file::page_index::index_reader::{ + ByteArrayColumnIndex, ColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex, +}; use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation}; use crate::file::statistics::{Statistics, ValueStatistics}; use std::sync::Arc; @@ -154,6 +157,48 @@ impl HeapSize for OffsetIndexMetaData { } } +impl HeapSize for ColumnIndexMetaData { + fn heap_size(&self) -> usize { + match self { + Self::NONE => 0, + Self::BOOLEAN(native_index) => native_index.heap_size(), + Self::INT32(native_index) => native_index.heap_size(), + Self::INT64(native_index) => native_index.heap_size(), + Self::INT96(native_index) => native_index.heap_size(), + Self::FLOAT(native_index) => native_index.heap_size(), + Self::DOUBLE(native_index) => native_index.heap_size(), + Self::BYTE_ARRAY(native_index) => native_index.heap_size(), + Self::FIXED_LEN_BYTE_ARRAY(native_index) => native_index.heap_size(), + } + } +} + +impl HeapSize for ColumnIndex { + fn heap_size(&self) -> usize { + self.null_pages.heap_size() + + self.boundary_order.heap_size() + + self.null_counts.heap_size() + + self.definition_level_histograms.heap_size() + + self.repetition_level_histograms.heap_size() + } +} + +impl HeapSize for PrimitiveColumnIndex { + fn heap_size(&self) -> usize { + self.column_index.heap_size() + self.min_values.heap_size() + self.max_values.heap_size() + } +} + +impl HeapSize for ByteArrayColumnIndex { + fn heap_size(&self) -> usize { + self.column_index.heap_size() + + self.min_bytes.heap_size() + + self.min_offsets.heap_size() + + self.max_bytes.heap_size() + + self.max_offsets.heap_size() + } +} + impl HeapSize for Index { fn heap_size(&self) -> usize { match self { @@ -193,6 +238,11 @@ impl HeapSize for bool { 0 // no heap allocations } } +impl HeapSize for u8 { + fn heap_size(&self) -> usize { + 0 // no heap allocations + } +} impl HeapSize for i32 { fn heap_size(&self) -> usize { 0 // no heap allocations diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index f2fe9de77e72..a619d76658e9 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -106,7 +106,7 @@ use crate::file::column_crypto_metadata::{self, ColumnCryptoMetaData}; pub(crate) use crate::file::metadata::memory::HeapSize; use crate::file::{ page_encoding_stats::{self, PageEncodingStats}, - page_index::offset_index::PageLocation, + page_index::{index_reader::ColumnIndexMetaData, offset_index::PageLocation}, }; use crate::file::{ page_index::index::PageIndex, @@ -156,7 +156,7 @@ pub(crate) use writer::ThriftMetadataWriter; /// /// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md /// [`ColumnIndex`]: crate::format::ColumnIndex -pub type ParquetColumnIndex = Vec>; +pub type ParquetColumnIndex = Vec>; /// [`OffsetIndexMetaData`] for each data page of each row group of each column /// @@ -1948,7 +1948,7 @@ impl OffsetIndexBuilder { mod tests { use super::*; use crate::basic::{PageType, SortOrder}; - use crate::file::page_index::index::NativeIndex; + use crate::file::page_index::index_reader::{ColumnIndex, PrimitiveColumnIndex}; #[test] fn test_row_group_metadata_thrift_conversion() { @@ -2223,7 +2223,17 @@ mod tests { let mut column_index = ColumnIndexBuilder::new(Type::BOOLEAN); column_index.append(false, vec![1u8], vec![2u8, 3u8], 4); let column_index = column_index.build_to_thrift(); - let native_index = NativeIndex::::try_new(column_index).unwrap(); + let native_index = 
PrimitiveColumnIndex:: { + column_index: ColumnIndex { + null_pages: column_index.null_pages, + boundary_order: column_index.boundary_order.try_into().unwrap(), + null_counts: column_index.null_counts, + repetition_level_histograms: column_index.repetition_level_histograms, + definition_level_histograms: column_index.definition_level_histograms, + }, + min_values: vec![], + max_values: vec![], + }; // Now, add in OffsetIndex let mut offset_index = OffsetIndexBuilder::new(); @@ -2237,16 +2247,16 @@ mod tests { let parquet_meta = ParquetMetaDataBuilder::new(file_metadata) .set_row_groups(row_group_meta) - .set_column_index(Some(vec![vec![Index::BOOLEAN(native_index)]])) + .set_column_index(Some(vec![vec![ColumnIndexMetaData::BOOLEAN(native_index)]])) .set_offset_index(Some(vec![vec![ OffsetIndexMetaData::try_new(offset_index).unwrap() ]])) .build(); #[cfg(not(feature = "encryption"))] - let bigger_expected_size = 2784; + let bigger_expected_size = 2704; #[cfg(feature = "encryption")] - let bigger_expected_size = 3120; + let bigger_expected_size = 3040; // more set fields means more memory usage assert!(bigger_expected_size > base_expected_size); diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index a403f4eee8f0..97ea72ef964c 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -34,7 +34,7 @@ use bytes::Bytes; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ColumnChunkMetaData, ParquetMetaData, RowGroupMetaData}; -use crate::file::page_index::index::Index; +use crate::file::page_index::index_reader::ColumnIndexMetaData; use crate::file::page_index::index_reader::{acc_range, decode_column_index, decode_offset_index}; use crate::file::reader::ChunkReader; use crate::file::{FOOTER_SIZE, PARQUET_MAGIC, PARQUET_MAGIC_ENCR_FOOTER}; @@ -566,7 +566,7 @@ impl ParquetMetaDataReader { col_idx, ) } - None => Ok(Index::NONE), + None => Ok(ColumnIndexMetaData::NONE), }) .collect::>>() }) @@ -584,7 +584,7 @@ impl ParquetMetaDataReader { column: &ColumnChunkMetaData, row_group_index: usize, col_index: usize, - ) -> Result { + ) -> Result { match &column.column_crypto_metadata { Some(crypto_metadata) => { let file_decryptor = metadata.file_decryptor.as_ref().ok_or_else(|| { @@ -612,7 +612,7 @@ impl ParquetMetaDataReader { column: &ColumnChunkMetaData, _row_group_index: usize, _col_index: usize, - ) -> Result { + ) -> Result { decode_column_index(bytes, column.column_type()) } diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs index acae20ec3cef..8c485f7d0e8b 100644 --- a/parquet/src/file/metadata/writer.rs +++ b/parquet/src/file/metadata/writer.rs @@ -24,9 +24,7 @@ use crate::encryption::{ }; #[cfg(feature = "encryption")] use crate::errors::ParquetError; -use crate::errors::Result; use crate::file::metadata::{KeyValue, ParquetMetaData}; -use crate::file::page_index::index::Index; use crate::file::writer::{get_file_magic, TrackedWrite}; use crate::format::EncryptionAlgorithm; #[cfg(feature = "encryption")] @@ -34,6 +32,7 @@ use crate::format::{AesGcmV1, ColumnCryptoMetaData}; use crate::schema::types; use crate::schema::types::{SchemaDescPtr, SchemaDescriptor, TypePtr}; use crate::thrift::TSerializable; +use crate::{errors::Result, file::page_index::index_reader::ColumnIndexMetaData}; use std::io::Write; use std::sync::Arc; use thrift::protocol::TCompactOutputProtocol; @@ -391,17 +390,31 @@ impl<'a, W: Write> ParquetMetaDataWriter<'a, W> { column_indexes .iter() 
.map(|column_index| match column_index { - Index::NONE => None, - Index::BOOLEAN(column_index) => Some(column_index.to_thrift()), - Index::BYTE_ARRAY(column_index) => Some(column_index.to_thrift()), - Index::DOUBLE(column_index) => Some(column_index.to_thrift()), - Index::FIXED_LEN_BYTE_ARRAY(column_index) => { + ColumnIndexMetaData::NONE => None, + ColumnIndexMetaData::BOOLEAN(column_index) => { + Some(column_index.to_thrift()) + } + ColumnIndexMetaData::BYTE_ARRAY(column_index) => { + Some(column_index.to_thrift()) + } + ColumnIndexMetaData::DOUBLE(column_index) => { + Some(column_index.to_thrift()) + } + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(column_index) => { + Some(column_index.to_thrift()) + } + ColumnIndexMetaData::FLOAT(column_index) => { + Some(column_index.to_thrift()) + } + ColumnIndexMetaData::INT32(column_index) => { + Some(column_index.to_thrift()) + } + ColumnIndexMetaData::INT64(column_index) => { + Some(column_index.to_thrift()) + } + ColumnIndexMetaData::INT96(column_index) => { Some(column_index.to_thrift()) } - Index::FLOAT(column_index) => Some(column_index.to_thrift()), - Index::INT32(column_index) => Some(column_index.to_thrift()), - Index::INT64(column_index) => Some(column_index.to_thrift()), - Index::INT96(column_index) => Some(column_index.to_thrift()), }) .collect() }) diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index 1680f9ddc0ea..b030b61c4918 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -22,7 +22,6 @@ use crate::data_type::private::ParquetValueType; use crate::data_type::Int96; use crate::errors::{ParquetError, Result}; use crate::file::metadata::ColumnChunkMetaData; -use crate::file::page_index::index::Index; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::reader::ChunkReader; use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol}; @@ -57,7 +56,7 @@ pub(crate) fn acc_range(a: Option>, b: Option>) -> Option< pub fn read_columns_indexes( reader: &R, chunks: &[ColumnChunkMetaData], -) -> Result>, ParquetError> { +) -> Result>, ParquetError> { let fetch = chunks .iter() .fold(None, |range, c| acc_range(range, c.column_index_range())); @@ -78,7 +77,7 @@ pub fn read_columns_indexes( ..usize::try_from(r.end - fetch.start)?], c.column_type(), ), - None => Ok(Index::NONE), + None => Ok(ColumnIndexMetaData::NONE), }) .collect(), ) @@ -150,12 +149,13 @@ pub(crate) struct ThriftColumnIndex<'a> { // TODO: the following should move to its own module /// Common bits of the column index +#[derive(Debug, Clone, PartialEq)] pub struct ColumnIndex { - null_pages: Vec, - boundary_order: BoundaryOrder, - null_counts: Option>, - repetition_level_histograms: Option>, - definition_level_histograms: Option>, + pub(crate) null_pages: Vec, + pub(crate) boundary_order: BoundaryOrder, + pub(crate) null_counts: Option>, + pub(crate) repetition_level_histograms: Option>, + pub(crate) definition_level_histograms: Option>, } impl ColumnIndex { @@ -198,10 +198,11 @@ impl ColumnIndex { } /// Column index for primitive types +#[derive(Debug, Clone, PartialEq)] pub struct PrimitiveColumnIndex { - column_index: ColumnIndex, - min_values: Vec, - max_values: Vec, + pub(crate) column_index: ColumnIndex, + pub(crate) min_values: Vec, + pub(crate) max_values: Vec, } impl PrimitiveColumnIndex { @@ -268,6 +269,35 @@ impl PrimitiveColumnIndex { Some(&self.max_values[idx]) } } + + pub(crate) fn to_thrift(&self) -> 
crate::format::ColumnIndex { + let min_values = self + .min_values + .iter() + .map(|x| x.as_bytes().to_vec()) + .collect::>(); + + let max_values = self + .max_values + .iter() + .map(|x| x.as_bytes().to_vec()) + .collect::>(); + + let null_counts = self.null_counts.clone(); + let repetition_level_histograms = self.repetition_level_histograms.clone(); + let definition_level_histograms = self.definition_level_histograms.clone(); + let null_pages = self.null_pages.clone(); + + crate::format::ColumnIndex::new( + null_pages, + min_values, + max_values, + self.boundary_order.into(), + null_counts, + repetition_level_histograms, + definition_level_histograms, + ) + } } impl Deref for PrimitiveColumnIndex { @@ -279,13 +309,14 @@ impl Deref for PrimitiveColumnIndex { } /// Column index for byte arrays (fixed length and variable) +#[derive(Debug, Clone, PartialEq)] pub struct ByteArrayColumnIndex { - column_index: ColumnIndex, + pub(crate) column_index: ColumnIndex, // raw bytes for min and max values - min_bytes: Vec, - min_offsets: Vec, - max_bytes: Vec, - max_offsets: Vec, + pub(crate) min_bytes: Vec, + pub(crate) min_offsets: Vec, + pub(crate) max_bytes: Vec, + pub(crate) max_offsets: Vec, } impl ByteArrayColumnIndex { @@ -312,7 +343,7 @@ impl ByteArrayColumnIndex { min_pos += min.len(); let max = index.max_values[i]; - let dst = &mut max_bytes[max_pos..max_pos + min.len()]; + let dst = &mut max_bytes[max_pos..max_pos + max.len()]; dst.copy_from_slice(max); max_offsets[i] = max_pos; max_pos += max.len(); @@ -366,6 +397,33 @@ impl ByteArrayColumnIndex { Some(&self.max_bytes[start..end]) } } + + pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex { + let mut min_values = Vec::with_capacity(self.num_pages() as usize); + for i in 0..self.num_pages() as usize { + min_values.push(self.min_value(i).unwrap_or(&vec![]).to_owned()); + } + + let mut max_values = Vec::with_capacity(self.num_pages() as usize); + for i in 0..self.num_pages() as usize { + max_values.push(self.max_value(i).unwrap_or(&vec![]).to_owned()); + } + + let null_counts = self.null_counts.clone(); + let repetition_level_histograms = self.repetition_level_histograms.clone(); + let definition_level_histograms = self.definition_level_histograms.clone(); + let null_pages = self.null_pages.clone(); + + crate::format::ColumnIndex::new( + null_pages, + min_values, + max_values, + self.boundary_order.into(), + null_counts, + repetition_level_histograms, + definition_level_histograms, + ) + } } impl Deref for ByteArrayColumnIndex { @@ -415,6 +473,7 @@ macro_rules! colidx_enum_func { } /// index +#[derive(Debug, Clone, PartialEq)] #[allow(non_camel_case_types)] pub enum ColumnIndexMetaData { /// Sometimes reading page index from parquet file @@ -491,11 +550,14 @@ impl ColumnIndexMetaData { } } -pub(crate) fn decode_column_index(data: &[u8], column_type: Type) -> Result { +pub(crate) fn decode_column_index( + data: &[u8], + column_type: Type, +) -> Result { let mut prot = ThriftCompactInputProtocol::new(data); let index = ThriftColumnIndex::try_from(&mut prot)?; - let _index = match column_type { + let index = match column_type { Type::BOOLEAN => { ColumnIndexMetaData::BOOLEAN(PrimitiveColumnIndex::::try_new(index)?) 
} @@ -510,6 +572,5 @@ pub(crate) fn decode_column_index(data: &[u8], column_type: Type) -> Resultbool_col:BOOLEAN UNCOMPRESSED DO:0 FPO:37329 SZ:3022/3022/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: false, max: true, num_nulls: 0] assert!(&column_index[0][1].is_sorted()); - if let Index::BOOLEAN(index) = &column_index[0][1] { - assert_eq!(index.indexes.len(), 82); + if let ColumnIndexMetaData::BOOLEAN(index) = &column_index[0][1] { + assert_eq!(index.num_pages(), 82); assert_eq!(row_group_offset_indexes[1].page_locations.len(), 82); } else { unreachable!() }; //col2->tinyint_col: INT32 UNCOMPRESSED DO:0 FPO:40351 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9, num_nulls: 0] assert!(&column_index[0][2].is_sorted()); - if let Index::INT32(index) = &column_index[0][2] { + if let ColumnIndexMetaData::INT32(index) = &column_index[0][2] { check_native_page_index( index, 325, @@ -2025,7 +2025,7 @@ mod tests { }; //col4->smallint_col: INT32 UNCOMPRESSED DO:0 FPO:77676 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9, num_nulls: 0] assert!(&column_index[0][3].is_sorted()); - if let Index::INT32(index) = &column_index[0][3] { + if let ColumnIndexMetaData::INT32(index) = &column_index[0][3] { check_native_page_index( index, 325, @@ -2038,7 +2038,7 @@ mod tests { }; //col5->smallint_col: INT32 UNCOMPRESSED DO:0 FPO:77676 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9, num_nulls: 0] assert!(&column_index[0][4].is_sorted()); - if let Index::INT32(index) = &column_index[0][4] { + if let ColumnIndexMetaData::INT32(index) = &column_index[0][4] { check_native_page_index( index, 325, @@ -2051,7 +2051,7 @@ mod tests { }; //col6->bigint_col: INT64 UNCOMPRESSED DO:0 FPO:152326 SZ:71598/71598/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 90, num_nulls: 0] assert!(!&column_index[0][5].is_sorted()); - if let Index::INT64(index) = &column_index[0][5] { + if let ColumnIndexMetaData::INT64(index) = &column_index[0][5] { check_native_page_index( index, 528, @@ -2064,7 +2064,7 @@ mod tests { }; //col7->float_col: FLOAT UNCOMPRESSED DO:0 FPO:223924 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: -0.0, max: 9.9, num_nulls: 0] assert!(&column_index[0][6].is_sorted()); - if let Index::FLOAT(index) = &column_index[0][6] { + if let ColumnIndexMetaData::FLOAT(index) = &column_index[0][6] { check_native_page_index( index, 325, @@ -2077,7 +2077,7 @@ mod tests { }; //col8->double_col: DOUBLE UNCOMPRESSED DO:0 FPO:261249 SZ:71598/71598/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: -0.0, max: 90.89999999999999, num_nulls: 0] assert!(!&column_index[0][7].is_sorted()); - if let Index::DOUBLE(index) = &column_index[0][7] { + if let ColumnIndexMetaData::DOUBLE(index) = &column_index[0][7] { check_native_page_index( index, 528, @@ -2090,8 +2090,8 @@ mod tests { }; //col9->date_string_col: BINARY UNCOMPRESSED DO:0 FPO:332847 SZ:111948/111948/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 01/01/09, max: 12/31/10, num_nulls: 0] assert!(!&column_index[0][8].is_sorted()); - if let Index::BYTE_ARRAY(index) = &column_index[0][8] { - check_native_page_index( + if let ColumnIndexMetaData::BYTE_ARRAY(index) = &column_index[0][8] { + check_byte_array_page_index( index, 974, get_row_group_min_max_bytes(row_group_metadata, 8), @@ -2103,8 +2103,8 @@ mod tests { }; //col10->string_col: BINARY UNCOMPRESSED DO:0 FPO:444795 SZ:45298/45298/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9, num_nulls: 0] assert!(&column_index[0][9].is_sorted()); - if 
let Index::BYTE_ARRAY(index) = &column_index[0][9] { - check_native_page_index( + if let ColumnIndexMetaData::BYTE_ARRAY(index) = &column_index[0][9] { + check_byte_array_page_index( index, 352, get_row_group_min_max_bytes(row_group_metadata, 9), @@ -2117,14 +2117,14 @@ mod tests { //col11->timestamp_col: INT96 UNCOMPRESSED DO:0 FPO:490093 SZ:111948/111948/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[num_nulls: 0, min/max not defined] //Notice: min_max values for each page for this col not exits. assert!(!&column_index[0][10].is_sorted()); - if let Index::NONE = &column_index[0][10] { + if let ColumnIndexMetaData::NONE = &column_index[0][10] { assert_eq!(row_group_offset_indexes[10].page_locations.len(), 974); } else { unreachable!() }; //col12->year: INT32 UNCOMPRESSED DO:0 FPO:602041 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 2009, max: 2010, num_nulls: 0] assert!(&column_index[0][11].is_sorted()); - if let Index::INT32(index) = &column_index[0][11] { + if let ColumnIndexMetaData::INT32(index) = &column_index[0][11] { check_native_page_index( index, 325, @@ -2137,7 +2137,7 @@ mod tests { }; //col13->month: INT32 UNCOMPRESSED DO:0 FPO:639366 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 1, max: 12, num_nulls: 0] assert!(!&column_index[0][12].is_sorted()); - if let Index::INT32(index) = &column_index[0][12] { + if let ColumnIndexMetaData::INT32(index) = &column_index[0][12] { check_native_page_index( index, 325, @@ -2151,17 +2151,31 @@ mod tests { } fn check_native_page_index( - row_group_index: &NativeIndex, + row_group_index: &PrimitiveColumnIndex, page_size: usize, min_max: (&[u8], &[u8]), boundary_order: BoundaryOrder, ) { - assert_eq!(row_group_index.indexes.len(), page_size); + assert_eq!(row_group_index.num_pages() as usize, page_size); assert_eq!(row_group_index.boundary_order, boundary_order); - row_group_index.indexes.iter().all(|x| { - x.min.as_ref().unwrap() >= &T::try_from_le_slice(min_max.0).unwrap() - && x.max.as_ref().unwrap() <= &T::try_from_le_slice(min_max.1).unwrap() - }); + assert!(row_group_index.min_values().iter().all(|x| { + x >= &T::try_from_le_slice(min_max.0).unwrap() + && x <= &T::try_from_le_slice(min_max.1).unwrap() + })); + } + + fn check_byte_array_page_index( + row_group_index: &ByteArrayColumnIndex, + page_size: usize, + min_max: (&[u8], &[u8]), + boundary_order: BoundaryOrder, + ) { + assert_eq!(row_group_index.num_pages() as usize, page_size); + assert_eq!(row_group_index.boundary_order, boundary_order); + for i in 0..row_group_index.num_pages() as usize { + let x = row_group_index.min_value(i).unwrap(); + assert!(x >= min_max.0 && x <= min_max.1); + } } fn get_row_group_min_max_bytes(r: &RowGroupMetaData, col_num: usize) -> (&[u8], &[u8]) { @@ -2402,12 +2416,11 @@ mod tests { assert_eq!(c.len(), 1); match &c[0] { - Index::FIXED_LEN_BYTE_ARRAY(v) => { - assert_eq!(v.indexes.len(), 1); - let page_idx = &v.indexes[0]; - assert_eq!(page_idx.null_count.unwrap(), 1); - assert_eq!(page_idx.min.as_ref().unwrap().as_ref(), &[0; 11]); - assert_eq!(page_idx.max.as_ref().unwrap().as_ref(), &[5; 11]); + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(v) => { + assert_eq!(v.num_pages(), 1); + assert_eq!(v.null_count(0).unwrap(), 1); + assert_eq!(v.min_value(0).unwrap(), &[0; 11]); + assert_eq!(v.max_value(0).unwrap(), &[5; 11]); } _ => unreachable!(), } @@ -2538,11 +2551,11 @@ mod tests { // test that we got the index matching the row group match pg_idx { - Index::INT32(int_idx) => { + ColumnIndexMetaData::INT32(int_idx) => { let min = 
col_stats.min_bytes_opt().unwrap().get_i32_le(); let max = col_stats.max_bytes_opt().unwrap().get_i32_le(); - assert_eq!(int_idx.indexes[0].min(), Some(min).as_ref()); - assert_eq!(int_idx.indexes[0].max(), Some(max).as_ref()); + assert_eq!(int_idx.min_value(0), Some(min).as_ref()); + assert_eq!(int_idx.max_value(0), Some(max).as_ref()); } _ => panic!("wrong stats type"), } @@ -2583,11 +2596,11 @@ mod tests { // test that we got the index matching the row group match pg_idx { - Index::INT32(int_idx) => { + ColumnIndexMetaData::INT32(int_idx) => { let min = col_stats.min_bytes_opt().unwrap().get_i32_le(); let max = col_stats.max_bytes_opt().unwrap().get_i32_le(); - assert_eq!(int_idx.indexes[0].min(), Some(min).as_ref()); - assert_eq!(int_idx.indexes[0].max(), Some(max).as_ref()); + assert_eq!(int_idx.min_value(0), Some(min).as_ref()); + assert_eq!(int_idx.max_value(0), Some(max).as_ref()); } _ => panic!("wrong stats type"), } diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 7db517ced5b2..1808e88878e7 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1063,6 +1063,7 @@ mod tests { use crate::compression::{create_codec, Codec, CodecOptionsBuilder}; use crate::data_type::{BoolType, ByteArrayType, Int32Type}; use crate::file::page_index::index::Index; + use crate::file::page_index::index_reader::ColumnIndexMetaData; use crate::file::properties::EnabledStatistics; use crate::file::serialized_reader::ReadOptionsBuilder; use crate::file::{ @@ -2083,9 +2084,9 @@ mod tests { assert_eq!(column_index[0].len(), 2); // 2 column let a_idx = &column_index[0][0]; - assert!(matches!(a_idx, Index::INT32(_)), "{a_idx:?}"); + assert!(matches!(a_idx, ColumnIndexMetaData::INT32(_)), "{a_idx:?}"); let b_idx = &column_index[0][1]; - assert!(matches!(b_idx, Index::NONE), "{b_idx:?}"); + assert!(matches!(b_idx, ColumnIndexMetaData::NONE), "{b_idx:?}"); } #[test] @@ -2169,16 +2170,16 @@ mod tests { let column_index = reader.metadata().column_index().unwrap(); assert_eq!(column_index.len(), 1); assert_eq!(column_index[0].len(), 1); - let col_idx = if let Index::BYTE_ARRAY(index) = &column_index[0][0] { - assert_eq!(index.indexes.len(), 1); - &index.indexes[0] + let col_idx = if let ColumnIndexMetaData::BYTE_ARRAY(index) = &column_index[0][0] { + assert_eq!(index.num_pages(), 1); + index } else { unreachable!() }; - assert!(col_idx.repetition_level_histogram().is_none()); - assert!(col_idx.definition_level_histogram().is_some()); - check_def_hist(col_idx.definition_level_histogram().unwrap().values()); + assert!(col_idx.repetition_level_histogram(0).is_none()); + assert!(col_idx.definition_level_histogram(0).is_some()); + check_def_hist(col_idx.definition_level_histogram(0).unwrap()); assert!(reader.metadata().offset_index().is_some()); let offset_index = reader.metadata().offset_index().unwrap(); @@ -2324,15 +2325,15 @@ mod tests { let column_index = reader.metadata().column_index().unwrap(); assert_eq!(column_index.len(), 1); assert_eq!(column_index[0].len(), 1); - let col_idx = if let Index::INT32(index) = &column_index[0][0] { - assert_eq!(index.indexes.len(), 1); - &index.indexes[0] + let col_idx = if let ColumnIndexMetaData::INT32(index) = &column_index[0][0] { + assert_eq!(index.num_pages(), 1); + index } else { unreachable!() }; - check_def_hist(col_idx.definition_level_histogram().unwrap().values()); - check_rep_hist(col_idx.repetition_level_histogram().unwrap().values()); + check_def_hist(col_idx.definition_level_histogram(0).unwrap()); + 
check_rep_hist(col_idx.repetition_level_histogram(0).unwrap()); assert!(reader.metadata().offset_index().is_some()); let offset_index = reader.metadata().offset_index().unwrap(); diff --git a/parquet/tests/encryption/encryption_util.rs b/parquet/tests/encryption/encryption_util.rs index bf7fd08109f6..549bdec47343 100644 --- a/parquet/tests/encryption/encryption_util.rs +++ b/parquet/tests/encryption/encryption_util.rs @@ -191,11 +191,11 @@ pub(crate) fn verify_column_indexes(metadata: &ParquetMetaData) { let column_index = &column_index[0][float_col_idx]; match column_index { - parquet::file::page_index::index::Index::FLOAT(float_index) => { - assert_eq!(float_index.indexes.len(), 1); - assert_eq!(float_index.indexes[0].min, Some(0.0f32)); - assert!(float_index.indexes[0] - .max + parquet::file::page_index::index_reader::ColumnIndexMetaData::FLOAT(float_index) => { + assert_eq!(float_index.num_pages(), 1); + assert_eq!(float_index.min_value(0), Some(&0.0f32)); + assert!(float_index + .max_value(0) .is_some_and(|max| (max - 53.9).abs() < 1e-6)); } _ => { From 5ee1b8f8e1fac74f462eda2d7481833fc2c976d3 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 21 Aug 2025 14:57:56 -0700 Subject: [PATCH 08/46] fix bug found in testing --- parquet/src/file/page_index/index_reader.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index 1680f9ddc0ea..3dc5a8d2dc18 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -312,7 +312,7 @@ impl ByteArrayColumnIndex { min_pos += min.len(); let max = index.max_values[i]; - let dst = &mut max_bytes[max_pos..max_pos + min.len()]; + let dst = &mut max_bytes[max_pos..max_pos + max.len()]; dst.copy_from_slice(max); max_offsets[i] = max_pos; max_pos += max.len(); From d99a06acf077906dc1f9611757d35780b7a15b38 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 21 Aug 2025 17:03:26 -0700 Subject: [PATCH 09/46] stats converter works --- parquet/src/arrow/arrow_reader/statistics.rs | 233 +++++++++---------- parquet/src/file/page_index/index_reader.rs | 84 ++++++- 2 files changed, 188 insertions(+), 129 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index eba1f561203c..b719d81fe0a1 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -25,7 +25,7 @@ use crate::basic::Type as PhysicalType; use crate::data_type::{ByteArray, FixedLenByteArray}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ParquetColumnIndex, ParquetOffsetIndex, RowGroupMetaData}; -use crate::file::page_index::index::{Index, PageIndex}; +use crate::file::page_index::index_reader::ColumnIndexMetaData; use crate::file::statistics::Statistics as ParquetStatistics; use crate::schema::types::SchemaDescriptor; use arrow_array::builder::{ @@ -597,17 +597,17 @@ macro_rules! get_statistics { } macro_rules! make_data_page_stats_iterator { - ($iterator_type: ident, $func: expr, $index_type: path, $stat_value_type: ty) => { + ($iterator_type: ident, $func: ident, $index_type: path, $stat_value_type: ty, $conv:expr) => { struct $iterator_type<'a, I> where - I: Iterator, + I: Iterator, { iter: I, } impl<'a, I> $iterator_type<'a, I> where - I: Iterator, + I: Iterator, { fn new(iter: I) -> Self { Self { iter } @@ -616,7 +616,7 @@ macro_rules! 
make_data_page_stats_iterator { impl<'a, I> Iterator for $iterator_type<'a, I> where - I: Iterator, + I: Iterator, { type Item = Vec>; @@ -624,9 +624,12 @@ macro_rules! make_data_page_stats_iterator { let next = self.iter.next(); match next { Some((len, index)) => match index { - $index_type(native_index) => { - Some(native_index.indexes.iter().map($func).collect::>()) - } + $index_type(native_index) => Some( + native_index + .$func() + .map(|v| v.map($conv)) + .collect::>(), + ), // No matching `Index` found; // thus no statistics that can be extracted. // We return vec![None; len] to effectively @@ -648,114 +651,130 @@ macro_rules! make_data_page_stats_iterator { make_data_page_stats_iterator!( MinBooleanDataPageStatsIterator, - |x: &PageIndex| { x.min }, - Index::BOOLEAN, - bool + min_values_iter, + ColumnIndexMetaData::BOOLEAN, + bool, + |m| m.clone() ); make_data_page_stats_iterator!( MaxBooleanDataPageStatsIterator, - |x: &PageIndex| { x.max }, - Index::BOOLEAN, - bool + max_values_iter, + ColumnIndexMetaData::BOOLEAN, + bool, + |m| m.clone() ); make_data_page_stats_iterator!( MinInt32DataPageStatsIterator, - |x: &PageIndex| { x.min }, - Index::INT32, - i32 + min_values_iter, + ColumnIndexMetaData::INT32, + i32, + |m| m.clone() ); make_data_page_stats_iterator!( MaxInt32DataPageStatsIterator, - |x: &PageIndex| { x.max }, - Index::INT32, - i32 + max_values_iter, + ColumnIndexMetaData::INT32, + i32, + |m| m.clone() ); make_data_page_stats_iterator!( MinInt64DataPageStatsIterator, - |x: &PageIndex| { x.min }, - Index::INT64, - i64 + min_values_iter, + ColumnIndexMetaData::INT64, + i64, + |m| m.clone() ); make_data_page_stats_iterator!( MaxInt64DataPageStatsIterator, - |x: &PageIndex| { x.max }, - Index::INT64, - i64 + max_values_iter, + ColumnIndexMetaData::INT64, + i64, + |m| m.clone() ); make_data_page_stats_iterator!( MinFloat16DataPageStatsIterator, - |x: &PageIndex| { x.min.clone() }, - Index::FIXED_LEN_BYTE_ARRAY, - FixedLenByteArray + min_values_iter, + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY, + FixedLenByteArray, + |m| FixedLenByteArray::from(m.to_owned()) ); make_data_page_stats_iterator!( MaxFloat16DataPageStatsIterator, - |x: &PageIndex| { x.max.clone() }, - Index::FIXED_LEN_BYTE_ARRAY, - FixedLenByteArray + max_values_iter, + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY, + FixedLenByteArray, + |m| FixedLenByteArray::from(m.to_owned()) ); make_data_page_stats_iterator!( MinFloat32DataPageStatsIterator, - |x: &PageIndex| { x.min }, - Index::FLOAT, - f32 + min_values_iter, + ColumnIndexMetaData::FLOAT, + f32, + |m| m.clone() ); make_data_page_stats_iterator!( MaxFloat32DataPageStatsIterator, - |x: &PageIndex| { x.max }, - Index::FLOAT, - f32 + max_values_iter, + ColumnIndexMetaData::FLOAT, + f32, + |m| m.clone() ); make_data_page_stats_iterator!( MinFloat64DataPageStatsIterator, - |x: &PageIndex| { x.min }, - Index::DOUBLE, - f64 + min_values_iter, + ColumnIndexMetaData::DOUBLE, + f64, + |m| m.clone() ); make_data_page_stats_iterator!( MaxFloat64DataPageStatsIterator, - |x: &PageIndex| { x.max }, - Index::DOUBLE, - f64 + max_values_iter, + ColumnIndexMetaData::DOUBLE, + f64, + |m| m.clone() ); make_data_page_stats_iterator!( MinByteArrayDataPageStatsIterator, - |x: &PageIndex| { x.min.clone() }, - Index::BYTE_ARRAY, - ByteArray + min_values_iter, + ColumnIndexMetaData::BYTE_ARRAY, + ByteArray, + |m| ByteArray::from(m.to_owned()) ); make_data_page_stats_iterator!( MaxByteArrayDataPageStatsIterator, - |x: &PageIndex| { x.max.clone() }, - Index::BYTE_ARRAY, - ByteArray + 
max_values_iter, + ColumnIndexMetaData::BYTE_ARRAY, + ByteArray, + |m| ByteArray::from(m.to_owned()) ); make_data_page_stats_iterator!( MaxFixedLenByteArrayDataPageStatsIterator, - |x: &PageIndex| { x.max.clone() }, - Index::FIXED_LEN_BYTE_ARRAY, - FixedLenByteArray + max_values_iter, + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY, + FixedLenByteArray, + |m| FixedLenByteArray::from(m.to_owned()) ); make_data_page_stats_iterator!( MinFixedLenByteArrayDataPageStatsIterator, - |x: &PageIndex| { x.min.clone() }, - Index::FIXED_LEN_BYTE_ARRAY, - FixedLenByteArray + min_values_iter, + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY, + FixedLenByteArray, + |m| FixedLenByteArray::from(m.to_owned()) ); macro_rules! get_decimal_page_stats_iterator { ($iterator_type: ident, $func: ident, $stat_value_type: ident, $convert_func: ident) => { struct $iterator_type<'a, I> where - I: Iterator, + I: Iterator, { iter: I, } impl<'a, I> $iterator_type<'a, I> where - I: Iterator, + I: Iterator, { fn new(iter: I) -> Self { Self { iter } @@ -764,44 +783,37 @@ macro_rules! get_decimal_page_stats_iterator { impl<'a, I> Iterator for $iterator_type<'a, I> where - I: Iterator, + I: Iterator, { type Item = Vec>; + // Some(native_index.$func().map(|v| v.map($conv)).collect::>()) fn next(&mut self) -> Option { let next = self.iter.next(); match next { Some((len, index)) => match index { - Index::INT32(native_index) => Some( + ColumnIndexMetaData::INT32(native_index) => Some( native_index - .indexes - .iter() - .map(|x| x.$func.and_then(|x| Some($stat_value_type::from(x)))) + .$func() + .map(|x| x.map(|x| $stat_value_type::from(*x))) .collect::>(), ), - Index::INT64(native_index) => Some( + ColumnIndexMetaData::INT64(native_index) => Some( native_index - .indexes - .iter() - .map(|x| x.$func.and_then(|x| $stat_value_type::try_from(x).ok())) + .$func() + .map(|x| x.map(|x| $stat_value_type::try_from(*x).unwrap())) .collect::>(), ), - Index::BYTE_ARRAY(native_index) => Some( + ColumnIndexMetaData::BYTE_ARRAY(native_index) => Some( native_index - .indexes - .iter() - .map(|x| { - x.clone().$func.and_then(|x| Some($convert_func(x.data()))) - }) + .$func() + .map(|x| x.map(|x| $convert_func(x))) .collect::>(), ), - Index::FIXED_LEN_BYTE_ARRAY(native_index) => Some( + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(native_index) => Some( native_index - .indexes - .iter() - .map(|x| { - x.clone().$func.and_then(|x| Some($convert_func(x.data()))) - }) + .$func() + .map(|x| x.map(|x| $convert_func(x))) .collect::>(), ), _ => Some(vec![None; len]), @@ -819,56 +831,56 @@ macro_rules! 
get_decimal_page_stats_iterator { get_decimal_page_stats_iterator!( MinDecimal32DataPageStatsIterator, - min, + min_values_iter, i32, from_bytes_to_i32 ); get_decimal_page_stats_iterator!( MaxDecimal32DataPageStatsIterator, - max, + max_values_iter, i32, from_bytes_to_i32 ); get_decimal_page_stats_iterator!( MinDecimal64DataPageStatsIterator, - min, + min_values_iter, i64, from_bytes_to_i64 ); get_decimal_page_stats_iterator!( MaxDecimal64DataPageStatsIterator, - max, + max_values_iter, i64, from_bytes_to_i64 ); get_decimal_page_stats_iterator!( MinDecimal128DataPageStatsIterator, - min, + min_values_iter, i128, from_bytes_to_i128 ); get_decimal_page_stats_iterator!( MaxDecimal128DataPageStatsIterator, - max, + max_values_iter, i128, from_bytes_to_i128 ); get_decimal_page_stats_iterator!( MinDecimal256DataPageStatsIterator, - min, + min_values_iter, i256, from_bytes_to_i256 ); get_decimal_page_stats_iterator!( MaxDecimal256DataPageStatsIterator, - max, + max_values_iter, i256, from_bytes_to_i256 ); @@ -1181,7 +1193,7 @@ pub(crate) fn min_page_statistics<'a, I>( physical_type: Option, ) -> Result where - I: Iterator, + I: Iterator, { get_data_page_statistics!(Min, data_type, iterator, physical_type) } @@ -1194,7 +1206,7 @@ pub(crate) fn max_page_statistics<'a, I>( physical_type: Option, ) -> Result where - I: Iterator, + I: Iterator, { get_data_page_statistics!(Max, data_type, iterator, physical_type) } @@ -1205,46 +1217,13 @@ where /// The returned Array is an [`UInt64Array`] pub(crate) fn null_counts_page_statistics<'a, I>(iterator: I) -> Result where - I: Iterator, + I: Iterator, { let iter = iterator.flat_map(|(len, index)| match index { - Index::NONE => vec![None; len], - Index::BOOLEAN(native_index) => native_index - .indexes - .iter() - .map(|x| x.null_count.map(|x| x as u64)) - .collect::>(), - Index::INT32(native_index) => native_index - .indexes - .iter() - .map(|x| x.null_count.map(|x| x as u64)) - .collect::>(), - Index::INT64(native_index) => native_index - .indexes - .iter() - .map(|x| x.null_count.map(|x| x as u64)) - .collect::>(), - Index::FLOAT(native_index) => native_index - .indexes - .iter() - .map(|x| x.null_count.map(|x| x as u64)) - .collect::>(), - Index::DOUBLE(native_index) => native_index - .indexes - .iter() - .map(|x| x.null_count.map(|x| x as u64)) - .collect::>(), - Index::FIXED_LEN_BYTE_ARRAY(native_index) => native_index - .indexes - .iter() - .map(|x| x.null_count.map(|x| x as u64)) - .collect::>(), - Index::BYTE_ARRAY(native_index) => native_index - .indexes - .iter() - .map(|x| x.null_count.map(|x| x as u64)) - .collect::>(), - _ => unimplemented!(), + ColumnIndexMetaData::NONE => vec![None; len], + column_index => column_index.null_counts().map_or(vec![None; len], |v| { + v.iter().map(|i| Some(*i as u64)).collect::>() + }), }); Ok(UInt64Array::from_iter(iter)) diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index b030b61c4918..f13ac2aab407 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -165,6 +165,8 @@ impl ColumnIndex { } /// Returns the number of null values in the page indexed by `idx` + /// + /// Returns `None` if no null counts have been set in the index pub fn null_count(&self, idx: usize) -> Option { self.null_counts.as_ref().map(|nc| nc[idx]) } @@ -220,6 +222,7 @@ impl PrimitiveColumnIndex { let max = index.max_values[i]; max_values.push(T::try_from_le_slice(max)?); } else { + // need placeholders 
min_values.push(Default::default()); max_values.push(Default::default()); } @@ -238,16 +241,48 @@ impl PrimitiveColumnIndex { }) } - /// Returns an array containing the min values for each page + /// Returns an array containing the min values for each page. + /// + /// Values in the returned slice are only valid if [`ColumnIndex::is_null_page()`] + /// is `false` for the same index. pub fn min_values(&self) -> &[T] { &self.min_values } - /// Returns an array containing the max values for each page + /// Returns an array containing the max values for each page. + /// + /// Values in the returned slice are only valid if [`ColumnIndex::is_null_page()`] + /// is `false` for the same index. pub fn max_values(&self) -> &[T] { &self.max_values } + /// Returns an iterator over the min values. + /// + /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. + pub fn min_values_iter(&self) -> impl Iterator> { + self.min_values.iter().enumerate().map(|(i, min)| { + if self.is_null_page(i) { + None + } else { + Some(min) + } + }) + } + + /// Returns an iterator over the max values. + /// + /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. + pub fn max_values_iter(&self) -> impl Iterator> { + self.max_values.iter().enumerate().map(|(i, min)| { + if self.is_null_page(i) { + None + } else { + Some(min) + } + }) + } + /// Returns the min value for the page indexed by `idx` /// /// It is `None` when all values are null @@ -398,6 +433,32 @@ impl ByteArrayColumnIndex { } } + /// Returns an iterator over the min values. + /// + /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. + pub fn min_values_iter(&self) -> impl Iterator> { + (0..self.num_pages() as usize).into_iter().map(|i| { + if self.is_null_page(i) { + None + } else { + self.min_value(i) + } + }) + } + + /// Returns an iterator over the max values. + /// + /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. + pub fn max_values_iter(&self) -> impl Iterator> { + (0..self.num_pages() as usize).into_iter().map(|i| { + if self.is_null_page(i) { + None + } else { + self.max_value(i) + } + }) + } + pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex { let mut min_values = Vec::with_capacity(self.num_pages() as usize); for i in 0..self.num_pages() as usize { @@ -524,12 +585,31 @@ impl ColumnIndexMetaData { } } + /// Returns array of null counts, one per page. 
+ /// + /// Returns `None` if now null counts have been set in the index + pub fn null_counts(&self) -> Option<&Vec> { + match self { + Self::NONE => None, + Self::BOOLEAN(index) => index.null_counts.as_ref(), + Self::INT32(index) => index.null_counts.as_ref(), + Self::INT64(index) => index.null_counts.as_ref(), + Self::INT96(index) => index.null_counts.as_ref(), + Self::FLOAT(index) => index.null_counts.as_ref(), + Self::DOUBLE(index) => index.null_counts.as_ref(), + Self::BYTE_ARRAY(index) => index.null_counts.as_ref(), + Self::FIXED_LEN_BYTE_ARRAY(index) => index.null_counts.as_ref(), + } + } + /// Returns the number of pages pub fn num_pages(&self) -> u64 { colidx_enum_func!(self, num_pages) } /// Returns the number of null values in the page indexed by `idx` + /// + /// Returns `None` if no null counts have been set in the index pub fn null_count(&self, idx: usize) -> Option { colidx_enum_func!(self, null_count, idx) } From 79a6917efb7a8123329450de21ecce461729f796 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 21 Aug 2025 17:10:03 -0700 Subject: [PATCH 10/46] get rid of import --- parquet/src/file/writer.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 1808e88878e7..d0101aa84a35 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1062,7 +1062,6 @@ mod tests { use crate::column::reader::get_typed_column_reader; use crate::compression::{create_codec, Codec, CodecOptionsBuilder}; use crate::data_type::{BoolType, ByteArrayType, Int32Type}; - use crate::file::page_index::index::Index; use crate::file::page_index::index_reader::ColumnIndexMetaData; use crate::file::properties::EnabledStatistics; use crate::file::serialized_reader::ReadOptionsBuilder; From 878d4607dc3ac28d3b0e00b2b5647951cb48e329 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 21 Aug 2025 20:05:42 -0700 Subject: [PATCH 11/46] get parquet-index working --- parquet/src/arrow/arrow_reader/statistics.rs | 20 ++--- parquet/src/bin/parquet-index.rs | 89 ++++++++++++++++---- parquet/src/file/page_index/index_reader.rs | 72 ++++++++-------- 3 files changed, 118 insertions(+), 63 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index b719d81fe0a1..68dd36d0437a 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -654,42 +654,42 @@ make_data_page_stats_iterator!( min_values_iter, ColumnIndexMetaData::BOOLEAN, bool, - |m| m.clone() + |m| *m ); make_data_page_stats_iterator!( MaxBooleanDataPageStatsIterator, max_values_iter, ColumnIndexMetaData::BOOLEAN, bool, - |m| m.clone() + |m| *m ); make_data_page_stats_iterator!( MinInt32DataPageStatsIterator, min_values_iter, ColumnIndexMetaData::INT32, i32, - |m| m.clone() + |m| *m ); make_data_page_stats_iterator!( MaxInt32DataPageStatsIterator, max_values_iter, ColumnIndexMetaData::INT32, i32, - |m| m.clone() + |m| *m ); make_data_page_stats_iterator!( MinInt64DataPageStatsIterator, min_values_iter, ColumnIndexMetaData::INT64, i64, - |m| m.clone() + |m| *m ); make_data_page_stats_iterator!( MaxInt64DataPageStatsIterator, max_values_iter, ColumnIndexMetaData::INT64, i64, - |m| m.clone() + |m| *m ); make_data_page_stats_iterator!( MinFloat16DataPageStatsIterator, @@ -710,28 +710,28 @@ make_data_page_stats_iterator!( min_values_iter, ColumnIndexMetaData::FLOAT, f32, - |m| m.clone() + |m| *m ); make_data_page_stats_iterator!( MaxFloat32DataPageStatsIterator, max_values_iter, 
ColumnIndexMetaData::FLOAT, f32, - |m| m.clone() + |m| *m ); make_data_page_stats_iterator!( MinFloat64DataPageStatsIterator, min_values_iter, ColumnIndexMetaData::DOUBLE, f64, - |m| m.clone() + |m| *m ); make_data_page_stats_iterator!( MaxFloat64DataPageStatsIterator, max_values_iter, ColumnIndexMetaData::DOUBLE, f64, - |m| m.clone() + |m| *m ); make_data_page_stats_iterator!( MinByteArrayDataPageStatsIterator, diff --git a/parquet/src/bin/parquet-index.rs b/parquet/src/bin/parquet-index.rs index e91f5e5a9f17..161a1507a146 100644 --- a/parquet/src/bin/parquet-index.rs +++ b/parquet/src/bin/parquet-index.rs @@ -35,8 +35,11 @@ //! [page index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md use clap::Parser; +use parquet::data_type::ByteArray; use parquet::errors::{ParquetError, Result}; -use parquet::file::page_index::index::{Index, PageIndex}; +use parquet::file::page_index::index_reader::{ + ByteArrayColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex, +}; use parquet::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation}; use parquet::file::reader::{FileReader, SerializedFileReader}; use parquet::file::serialized_reader::ReadOptionsBuilder; @@ -96,16 +99,20 @@ impl Args { let row_counts = compute_row_counts(offset_index.page_locations.as_slice(), row_group.num_rows()); match &column_indices[column_idx] { - Index::NONE => println!("NO INDEX"), - Index::BOOLEAN(v) => print_index(&v.indexes, offset_index, &row_counts)?, - Index::INT32(v) => print_index(&v.indexes, offset_index, &row_counts)?, - Index::INT64(v) => print_index(&v.indexes, offset_index, &row_counts)?, - Index::INT96(v) => print_index(&v.indexes, offset_index, &row_counts)?, - Index::FLOAT(v) => print_index(&v.indexes, offset_index, &row_counts)?, - Index::DOUBLE(v) => print_index(&v.indexes, offset_index, &row_counts)?, - Index::BYTE_ARRAY(v) => print_index(&v.indexes, offset_index, &row_counts)?, - Index::FIXED_LEN_BYTE_ARRAY(v) => { - print_index(&v.indexes, offset_index, &row_counts)? + ColumnIndexMetaData::NONE => println!("NO INDEX"), + ColumnIndexMetaData::BOOLEAN(v) => { + print_index::(v, offset_index, &row_counts)? + } + ColumnIndexMetaData::INT32(v) => print_index(v, offset_index, &row_counts)?, + ColumnIndexMetaData::INT64(v) => print_index(v, offset_index, &row_counts)?, + ColumnIndexMetaData::INT96(v) => print_index(v, offset_index, &row_counts)?, + ColumnIndexMetaData::FLOAT(v) => print_index(v, offset_index, &row_counts)?, + ColumnIndexMetaData::DOUBLE(v) => print_index(v, offset_index, &row_counts)?, + ColumnIndexMetaData::BYTE_ARRAY(v) => { + print_bytes_index(v, offset_index, &row_counts)? + } + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(v) => { + print_bytes_index(v, offset_index, &row_counts)? 
} } } @@ -131,20 +138,21 @@ fn compute_row_counts(offset_index: &[PageLocation], rows: i64) -> Vec { /// Prints index information for a single column chunk fn print_index( - column_index: &[PageIndex], + column_index: &PrimitiveColumnIndex, offset_index: &OffsetIndexMetaData, row_counts: &[i64], ) -> Result<()> { - if column_index.len() != offset_index.page_locations.len() { + if column_index.num_pages() as usize != offset_index.page_locations.len() { return Err(ParquetError::General(format!( "Index length mismatch, got {} and {}", - column_index.len(), + column_index.num_pages(), offset_index.page_locations.len() ))); } - for (idx, ((c, o), row_count)) in column_index - .iter() + for (idx, (((min, max), o), row_count)) in column_index + .min_values_iter() + .zip(column_index.max_values_iter()) .zip(offset_index.page_locations()) .zip(row_counts) .enumerate() @@ -153,12 +161,12 @@ fn print_index( "Page {:>5} at offset {:#010x} with length {:>10} and row count {:>10}", idx, o.offset, o.compressed_page_size, row_count ); - match &c.min { + match min { Some(m) => print!(", min {m:>10}"), None => print!(", min {:>10}", "NONE"), } - match &c.max { + match max { Some(m) => print!(", max {m:>10}"), None => print!(", max {:>10}", "NONE"), } @@ -168,6 +176,51 @@ fn print_index( Ok(()) } +fn print_bytes_index( + column_index: &ByteArrayColumnIndex, + offset_index: &OffsetIndexMetaData, + row_counts: &[i64], +) -> Result<()> { + if column_index.num_pages() as usize != offset_index.page_locations.len() { + return Err(ParquetError::General(format!( + "Index length mismatch, got {} and {}", + column_index.num_pages(), + offset_index.page_locations.len() + ))); + } + + for (idx, (((min, max), o), row_count)) in column_index + .min_values_iter() + .zip(column_index.max_values_iter()) + .zip(offset_index.page_locations()) + .zip(row_counts) + .enumerate() + { + print!( + "Page {:>5} at offset {:#010x} with length {:>10} and row count {:>10}", + idx, o.offset, o.compressed_page_size, row_count + ); + match min { + Some(m) => match String::from_utf8(m.to_vec()) { + Ok(s) => print!(", min {s:>10}"), + Err(_) => print!(", min {:>10}", ByteArray::from(m)), + }, + None => print!(", min {:>10}", "NONE"), + } + + match max { + Some(m) => match String::from_utf8(m.to_vec()) { + Ok(s) => print!(", max {s:>10}"), + Err(_) => print!(", min {:>10}", ByteArray::from(m)), + }, + None => print!(", max {:>10}", "NONE"), + } + println!() + } + + Ok(()) +} + fn main() -> Result<()> { Args::parse().run() } diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index f13ac2aab407..d37ee789f728 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -201,7 +201,7 @@ impl ColumnIndex { /// Column index for primitive types #[derive(Debug, Clone, PartialEq)] -pub struct PrimitiveColumnIndex { +pub struct PrimitiveColumnIndex { pub(crate) column_index: ColumnIndex, pub(crate) min_values: Vec, pub(crate) max_values: Vec, @@ -241,6 +241,37 @@ impl PrimitiveColumnIndex { }) } + pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex { + let min_values = self + .min_values + .iter() + .map(|x| x.as_bytes().to_vec()) + .collect::>(); + + let max_values = self + .max_values + .iter() + .map(|x| x.as_bytes().to_vec()) + .collect::>(); + + let null_counts = self.null_counts.clone(); + let repetition_level_histograms = self.repetition_level_histograms.clone(); + let definition_level_histograms = 
self.definition_level_histograms.clone(); + let null_pages = self.null_pages.clone(); + + crate::format::ColumnIndex::new( + null_pages, + min_values, + max_values, + self.boundary_order.into(), + null_counts, + repetition_level_histograms, + definition_level_histograms, + ) + } +} + +impl PrimitiveColumnIndex { /// Returns an array containing the min values for each page. /// /// Values in the returned slice are only valid if [`ColumnIndex::is_null_page()`] @@ -304,38 +335,9 @@ impl PrimitiveColumnIndex { Some(&self.max_values[idx]) } } - - pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex { - let min_values = self - .min_values - .iter() - .map(|x| x.as_bytes().to_vec()) - .collect::>(); - - let max_values = self - .max_values - .iter() - .map(|x| x.as_bytes().to_vec()) - .collect::>(); - - let null_counts = self.null_counts.clone(); - let repetition_level_histograms = self.repetition_level_histograms.clone(); - let definition_level_histograms = self.definition_level_histograms.clone(); - let null_pages = self.null_pages.clone(); - - crate::format::ColumnIndex::new( - null_pages, - min_values, - max_values, - self.boundary_order.into(), - null_counts, - repetition_level_histograms, - definition_level_histograms, - ) - } } -impl Deref for PrimitiveColumnIndex { +impl Deref for PrimitiveColumnIndex { type Target = ColumnIndex; fn deref(&self) -> &Self::Target { @@ -437,7 +439,7 @@ impl ByteArrayColumnIndex { /// /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. pub fn min_values_iter(&self) -> impl Iterator> { - (0..self.num_pages() as usize).into_iter().map(|i| { + (0..self.num_pages() as usize).map(|i| { if self.is_null_page(i) { None } else { @@ -450,7 +452,7 @@ impl ByteArrayColumnIndex { /// /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. 
pub fn max_values_iter(&self) -> impl Iterator> { - (0..self.num_pages() as usize).into_iter().map(|i| { + (0..self.num_pages() as usize).map(|i| { if self.is_null_page(i) { None } else { @@ -462,12 +464,12 @@ impl ByteArrayColumnIndex { pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex { let mut min_values = Vec::with_capacity(self.num_pages() as usize); for i in 0..self.num_pages() as usize { - min_values.push(self.min_value(i).unwrap_or(&vec![]).to_owned()); + min_values.push(self.min_value(i).unwrap_or(&[]).to_owned()); } let mut max_values = Vec::with_capacity(self.num_pages() as usize); for i in 0..self.num_pages() as usize { - max_values.push(self.max_value(i).unwrap_or(&vec![]).to_owned()); + max_values.push(self.max_value(i).unwrap_or(&[]).to_owned()); } let null_counts = self.null_counts.clone(); From 009632a91d6b04e519593ceb154e217d955a9c05 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 21 Aug 2025 20:20:28 -0700 Subject: [PATCH 12/46] doc fixes --- parquet/src/arrow/arrow_reader/statistics.rs | 8 ++++---- parquet/src/file/page_index/index_reader.rs | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index 68dd36d0437a..c8d0c6581288 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -1186,7 +1186,7 @@ fn max_statistics<'a, I: Iterator>>( } /// Extracts the min statistics from an iterator -/// of parquet page [`Index`]'es to an [`ArrayRef`] +/// of parquet page [`ColumnIndexMetaData`]'s to an [`ArrayRef`] pub(crate) fn min_page_statistics<'a, I>( data_type: &DataType, iterator: I, @@ -1199,7 +1199,7 @@ where } /// Extracts the max statistics from an iterator -/// of parquet page [`Index`]'es to an [`ArrayRef`] +/// of parquet page [`ColumnIndexMetaData`]'s to an [`ArrayRef`] pub(crate) fn max_page_statistics<'a, I>( data_type: &DataType, iterator: I, @@ -1212,7 +1212,7 @@ where } /// Extracts the null count statistics from an iterator -/// of parquet page [`Index`]'es to an [`ArrayRef`] +/// of parquet page [`ColumnIndexMetaData`]'s to an [`ArrayRef`] /// /// The returned Array is an [`UInt64Array`] pub(crate) fn null_counts_page_statistics<'a, I>(iterator: I) -> Result @@ -1552,7 +1552,7 @@ impl<'a> StatisticsConverter<'a> { /// page level statistics can prune at a finer granularity. /// /// However since they are stored in a separate metadata - /// structure ([`Index`]) there is different code to extract them as + /// structure ([`ColumnIndexMetaData`]) there is different code to extract them as /// compared to arrow statistics. /// /// # Parameters: diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index d37ee789f728..d9358486ed84 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Support for reading [`Index`] and [`OffsetIndexMetaData`] from parquet metadata. +//! Support for reading [`ColumnIndexMetaData`] and [`OffsetIndexMetaData`] from parquet metadata. 
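// [editor's note: illustrative sketch, not part of the patch] The hunks above
// replace the old `Index`/`PageIndex` accessors with the typed
// `ColumnIndexMetaData` API (`num_pages`, `min_value`, `null_count`,
// `min_values_iter`, ...). A minimal consumer might look like the following;
// the `i32` payload of the `INT32` variant is assumed from its "32-bit integer
// type index" doc, and the printing is purely for illustration.
//
// fn dump_page_stats(index: &ColumnIndexMetaData) {
//     match index {
//         ColumnIndexMetaData::INT32(idx) => {
//             for i in 0..idx.num_pages() as usize {
//                 // `min_value`/`max_value` return `None` for all-null pages
//                 println!(
//                     "page {i}: min={:?} max={:?} nulls={:?}",
//                     idx.min_value(i),
//                     idx.max_value(i),
//                     idx.null_count(i)
//                 );
//             }
//         }
//         ColumnIndexMetaData::BYTE_ARRAY(idx) => {
//             for (min, max) in idx.min_values_iter().zip(idx.max_values_iter()) {
//                 println!("min={min:?} max={max:?}");
//             }
//         }
//         ColumnIndexMetaData::NONE => println!("no column index written"),
//         _ => {}
//     }
// }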
use crate::basic::{BoundaryOrder, Type}; use crate::data_type::private::ParquetValueType; @@ -38,7 +38,7 @@ pub(crate) fn acc_range(a: Option>, b: Option>) -> Option< } } -/// Reads per-column [`Index`] for all columns of a row group by +/// Reads per-column [`ColumnIndexMetaData`] for all columns of a row group by /// decoding [`ColumnIndex`] . /// /// Returns a vector of `index[column_number]`. From a822dfd1f97b1e7f3722a36e8cc98b989e029994 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 22 Aug 2025 08:35:47 -0700 Subject: [PATCH 13/46] move column index to its own module --- parquet/src/arrow/arrow_reader/statistics.rs | 2 +- parquet/src/arrow/arrow_writer/mod.rs | 2 +- parquet/src/bin/parquet-index.rs | 2 +- parquet/src/file/metadata/memory.rs | 4 +- parquet/src/file/metadata/mod.rs | 4 +- parquet/src/file/metadata/reader.rs | 2 +- parquet/src/file/metadata/writer.rs | 2 +- parquet/src/file/page_index/column_index.rs | 514 +++++++++++++++++++ parquet/src/file/page_index/index.rs | 2 +- parquet/src/file/page_index/index_reader.rs | 495 +----------------- parquet/src/file/page_index/mod.rs | 1 + parquet/src/file/serialized_reader.rs | 2 +- parquet/src/file/writer.rs | 2 +- parquet/tests/encryption/encryption_util.rs | 2 +- 14 files changed, 534 insertions(+), 502 deletions(-) create mode 100644 parquet/src/file/page_index/column_index.rs diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index c8d0c6581288..d98732f5d075 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -25,7 +25,7 @@ use crate::basic::Type as PhysicalType; use crate::data_type::{ByteArray, FixedLenByteArray}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ParquetColumnIndex, ParquetOffsetIndex, RowGroupMetaData}; -use crate::file::page_index::index_reader::ColumnIndexMetaData; +use crate::file::page_index::column_index::ColumnIndexMetaData; use crate::file::statistics::Statistics as ParquetStatistics; use crate::schema::types::SchemaDescriptor; use arrow_array::builder::{ diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 1041a1af1f77..bd9f30c36103 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1488,7 +1488,7 @@ mod tests { use crate::arrow::ARROW_SCHEMA_META_KEY; use crate::column::page::{Page, PageReader}; use crate::file::page_encoding_stats::PageEncodingStats; - use crate::file::page_index::index_reader::ColumnIndexMetaData; + use crate::file::page_index::column_index::ColumnIndexMetaData; use crate::file::reader::SerializedPageReader; use crate::format::PageHeader; use crate::schema::types::ColumnPath; diff --git a/parquet/src/bin/parquet-index.rs b/parquet/src/bin/parquet-index.rs index 161a1507a146..397a75c76ae4 100644 --- a/parquet/src/bin/parquet-index.rs +++ b/parquet/src/bin/parquet-index.rs @@ -37,7 +37,7 @@ use clap::Parser; use parquet::data_type::ByteArray; use parquet::errors::{ParquetError, Result}; -use parquet::file::page_index::index_reader::{ +use parquet::file::page_index::column_index::{ ByteArrayColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex, }; use parquet::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation}; diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index 5c1477e2cb14..69eee3c2999d 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -24,10 +24,10 @@ 
use crate::file::metadata::{ ColumnChunkMetaData, FileMetaData, KeyValue, RowGroupMetaData, SortingColumn, }; use crate::file::page_encoding_stats::PageEncodingStats; -use crate::file::page_index::index::{Index, NativeIndex, PageIndex}; -use crate::file::page_index::index_reader::{ +use crate::file::page_index::column_index::{ ByteArrayColumnIndex, ColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex, }; +use crate::file::page_index::index::{Index, NativeIndex, PageIndex}; use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation}; use crate::file::statistics::{Statistics, ValueStatistics}; use std::sync::Arc; diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index a619d76658e9..69cdf8f10714 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -106,7 +106,7 @@ use crate::file::column_crypto_metadata::{self, ColumnCryptoMetaData}; pub(crate) use crate::file::metadata::memory::HeapSize; use crate::file::{ page_encoding_stats::{self, PageEncodingStats}, - page_index::{index_reader::ColumnIndexMetaData, offset_index::PageLocation}, + page_index::{column_index::ColumnIndexMetaData, offset_index::PageLocation}, }; use crate::file::{ page_index::index::PageIndex, @@ -1948,7 +1948,7 @@ impl OffsetIndexBuilder { mod tests { use super::*; use crate::basic::{PageType, SortOrder}; - use crate::file::page_index::index_reader::{ColumnIndex, PrimitiveColumnIndex}; + use crate::file::page_index::column_index::{ColumnIndex, PrimitiveColumnIndex}; #[test] fn test_row_group_metadata_thrift_conversion() { diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 97ea72ef964c..57cc7c57ac66 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -34,7 +34,7 @@ use bytes::Bytes; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ColumnChunkMetaData, ParquetMetaData, RowGroupMetaData}; -use crate::file::page_index::index_reader::ColumnIndexMetaData; +use crate::file::page_index::column_index::ColumnIndexMetaData; use crate::file::page_index::index_reader::{acc_range, decode_column_index, decode_offset_index}; use crate::file::reader::ChunkReader; use crate::file::{FOOTER_SIZE, PARQUET_MAGIC, PARQUET_MAGIC_ENCR_FOOTER}; diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs index 8c485f7d0e8b..404bcf5dba8a 100644 --- a/parquet/src/file/metadata/writer.rs +++ b/parquet/src/file/metadata/writer.rs @@ -32,7 +32,7 @@ use crate::format::{AesGcmV1, ColumnCryptoMetaData}; use crate::schema::types; use crate::schema::types::{SchemaDescPtr, SchemaDescriptor, TypePtr}; use crate::thrift::TSerializable; -use crate::{errors::Result, file::page_index::index_reader::ColumnIndexMetaData}; +use crate::{errors::Result, file::page_index::column_index::ColumnIndexMetaData}; use std::io::Write; use std::sync::Arc; use thrift::protocol::TCompactOutputProtocol; diff --git a/parquet/src/file/page_index/column_index.rs b/parquet/src/file/page_index/column_index.rs new file mode 100644 index 000000000000..3fb6003e7c66 --- /dev/null +++ b/parquet/src/file/page_index/column_index.rs @@ -0,0 +1,514 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`ColumnIndexMetaData`] structures holding decoded [`ColumnIndex`] information +//! +//! [`ColumnIndex`]: crate::format::ColumnIndex +//! + +use crate::errors::Result; +use std::ops::Deref; + +use crate::{ + basic::BoundaryOrder, + data_type::{private::ParquetValueType, Int96}, + file::page_index::index_reader::ThriftColumnIndex, +}; + +/// Common bits of the column index +#[derive(Debug, Clone, PartialEq)] +pub struct ColumnIndex { + pub(crate) null_pages: Vec, + pub(crate) boundary_order: BoundaryOrder, + pub(crate) null_counts: Option>, + pub(crate) repetition_level_histograms: Option>, + pub(crate) definition_level_histograms: Option>, +} + +impl ColumnIndex { + /// Returns the number of pages + pub fn num_pages(&self) -> u64 { + self.null_pages.len() as u64 + } + + /// Returns the number of null values in the page indexed by `idx` + /// + /// Returns `None` if no null counts have been set in the index + pub fn null_count(&self, idx: usize) -> Option { + self.null_counts.as_ref().map(|nc| nc[idx]) + } + + /// Returns the repetition level histogram for the page indexed by `idx` + pub fn repetition_level_histogram(&self, idx: usize) -> Option<&[i64]> { + if let Some(rep_hists) = self.repetition_level_histograms.as_ref() { + let num_lvls = rep_hists.len() / self.num_pages() as usize; + let start = num_lvls * idx; + Some(&rep_hists[start..start + num_lvls]) + } else { + None + } + } + + /// Returns the definition level histogram for the page indexed by `idx` + pub fn definition_level_histogram(&self, idx: usize) -> Option<&[i64]> { + if let Some(def_hists) = self.definition_level_histograms.as_ref() { + let num_lvls = def_hists.len() / self.num_pages() as usize; + let start = num_lvls * idx; + Some(&def_hists[start..start + num_lvls]) + } else { + None + } + } + + /// Returns whether the page indexed by `idx` consists of all null values + pub fn is_null_page(&self, idx: usize) -> bool { + self.null_pages[idx] + } +} + +/// Column index for primitive types +#[derive(Debug, Clone, PartialEq)] +pub struct PrimitiveColumnIndex { + pub(crate) column_index: ColumnIndex, + pub(crate) min_values: Vec, + pub(crate) max_values: Vec, +} + +impl PrimitiveColumnIndex { + pub(super) fn try_new(index: ThriftColumnIndex) -> Result { + let len = index.null_pages.len(); + + let mut min_values = Vec::with_capacity(len); + let mut max_values = Vec::with_capacity(len); + + for (i, is_null) in index.null_pages.iter().enumerate().take(len) { + if !is_null { + let min = index.min_values[i]; + min_values.push(T::try_from_le_slice(min)?); + + let max = index.max_values[i]; + max_values.push(T::try_from_le_slice(max)?); + } else { + // need placeholders + min_values.push(Default::default()); + max_values.push(Default::default()); + } + } + + Ok(Self { + column_index: ColumnIndex { + null_pages: index.null_pages, + boundary_order: index.boundary_order, + null_counts: index.null_counts, + repetition_level_histograms: 
index.repetition_level_histograms, + definition_level_histograms: index.definition_level_histograms, + }, + min_values, + max_values, + }) + } + + pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex { + let min_values = self + .min_values + .iter() + .map(|x| x.as_bytes().to_vec()) + .collect::>(); + + let max_values = self + .max_values + .iter() + .map(|x| x.as_bytes().to_vec()) + .collect::>(); + + let null_counts = self.null_counts.clone(); + let repetition_level_histograms = self.repetition_level_histograms.clone(); + let definition_level_histograms = self.definition_level_histograms.clone(); + let null_pages = self.null_pages.clone(); + + crate::format::ColumnIndex::new( + null_pages, + min_values, + max_values, + self.boundary_order.into(), + null_counts, + repetition_level_histograms, + definition_level_histograms, + ) + } +} + +impl PrimitiveColumnIndex { + /// Returns an array containing the min values for each page. + /// + /// Values in the returned slice are only valid if [`ColumnIndex::is_null_page()`] + /// is `false` for the same index. + pub fn min_values(&self) -> &[T] { + &self.min_values + } + + /// Returns an array containing the max values for each page. + /// + /// Values in the returned slice are only valid if [`ColumnIndex::is_null_page()`] + /// is `false` for the same index. + pub fn max_values(&self) -> &[T] { + &self.max_values + } + + /// Returns an iterator over the min values. + /// + /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. + pub fn min_values_iter(&self) -> impl Iterator> { + self.min_values.iter().enumerate().map(|(i, min)| { + if self.is_null_page(i) { + None + } else { + Some(min) + } + }) + } + + /// Returns an iterator over the max values. + /// + /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. 
+ pub fn max_values_iter(&self) -> impl Iterator> { + self.max_values.iter().enumerate().map(|(i, min)| { + if self.is_null_page(i) { + None + } else { + Some(min) + } + }) + } + + /// Returns the min value for the page indexed by `idx` + /// + /// It is `None` when all values are null + pub fn min_value(&self, idx: usize) -> Option<&T> { + if self.null_pages[idx] { + None + } else { + Some(&self.min_values[idx]) + } + } + + /// Returns the max value for the page indexed by `idx` + /// + /// It is `None` when all values are null + pub fn max_value(&self, idx: usize) -> Option<&T> { + if self.null_pages[idx] { + None + } else { + Some(&self.max_values[idx]) + } + } +} + +impl Deref for PrimitiveColumnIndex { + type Target = ColumnIndex; + + fn deref(&self) -> &Self::Target { + &self.column_index + } +} + +/// Column index for byte arrays (fixed length and variable) +#[derive(Debug, Clone, PartialEq)] +pub struct ByteArrayColumnIndex { + pub(crate) column_index: ColumnIndex, + // raw bytes for min and max values + pub(crate) min_bytes: Vec, + pub(crate) min_offsets: Vec, + pub(crate) max_bytes: Vec, + pub(crate) max_offsets: Vec, +} + +impl ByteArrayColumnIndex { + pub(super) fn try_new(index: ThriftColumnIndex) -> Result { + let len = index.null_pages.len(); + + let min_len = index.min_values.iter().map(|&v| v.len()).sum(); + let max_len = index.max_values.iter().map(|&v| v.len()).sum(); + let mut min_bytes = vec![0u8; min_len]; + let mut max_bytes = vec![0u8; max_len]; + + let mut min_offsets = vec![0usize; len + 1]; + let mut max_offsets = vec![0usize; len + 1]; + + let mut min_pos = 0; + let mut max_pos = 0; + + for (i, is_null) in index.null_pages.iter().enumerate().take(len) { + if !is_null { + let min = index.min_values[i]; + let dst = &mut min_bytes[min_pos..min_pos + min.len()]; + dst.copy_from_slice(min); + min_offsets[i] = min_pos; + min_pos += min.len(); + + let max = index.max_values[i]; + let dst = &mut max_bytes[max_pos..max_pos + max.len()]; + dst.copy_from_slice(max); + max_offsets[i] = max_pos; + max_pos += max.len(); + } else { + min_offsets[i] = min_pos; + max_offsets[i] = max_pos; + } + } + + min_offsets[len] = min_pos; + max_offsets[len] = max_pos; + + Ok(Self { + column_index: ColumnIndex { + null_pages: index.null_pages, + boundary_order: index.boundary_order, + null_counts: index.null_counts, + repetition_level_histograms: index.repetition_level_histograms, + definition_level_histograms: index.definition_level_histograms, + }, + + min_bytes, + min_offsets, + max_bytes, + max_offsets, + }) + } + + /// Returns the min value for the page indexed by `idx` + /// + /// It is `None` when all values are null + pub fn min_value(&self, idx: usize) -> Option<&[u8]> { + if self.null_pages[idx] { + None + } else { + let start = self.min_offsets[idx]; + let end = self.min_offsets[idx + 1]; + Some(&self.min_bytes[start..end]) + } + } + + /// Returns the max value for the page indexed by `idx` + /// + /// It is `None` when all values are null + pub fn max_value(&self, idx: usize) -> Option<&[u8]> { + if self.null_pages[idx] { + None + } else { + let start = self.max_offsets[idx]; + let end = self.max_offsets[idx + 1]; + Some(&self.max_bytes[start..end]) + } + } + + /// Returns an iterator over the min values. + /// + /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. 
+ pub fn min_values_iter(&self) -> impl Iterator> { + (0..self.num_pages() as usize).map(|i| { + if self.is_null_page(i) { + None + } else { + self.min_value(i) + } + }) + } + + /// Returns an iterator over the max values. + /// + /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. + pub fn max_values_iter(&self) -> impl Iterator> { + (0..self.num_pages() as usize).map(|i| { + if self.is_null_page(i) { + None + } else { + self.max_value(i) + } + }) + } + + pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex { + let mut min_values = Vec::with_capacity(self.num_pages() as usize); + for i in 0..self.num_pages() as usize { + min_values.push(self.min_value(i).unwrap_or(&[]).to_owned()); + } + + let mut max_values = Vec::with_capacity(self.num_pages() as usize); + for i in 0..self.num_pages() as usize { + max_values.push(self.max_value(i).unwrap_or(&[]).to_owned()); + } + + let null_counts = self.null_counts.clone(); + let repetition_level_histograms = self.repetition_level_histograms.clone(); + let definition_level_histograms = self.definition_level_histograms.clone(); + let null_pages = self.null_pages.clone(); + + crate::format::ColumnIndex::new( + null_pages, + min_values, + max_values, + self.boundary_order.into(), + null_counts, + repetition_level_histograms, + definition_level_histograms, + ) + } +} + +impl Deref for ByteArrayColumnIndex { + type Target = ColumnIndex; + + fn deref(&self) -> &Self::Target { + &self.column_index + } +} + +// Macro to generate getter functions for ColumnIndexMetaData. +macro_rules! colidx_enum_func { + ($self:ident, $func:ident, $arg:ident) => {{ + match *$self { + Self::BOOLEAN(ref typed) => typed.$func($arg), + Self::INT32(ref typed) => typed.$func($arg), + Self::INT64(ref typed) => typed.$func($arg), + Self::INT96(ref typed) => typed.$func($arg), + Self::FLOAT(ref typed) => typed.$func($arg), + Self::DOUBLE(ref typed) => typed.$func($arg), + Self::BYTE_ARRAY(ref typed) => typed.$func($arg), + Self::FIXED_LEN_BYTE_ARRAY(ref typed) => typed.$func($arg), + _ => panic!(concat!( + "Cannot call ", + stringify!($func), + " on ColumnIndexMetaData::NONE" + )), + } + }}; + ($self:ident, $func:ident) => {{ + match *$self { + Self::BOOLEAN(ref typed) => typed.$func(), + Self::INT32(ref typed) => typed.$func(), + Self::INT64(ref typed) => typed.$func(), + Self::INT96(ref typed) => typed.$func(), + Self::FLOAT(ref typed) => typed.$func(), + Self::DOUBLE(ref typed) => typed.$func(), + Self::BYTE_ARRAY(ref typed) => typed.$func(), + Self::FIXED_LEN_BYTE_ARRAY(ref typed) => typed.$func(), + _ => panic!(concat!( + "Cannot call ", + stringify!($func), + " on ColumnIndexMetaData::NONE" + )), + } + }}; +} + +/// index +#[derive(Debug, Clone, PartialEq)] +#[allow(non_camel_case_types)] +pub enum ColumnIndexMetaData { + /// Sometimes reading page index from parquet file + /// will only return pageLocations without min_max index, + /// `NONE` represents this lack of index information + NONE, + /// Boolean type index + BOOLEAN(PrimitiveColumnIndex), + /// 32-bit integer type index + INT32(PrimitiveColumnIndex), + /// 64-bit integer type index + INT64(PrimitiveColumnIndex), + /// 96-bit integer type (timestamp) index + INT96(PrimitiveColumnIndex), + /// 32-bit floating point type index + FLOAT(PrimitiveColumnIndex), + /// 64-bit floating point type index + DOUBLE(PrimitiveColumnIndex), + /// Byte array type index + BYTE_ARRAY(ByteArrayColumnIndex), + /// Fixed length byte array type index + FIXED_LEN_BYTE_ARRAY(ByteArrayColumnIndex), +} + +impl 
ColumnIndexMetaData { + /// Return min/max elements inside ColumnIndex are ordered or not. + pub fn is_sorted(&self) -> bool { + // 0:UNORDERED, 1:ASCENDING ,2:DESCENDING, + if let Some(order) = self.get_boundary_order() { + order != BoundaryOrder::UNORDERED + } else { + false + } + } + + /// Get boundary_order of this page index. + pub fn get_boundary_order(&self) -> Option { + match self { + Self::NONE => None, + Self::BOOLEAN(index) => Some(index.boundary_order), + Self::INT32(index) => Some(index.boundary_order), + Self::INT64(index) => Some(index.boundary_order), + Self::INT96(index) => Some(index.boundary_order), + Self::FLOAT(index) => Some(index.boundary_order), + Self::DOUBLE(index) => Some(index.boundary_order), + Self::BYTE_ARRAY(index) => Some(index.boundary_order), + Self::FIXED_LEN_BYTE_ARRAY(index) => Some(index.boundary_order), + } + } + + /// Returns array of null counts, one per page. + /// + /// Returns `None` if now null counts have been set in the index + pub fn null_counts(&self) -> Option<&Vec> { + match self { + Self::NONE => None, + Self::BOOLEAN(index) => index.null_counts.as_ref(), + Self::INT32(index) => index.null_counts.as_ref(), + Self::INT64(index) => index.null_counts.as_ref(), + Self::INT96(index) => index.null_counts.as_ref(), + Self::FLOAT(index) => index.null_counts.as_ref(), + Self::DOUBLE(index) => index.null_counts.as_ref(), + Self::BYTE_ARRAY(index) => index.null_counts.as_ref(), + Self::FIXED_LEN_BYTE_ARRAY(index) => index.null_counts.as_ref(), + } + } + + /// Returns the number of pages + pub fn num_pages(&self) -> u64 { + colidx_enum_func!(self, num_pages) + } + + /// Returns the number of null values in the page indexed by `idx` + /// + /// Returns `None` if no null counts have been set in the index + pub fn null_count(&self, idx: usize) -> Option { + colidx_enum_func!(self, null_count, idx) + } + + /// Returns the repetition level histogram for the page indexed by `idx` + pub fn repetition_level_histogram(&self, idx: usize) -> Option<&[i64]> { + colidx_enum_func!(self, repetition_level_histogram, idx) + } + + /// Returns the definition level histogram for the page indexed by `idx` + pub fn definition_level_histogram(&self, idx: usize) -> Option<&[i64]> { + colidx_enum_func!(self, definition_level_histogram, idx) + } + + /// Returns whether the page indexed by `idx` consists of all null values + pub fn is_null_page(&self, idx: usize) -> bool { + colidx_enum_func!(self, is_null_page, idx) + } +} diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index 22d6e92666db..861dc0c3b04e 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -311,7 +311,7 @@ impl NativeIndex { /// Creates a new [`NativeIndex`] #[allow(dead_code)] - pub(crate) fn try_new_local(index: ThriftColumnIndex) -> Result { + pub(super) fn try_new_local(index: ThriftColumnIndex) -> Result { let len = index.min_values.len(); // turn Option> into Vec> diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index d9358486ed84..f35241689e1c 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -18,15 +18,17 @@ //! Support for reading [`ColumnIndexMetaData`] and [`OffsetIndexMetaData`] from parquet metadata. 
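// [editor's note: illustrative sketch, not part of the patch] After this move,
// callers import the typed indexes from `page_index::column_index` (as the
// updated `parquet-index.rs` and `statistics.rs` hunks in this patch do),
// while `index_reader` keeps only the decoding entry points. A hedged example
// of downstream usage; the helper function below is hypothetical:
//
// use parquet::file::page_index::column_index::{ByteArrayColumnIndex, ColumnIndexMetaData};
//
// /// Smallest page-level min across all non-null pages, rendered as UTF-8 for display.
// fn smallest_min(idx: &ByteArrayColumnIndex) -> Option<String> {
//     idx.min_values_iter()
//         .flatten() // skip all-null pages
//         .map(|bytes| String::from_utf8_lossy(bytes).into_owned())
//         .min()
// }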
use crate::basic::{BoundaryOrder, Type}; -use crate::data_type::private::ParquetValueType; use crate::data_type::Int96; use crate::errors::{ParquetError, Result}; use crate::file::metadata::ColumnChunkMetaData; +use crate::file::page_index::column_index::{ + ByteArrayColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex, +}; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::reader::ChunkReader; use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol}; use crate::thrift_struct; -use std::ops::{Deref, Range}; +use std::ops::Range; /// Computes the covering range of two optional ranges /// @@ -134,8 +136,9 @@ pub(crate) fn decode_offset_index(data: &[u8]) -> Result { +pub(super) struct ThriftColumnIndex<'a> { 1: required list null_pages 2: required list<'a> min_values 3: required list<'a> max_values @@ -146,492 +149,6 @@ pub(crate) struct ThriftColumnIndex<'a> { } ); -// TODO: the following should move to its own module - -/// Common bits of the column index -#[derive(Debug, Clone, PartialEq)] -pub struct ColumnIndex { - pub(crate) null_pages: Vec, - pub(crate) boundary_order: BoundaryOrder, - pub(crate) null_counts: Option>, - pub(crate) repetition_level_histograms: Option>, - pub(crate) definition_level_histograms: Option>, -} - -impl ColumnIndex { - /// Returns the number of pages - pub fn num_pages(&self) -> u64 { - self.null_pages.len() as u64 - } - - /// Returns the number of null values in the page indexed by `idx` - /// - /// Returns `None` if no null counts have been set in the index - pub fn null_count(&self, idx: usize) -> Option { - self.null_counts.as_ref().map(|nc| nc[idx]) - } - - /// Returns the repetition level histogram for the page indexed by `idx` - pub fn repetition_level_histogram(&self, idx: usize) -> Option<&[i64]> { - if let Some(rep_hists) = self.repetition_level_histograms.as_ref() { - let num_lvls = rep_hists.len() / self.num_pages() as usize; - let start = num_lvls * idx; - Some(&rep_hists[start..start + num_lvls]) - } else { - None - } - } - - /// Returns the definition level histogram for the page indexed by `idx` - pub fn definition_level_histogram(&self, idx: usize) -> Option<&[i64]> { - if let Some(def_hists) = self.definition_level_histograms.as_ref() { - let num_lvls = def_hists.len() / self.num_pages() as usize; - let start = num_lvls * idx; - Some(&def_hists[start..start + num_lvls]) - } else { - None - } - } - - /// Returns whether the page indexed by `idx` consists of all null values - pub fn is_null_page(&self, idx: usize) -> bool { - self.null_pages[idx] - } -} - -/// Column index for primitive types -#[derive(Debug, Clone, PartialEq)] -pub struct PrimitiveColumnIndex { - pub(crate) column_index: ColumnIndex, - pub(crate) min_values: Vec, - pub(crate) max_values: Vec, -} - -impl PrimitiveColumnIndex { - fn try_new(index: ThriftColumnIndex) -> Result { - let len = index.null_pages.len(); - - let mut min_values = Vec::with_capacity(len); - let mut max_values = Vec::with_capacity(len); - - for (i, is_null) in index.null_pages.iter().enumerate().take(len) { - if !is_null { - let min = index.min_values[i]; - min_values.push(T::try_from_le_slice(min)?); - - let max = index.max_values[i]; - max_values.push(T::try_from_le_slice(max)?); - } else { - // need placeholders - min_values.push(Default::default()); - max_values.push(Default::default()); - } - } - - Ok(Self { - column_index: ColumnIndex { - null_pages: index.null_pages, - boundary_order: index.boundary_order, - null_counts: index.null_counts, - 
repetition_level_histograms: index.repetition_level_histograms, - definition_level_histograms: index.definition_level_histograms, - }, - min_values, - max_values, - }) - } - - pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex { - let min_values = self - .min_values - .iter() - .map(|x| x.as_bytes().to_vec()) - .collect::>(); - - let max_values = self - .max_values - .iter() - .map(|x| x.as_bytes().to_vec()) - .collect::>(); - - let null_counts = self.null_counts.clone(); - let repetition_level_histograms = self.repetition_level_histograms.clone(); - let definition_level_histograms = self.definition_level_histograms.clone(); - let null_pages = self.null_pages.clone(); - - crate::format::ColumnIndex::new( - null_pages, - min_values, - max_values, - self.boundary_order.into(), - null_counts, - repetition_level_histograms, - definition_level_histograms, - ) - } -} - -impl PrimitiveColumnIndex { - /// Returns an array containing the min values for each page. - /// - /// Values in the returned slice are only valid if [`ColumnIndex::is_null_page()`] - /// is `false` for the same index. - pub fn min_values(&self) -> &[T] { - &self.min_values - } - - /// Returns an array containing the max values for each page. - /// - /// Values in the returned slice are only valid if [`ColumnIndex::is_null_page()`] - /// is `false` for the same index. - pub fn max_values(&self) -> &[T] { - &self.max_values - } - - /// Returns an iterator over the min values. - /// - /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. - pub fn min_values_iter(&self) -> impl Iterator> { - self.min_values.iter().enumerate().map(|(i, min)| { - if self.is_null_page(i) { - None - } else { - Some(min) - } - }) - } - - /// Returns an iterator over the max values. - /// - /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. 
- pub fn max_values_iter(&self) -> impl Iterator> { - self.max_values.iter().enumerate().map(|(i, min)| { - if self.is_null_page(i) { - None - } else { - Some(min) - } - }) - } - - /// Returns the min value for the page indexed by `idx` - /// - /// It is `None` when all values are null - pub fn min_value(&self, idx: usize) -> Option<&T> { - if self.null_pages[idx] { - None - } else { - Some(&self.min_values[idx]) - } - } - - /// Returns the max value for the page indexed by `idx` - /// - /// It is `None` when all values are null - pub fn max_value(&self, idx: usize) -> Option<&T> { - if self.null_pages[idx] { - None - } else { - Some(&self.max_values[idx]) - } - } -} - -impl Deref for PrimitiveColumnIndex { - type Target = ColumnIndex; - - fn deref(&self) -> &Self::Target { - &self.column_index - } -} - -/// Column index for byte arrays (fixed length and variable) -#[derive(Debug, Clone, PartialEq)] -pub struct ByteArrayColumnIndex { - pub(crate) column_index: ColumnIndex, - // raw bytes for min and max values - pub(crate) min_bytes: Vec, - pub(crate) min_offsets: Vec, - pub(crate) max_bytes: Vec, - pub(crate) max_offsets: Vec, -} - -impl ByteArrayColumnIndex { - fn try_new(index: ThriftColumnIndex) -> Result { - let len = index.null_pages.len(); - - let min_len = index.min_values.iter().map(|&v| v.len()).sum(); - let max_len = index.max_values.iter().map(|&v| v.len()).sum(); - let mut min_bytes = vec![0u8; min_len]; - let mut max_bytes = vec![0u8; max_len]; - - let mut min_offsets = vec![0usize; len + 1]; - let mut max_offsets = vec![0usize; len + 1]; - - let mut min_pos = 0; - let mut max_pos = 0; - - for (i, is_null) in index.null_pages.iter().enumerate().take(len) { - if !is_null { - let min = index.min_values[i]; - let dst = &mut min_bytes[min_pos..min_pos + min.len()]; - dst.copy_from_slice(min); - min_offsets[i] = min_pos; - min_pos += min.len(); - - let max = index.max_values[i]; - let dst = &mut max_bytes[max_pos..max_pos + max.len()]; - dst.copy_from_slice(max); - max_offsets[i] = max_pos; - max_pos += max.len(); - } else { - min_offsets[i] = min_pos; - max_offsets[i] = max_pos; - } - } - - min_offsets[len] = min_pos; - max_offsets[len] = max_pos; - - Ok(Self { - column_index: ColumnIndex { - null_pages: index.null_pages, - boundary_order: index.boundary_order, - null_counts: index.null_counts, - repetition_level_histograms: index.repetition_level_histograms, - definition_level_histograms: index.definition_level_histograms, - }, - - min_bytes, - min_offsets, - max_bytes, - max_offsets, - }) - } - - /// Returns the min value for the page indexed by `idx` - /// - /// It is `None` when all values are null - pub fn min_value(&self, idx: usize) -> Option<&[u8]> { - if self.null_pages[idx] { - None - } else { - let start = self.min_offsets[idx]; - let end = self.min_offsets[idx + 1]; - Some(&self.min_bytes[start..end]) - } - } - - /// Returns the max value for the page indexed by `idx` - /// - /// It is `None` when all values are null - pub fn max_value(&self, idx: usize) -> Option<&[u8]> { - if self.null_pages[idx] { - None - } else { - let start = self.max_offsets[idx]; - let end = self.max_offsets[idx + 1]; - Some(&self.max_bytes[start..end]) - } - } - - /// Returns an iterator over the min values. - /// - /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. 
- pub fn min_values_iter(&self) -> impl Iterator> { - (0..self.num_pages() as usize).map(|i| { - if self.is_null_page(i) { - None - } else { - self.min_value(i) - } - }) - } - - /// Returns an iterator over the max values. - /// - /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`. - pub fn max_values_iter(&self) -> impl Iterator> { - (0..self.num_pages() as usize).map(|i| { - if self.is_null_page(i) { - None - } else { - self.max_value(i) - } - }) - } - - pub(crate) fn to_thrift(&self) -> crate::format::ColumnIndex { - let mut min_values = Vec::with_capacity(self.num_pages() as usize); - for i in 0..self.num_pages() as usize { - min_values.push(self.min_value(i).unwrap_or(&[]).to_owned()); - } - - let mut max_values = Vec::with_capacity(self.num_pages() as usize); - for i in 0..self.num_pages() as usize { - max_values.push(self.max_value(i).unwrap_or(&[]).to_owned()); - } - - let null_counts = self.null_counts.clone(); - let repetition_level_histograms = self.repetition_level_histograms.clone(); - let definition_level_histograms = self.definition_level_histograms.clone(); - let null_pages = self.null_pages.clone(); - - crate::format::ColumnIndex::new( - null_pages, - min_values, - max_values, - self.boundary_order.into(), - null_counts, - repetition_level_histograms, - definition_level_histograms, - ) - } -} - -impl Deref for ByteArrayColumnIndex { - type Target = ColumnIndex; - - fn deref(&self) -> &Self::Target { - &self.column_index - } -} - -// Macro to generate getter functions for ColumnIndexMetaData. -macro_rules! colidx_enum_func { - ($self:ident, $func:ident, $arg:ident) => {{ - match *$self { - Self::BOOLEAN(ref typed) => typed.$func($arg), - Self::INT32(ref typed) => typed.$func($arg), - Self::INT64(ref typed) => typed.$func($arg), - Self::INT96(ref typed) => typed.$func($arg), - Self::FLOAT(ref typed) => typed.$func($arg), - Self::DOUBLE(ref typed) => typed.$func($arg), - Self::BYTE_ARRAY(ref typed) => typed.$func($arg), - Self::FIXED_LEN_BYTE_ARRAY(ref typed) => typed.$func($arg), - _ => panic!(concat!( - "Cannot call ", - stringify!($func), - " on ColumnIndexMetaData::NONE" - )), - } - }}; - ($self:ident, $func:ident) => {{ - match *$self { - Self::BOOLEAN(ref typed) => typed.$func(), - Self::INT32(ref typed) => typed.$func(), - Self::INT64(ref typed) => typed.$func(), - Self::INT96(ref typed) => typed.$func(), - Self::FLOAT(ref typed) => typed.$func(), - Self::DOUBLE(ref typed) => typed.$func(), - Self::BYTE_ARRAY(ref typed) => typed.$func(), - Self::FIXED_LEN_BYTE_ARRAY(ref typed) => typed.$func(), - _ => panic!(concat!( - "Cannot call ", - stringify!($func), - " on ColumnIndexMetaData::NONE" - )), - } - }}; -} - -/// index -#[derive(Debug, Clone, PartialEq)] -#[allow(non_camel_case_types)] -pub enum ColumnIndexMetaData { - /// Sometimes reading page index from parquet file - /// will only return pageLocations without min_max index, - /// `NONE` represents this lack of index information - NONE, - /// Boolean type index - BOOLEAN(PrimitiveColumnIndex), - /// 32-bit integer type index - INT32(PrimitiveColumnIndex), - /// 64-bit integer type index - INT64(PrimitiveColumnIndex), - /// 96-bit integer type (timestamp) index - INT96(PrimitiveColumnIndex), - /// 32-bit floating point type index - FLOAT(PrimitiveColumnIndex), - /// 64-bit floating point type index - DOUBLE(PrimitiveColumnIndex), - /// Byte array type index - BYTE_ARRAY(ByteArrayColumnIndex), - /// Fixed length byte array type index - FIXED_LEN_BYTE_ARRAY(ByteArrayColumnIndex), -} - -impl 
ColumnIndexMetaData { - /// Return min/max elements inside ColumnIndex are ordered or not. - pub fn is_sorted(&self) -> bool { - // 0:UNORDERED, 1:ASCENDING ,2:DESCENDING, - if let Some(order) = self.get_boundary_order() { - order != BoundaryOrder::UNORDERED - } else { - false - } - } - - /// Get boundary_order of this page index. - pub fn get_boundary_order(&self) -> Option { - match self { - Self::NONE => None, - Self::BOOLEAN(index) => Some(index.boundary_order), - Self::INT32(index) => Some(index.boundary_order), - Self::INT64(index) => Some(index.boundary_order), - Self::INT96(index) => Some(index.boundary_order), - Self::FLOAT(index) => Some(index.boundary_order), - Self::DOUBLE(index) => Some(index.boundary_order), - Self::BYTE_ARRAY(index) => Some(index.boundary_order), - Self::FIXED_LEN_BYTE_ARRAY(index) => Some(index.boundary_order), - } - } - - /// Returns array of null counts, one per page. - /// - /// Returns `None` if now null counts have been set in the index - pub fn null_counts(&self) -> Option<&Vec> { - match self { - Self::NONE => None, - Self::BOOLEAN(index) => index.null_counts.as_ref(), - Self::INT32(index) => index.null_counts.as_ref(), - Self::INT64(index) => index.null_counts.as_ref(), - Self::INT96(index) => index.null_counts.as_ref(), - Self::FLOAT(index) => index.null_counts.as_ref(), - Self::DOUBLE(index) => index.null_counts.as_ref(), - Self::BYTE_ARRAY(index) => index.null_counts.as_ref(), - Self::FIXED_LEN_BYTE_ARRAY(index) => index.null_counts.as_ref(), - } - } - - /// Returns the number of pages - pub fn num_pages(&self) -> u64 { - colidx_enum_func!(self, num_pages) - } - - /// Returns the number of null values in the page indexed by `idx` - /// - /// Returns `None` if no null counts have been set in the index - pub fn null_count(&self, idx: usize) -> Option { - colidx_enum_func!(self, null_count, idx) - } - - /// Returns the repetition level histogram for the page indexed by `idx` - pub fn repetition_level_histogram(&self, idx: usize) -> Option<&[i64]> { - colidx_enum_func!(self, repetition_level_histogram, idx) - } - - /// Returns the definition level histogram for the page indexed by `idx` - pub fn definition_level_histogram(&self, idx: usize) -> Option<&[i64]> { - colidx_enum_func!(self, definition_level_histogram, idx) - } - - /// Returns whether the page indexed by `idx` consists of all null values - pub fn is_null_page(&self, idx: usize) -> bool { - colidx_enum_func!(self, is_null_page, idx) - } -} - pub(crate) fn decode_column_index( data: &[u8], column_type: Type, diff --git a/parquet/src/file/page_index/mod.rs b/parquet/src/file/page_index/mod.rs index a8077896db34..ff70e2eca5dd 100644 --- a/parquet/src/file/page_index/mod.rs +++ b/parquet/src/file/page_index/mod.rs @@ -19,6 +19,7 @@ //! //! 
[Column Index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md +pub mod column_index; pub mod index; pub mod index_reader; pub mod offset_index; diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 101599d3246e..5308825b0976 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -1102,7 +1102,7 @@ mod tests { use bytes::Buf; - use crate::file::page_index::index_reader::{ + use crate::file::page_index::column_index::{ ByteArrayColumnIndex, ColumnIndexMetaData, PrimitiveColumnIndex, }; use crate::file::properties::{EnabledStatistics, WriterProperties}; diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index d0101aa84a35..65b96246ea03 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1062,7 +1062,7 @@ mod tests { use crate::column::reader::get_typed_column_reader; use crate::compression::{create_codec, Codec, CodecOptionsBuilder}; use crate::data_type::{BoolType, ByteArrayType, Int32Type}; - use crate::file::page_index::index_reader::ColumnIndexMetaData; + use crate::file::page_index::column_index::ColumnIndexMetaData; use crate::file::properties::EnabledStatistics; use crate::file::serialized_reader::ReadOptionsBuilder; use crate::file::{ diff --git a/parquet/tests/encryption/encryption_util.rs b/parquet/tests/encryption/encryption_util.rs index 549bdec47343..6817491b3024 100644 --- a/parquet/tests/encryption/encryption_util.rs +++ b/parquet/tests/encryption/encryption_util.rs @@ -191,7 +191,7 @@ pub(crate) fn verify_column_indexes(metadata: &ParquetMetaData) { let column_index = &column_index[0][float_col_idx]; match column_index { - parquet::file::page_index::index_reader::ColumnIndexMetaData::FLOAT(float_index) => { + parquet::file::page_index::column_index::ColumnIndexMetaData::FLOAT(float_index) => { assert_eq!(float_index.num_pages(), 1); assert_eq!(float_index.min_value(0), Some(&0.0f32)); assert!(float_index From 20df075edd7f71e5c5b5127ddea07582861d7c93 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 22 Aug 2025 11:21:35 -0700 Subject: [PATCH 14/46] add ColumnIndexIterators trait, simplify stats converter a little --- parquet/src/arrow/arrow_reader/statistics.rs | 123 +++---------------- parquet/src/file/page_index/column_index.rs | 57 ++++++++- 2 files changed, 76 insertions(+), 104 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index d98732f5d075..21a06050d849 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -25,7 +25,7 @@ use crate::basic::Type as PhysicalType; use crate::data_type::{ByteArray, FixedLenByteArray}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ParquetColumnIndex, ParquetOffsetIndex, RowGroupMetaData}; -use crate::file::page_index::column_index::ColumnIndexMetaData; +use crate::file::page_index::column_index::{ColumnIndexIterators, ColumnIndexMetaData}; use crate::file::statistics::Statistics as ParquetStatistics; use crate::schema::types::SchemaDescriptor; use arrow_array::builder::{ @@ -597,7 +597,7 @@ macro_rules! get_statistics { } macro_rules! make_data_page_stats_iterator { - ($iterator_type: ident, $func: ident, $index_type: path, $stat_value_type: ty, $conv:expr) => { + ($iterator_type: ident, $func: ident, $stat_value_type: ty) => { struct $iterator_type<'a, I> where I: Iterator, @@ -624,19 +624,8 @@ macro_rules! 
make_data_page_stats_iterator { let next = self.iter.next(); match next { Some((len, index)) => match index { - $index_type(native_index) => Some( - native_index - .$func() - .map(|v| v.map($conv)) - .collect::>(), - ), - // No matching `Index` found; - // thus no statistics that can be extracted. - // We return vec![None; len] to effectively - // create an arrow null-array with the length - // corresponding to the number of entries in - // `ParquetOffsetIndex` per row group per column. - _ => Some(vec![None; len]), + ColumnIndexMetaData::NONE => Some(vec![None; len]), + _ => Some(<$stat_value_type>::$func(&index).collect::>()), }, _ => None, } @@ -649,118 +638,46 @@ macro_rules! make_data_page_stats_iterator { }; } -make_data_page_stats_iterator!( - MinBooleanDataPageStatsIterator, - min_values_iter, - ColumnIndexMetaData::BOOLEAN, - bool, - |m| *m -); -make_data_page_stats_iterator!( - MaxBooleanDataPageStatsIterator, - max_values_iter, - ColumnIndexMetaData::BOOLEAN, - bool, - |m| *m -); -make_data_page_stats_iterator!( - MinInt32DataPageStatsIterator, - min_values_iter, - ColumnIndexMetaData::INT32, - i32, - |m| *m -); -make_data_page_stats_iterator!( - MaxInt32DataPageStatsIterator, - max_values_iter, - ColumnIndexMetaData::INT32, - i32, - |m| *m -); -make_data_page_stats_iterator!( - MinInt64DataPageStatsIterator, - min_values_iter, - ColumnIndexMetaData::INT64, - i64, - |m| *m -); -make_data_page_stats_iterator!( - MaxInt64DataPageStatsIterator, - max_values_iter, - ColumnIndexMetaData::INT64, - i64, - |m| *m -); +make_data_page_stats_iterator!(MinBooleanDataPageStatsIterator, min_values_iter, bool); +make_data_page_stats_iterator!(MaxBooleanDataPageStatsIterator, max_values_iter, bool); +make_data_page_stats_iterator!(MinInt32DataPageStatsIterator, min_values_iter, i32); +make_data_page_stats_iterator!(MaxInt32DataPageStatsIterator, max_values_iter, i32); +make_data_page_stats_iterator!(MinInt64DataPageStatsIterator, min_values_iter, i64); +make_data_page_stats_iterator!(MaxInt64DataPageStatsIterator, max_values_iter, i64); make_data_page_stats_iterator!( MinFloat16DataPageStatsIterator, min_values_iter, - ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY, - FixedLenByteArray, - |m| FixedLenByteArray::from(m.to_owned()) + FixedLenByteArray ); make_data_page_stats_iterator!( MaxFloat16DataPageStatsIterator, max_values_iter, - ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY, - FixedLenByteArray, - |m| FixedLenByteArray::from(m.to_owned()) -); -make_data_page_stats_iterator!( - MinFloat32DataPageStatsIterator, - min_values_iter, - ColumnIndexMetaData::FLOAT, - f32, - |m| *m -); -make_data_page_stats_iterator!( - MaxFloat32DataPageStatsIterator, - max_values_iter, - ColumnIndexMetaData::FLOAT, - f32, - |m| *m -); -make_data_page_stats_iterator!( - MinFloat64DataPageStatsIterator, - min_values_iter, - ColumnIndexMetaData::DOUBLE, - f64, - |m| *m -); -make_data_page_stats_iterator!( - MaxFloat64DataPageStatsIterator, - max_values_iter, - ColumnIndexMetaData::DOUBLE, - f64, - |m| *m + FixedLenByteArray ); +make_data_page_stats_iterator!(MinFloat32DataPageStatsIterator, min_values_iter, f32); +make_data_page_stats_iterator!(MaxFloat32DataPageStatsIterator, max_values_iter, f32); +make_data_page_stats_iterator!(MinFloat64DataPageStatsIterator, min_values_iter, f64); +make_data_page_stats_iterator!(MaxFloat64DataPageStatsIterator, max_values_iter, f64); make_data_page_stats_iterator!( MinByteArrayDataPageStatsIterator, min_values_iter, - ColumnIndexMetaData::BYTE_ARRAY, - ByteArray, - |m| 
ByteArray::from(m.to_owned()) + ByteArray ); make_data_page_stats_iterator!( MaxByteArrayDataPageStatsIterator, max_values_iter, - ColumnIndexMetaData::BYTE_ARRAY, - ByteArray, - |m| ByteArray::from(m.to_owned()) + ByteArray ); make_data_page_stats_iterator!( MaxFixedLenByteArrayDataPageStatsIterator, max_values_iter, - ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY, - FixedLenByteArray, - |m| FixedLenByteArray::from(m.to_owned()) + FixedLenByteArray ); make_data_page_stats_iterator!( MinFixedLenByteArrayDataPageStatsIterator, min_values_iter, - ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY, - FixedLenByteArray, - |m| FixedLenByteArray::from(m.to_owned()) + FixedLenByteArray ); macro_rules! get_decimal_page_stats_iterator { diff --git a/parquet/src/file/page_index/column_index.rs b/parquet/src/file/page_index/column_index.rs index 3fb6003e7c66..2d43c93b2e4b 100644 --- a/parquet/src/file/page_index/column_index.rs +++ b/parquet/src/file/page_index/column_index.rs @@ -20,7 +20,10 @@ //! [`ColumnIndex`]: crate::format::ColumnIndex //! -use crate::errors::Result; +use crate::{ + data_type::{ByteArray, FixedLenByteArray}, + errors::Result, +}; use std::ops::Deref; use crate::{ @@ -512,3 +515,55 @@ impl ColumnIndexMetaData { colidx_enum_func!(self, is_null_page, idx) } } + +/// Provides iterators over min and max values of a [`ColumnIndexMetaData`] +pub trait ColumnIndexIterators { + /// Can be one of `bool`, `i32`, `i64`, `Int96`, `f32`, `f64`, [`ByteArray`], + /// or [`FixedLenByteArray`] + type Item; + + /// Return iterator over the min values for the index + fn min_values_iter(colidx: &ColumnIndexMetaData) -> impl Iterator>; + + /// Return iterator over the max values for the index + fn max_values_iter(colidx: &ColumnIndexMetaData) -> impl Iterator>; +} + +macro_rules! column_index_iters { + ($item: ident, $variant: ident, $conv:expr) => { + impl ColumnIndexIterators for $item { + type Item = $item; + + fn min_values_iter( + colidx: &ColumnIndexMetaData, + ) -> impl Iterator> { + if let ColumnIndexMetaData::$variant(index) = colidx { + index.min_values_iter().map($conv) + } else { + panic!(concat!("Wrong type for ", stringify!($item), " iterator")) + } + } + + fn max_values_iter( + colidx: &ColumnIndexMetaData, + ) -> impl Iterator> { + if let ColumnIndexMetaData::$variant(index) = colidx { + index.max_values_iter().map($conv) + } else { + panic!(concat!("Wrong type for ", stringify!($item), " iterator")) + } + } + } + }; +} + +column_index_iters!(bool, BOOLEAN, |v| v.copied()); +column_index_iters!(i32, INT32, |v| v.copied()); +column_index_iters!(i64, INT64, |v| v.copied()); +column_index_iters!(Int96, INT96, |v| v.copied()); +column_index_iters!(f32, FLOAT, |v| v.copied()); +column_index_iters!(f64, DOUBLE, |v| v.copied()); +column_index_iters!(ByteArray, BYTE_ARRAY, |v| v + .map(|v| ByteArray::from(v.to_owned()))); +column_index_iters!(FixedLenByteArray, FIXED_LEN_BYTE_ARRAY, |v| v + .map(|v| FixedLenByteArray::from(v.to_owned()))); From 7755b7b0af1cfb9d49354efc060a9486f908ae7a Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 22 Aug 2025 11:24:33 -0700 Subject: [PATCH 15/46] restore comment --- parquet/src/arrow/arrow_reader/statistics.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index 21a06050d849..1613656ab9ae 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -624,6 +624,12 @@ macro_rules! 
make_data_page_stats_iterator { let next = self.iter.next(); match next { Some((len, index)) => match index { + // No matching `Index` found; + // thus no statistics that can be extracted. + // We return vec![None; len] to effectively + // create an arrow null-array with the length + // corresponding to the number of entries in + // `ParquetOffsetIndex` per row group per column. ColumnIndexMetaData::NONE => Some(vec![None; len]), _ => Some(<$stat_value_type>::$func(&index).collect::>()), }, From f6c5738846df0e62b6f855549e8b824ae83aa9f9 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Sat, 23 Aug 2025 17:09:40 -0700 Subject: [PATCH 16/46] further rework...allow for fallback to slow decoder --- parquet/src/file/page_index/index_reader.rs | 13 +- parquet/src/file/page_index/offset_index.rs | 156 ++++++++++++-------- 2 files changed, 105 insertions(+), 64 deletions(-) diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index fbe6d3984596..27ad753a0c24 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -22,7 +22,7 @@ use crate::data_type::Int96; use crate::errors::{ParquetError, Result}; use crate::file::metadata::ColumnChunkMetaData; use crate::file::page_index::index::{Index, NativeIndex}; -use crate::file::page_index::offset_index::OffsetIndexMetaData; +use crate::file::page_index::offset_index::{read_offset_index, OffsetIndexMetaData}; use crate::file::reader::ChunkReader; use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol}; use crate::thrift_struct; @@ -131,7 +131,16 @@ pub fn read_offset_indexes( pub(crate) fn decode_offset_index(data: &[u8]) -> Result { let mut prot = ThriftCompactInputProtocol::new(data); - OffsetIndexMetaData::try_from(&mut prot) + + // Try to read fast-path index first. If that fails, fall back to slower but more robust + // reader + match read_offset_index(&mut prot) { + Ok(offset_index) => Ok(offset_index), + Err(_) => { + prot = ThriftCompactInputProtocol::new(data); + OffsetIndexMetaData::try_from(&mut prot) + } + } } thrift_struct!( diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs index 791f61d37eae..2ec51c5d6ee4 100644 --- a/parquet/src/file/page_index/offset_index.rs +++ b/parquet/src/file/page_index/offset_index.rs @@ -25,7 +25,7 @@ use crate::{ thrift_struct, }; -/*thrift_struct!( +thrift_struct!( /// Page location information for [`OffsetIndexMetaData`] pub struct PageLocation { /// Offset of the page in the file @@ -37,67 +37,7 @@ pub struct PageLocation { /// (repetition_level = 0). 3: required i64 first_row_index } -);*/ - -// hand coding this one because it is very time critical - -/// Page location information for [`OffsetIndexMetaData`] -#[derive(Clone, Debug, Eq, PartialEq)] -pub struct PageLocation { - /// Offset of the page in the file - pub offset: i64, - /// Size of the page, including header. Sum of compressed_page_size and header - pub compressed_page_size: i32, - /// Index within the RowGroup of the first row of the page. When an - /// OffsetIndex is present, pages must begin on row boundaries - /// (repetition_level = 0). - pub first_row_index: i64, -} - -// Note: this will fail if the fields are either out of order, or if a suboptimal -// encoder doesn't use field deltas. If that ever occurs, remove this code and -// revert to the commented out thrift_struct!() implementation above. 
-impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for PageLocation { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { - // there are 3 fields, all mandatory, so all field deltas should be 1 - let (field_type, delta) = prot.read_field_header()?; - if delta != 1 || field_type != FieldType::I64 as u8 { - return Err(general_err!("error reading PageLocation::offset")); - } - let offset = prot.read_i64()?; - - let (field_type, delta) = prot.read_field_header()?; - if delta != 1 || field_type != FieldType::I32 as u8 { - return Err(general_err!( - "error reading PageLocation::compressed_page_size" - )); - } - let compressed_page_size = prot.read_i32()?; - - let (field_type, delta) = prot.read_field_header()?; - if delta != 1 || field_type != FieldType::I64 as u8 { - return Err(general_err!("error reading PageLocation::first_row_index")); - } - let first_row_index = prot.read_i64()?; - - // This loop slows things down a bit, but it's an acceptible price to allow - // forwards compatibility. We could instead assert the next field is Stop. - loop { - let (field_type, _) = prot.read_field_header()?; - if field_type == FieldType::Stop as u8 { - break; - } - prot.skip(FieldType::try_from(field_type)?)?; - } - - Ok(Self { - offset, - compressed_page_size, - first_row_index, - }) - } -} +); impl From<&crate::format::PageLocation> for PageLocation { fn from(value: &crate::format::PageLocation) -> Self { @@ -165,3 +105,95 @@ impl OffsetIndexMetaData { ) } } + +// hand coding this one because it is very time critical + +// Note: this will fail if the fields are either out of order, or if a suboptimal +// encoder doesn't use field deltas. +fn read_page_location<'a>(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { + // there are 3 fields, all mandatory, so all field deltas should be 1 + let (field_type, delta) = prot.read_field_header()?; + if delta != 1 || field_type != FieldType::I64 as u8 { + return Err(general_err!("error reading PageLocation::offset")); + } + let offset = prot.read_i64()?; + + let (field_type, delta) = prot.read_field_header()?; + if delta != 1 || field_type != FieldType::I32 as u8 { + return Err(general_err!( + "error reading PageLocation::compressed_page_size" + )); + } + let compressed_page_size = prot.read_i32()?; + + let (field_type, delta) = prot.read_field_header()?; + if delta != 1 || field_type != FieldType::I64 as u8 { + return Err(general_err!("error reading PageLocation::first_row_index")); + } + let first_row_index = prot.read_i64()?; + + // read end of struct...return error if there are unknown fields present + let (field_type, _) = prot.read_field_header()?; + if field_type != FieldType::Stop as u8 { + return Err(general_err!("unexpected field in PageLocation")); + } + + Ok(PageLocation { + offset, + compressed_page_size, + first_row_index, + }) +} + +// Fast-path read of offset index. this all works because we expect all field deltas to be 1, +// and there's no nesting beyond PageLocation, so no need to save the last field id. Like +// read_page_locations(), this will fail if absolute field id's are used. +pub(super) fn read_offset_index<'a>( + prot: &mut ThriftCompactInputProtocol<'a>, +) -> Result { + // Offset index is a struct with 2 fields. First field is an array of PageLocations, + // the second an optional array of i64. 
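// For reference, the parquet-format IDL being decoded here (abridged):
//
//   struct OffsetIndex {
//     1: required list<PageLocation> page_locations
//     2: optional list<i64> unencoded_byte_array_data_bytes
//   }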
+ + // read field 1 header, then list header, then vec of PageLocations + let (field_type, delta) = prot.read_field_header()?; + if delta != 1 || field_type != FieldType::List as u8 { + return Err(general_err!("error reading OffsetIndex::page_locations")); + } + let list_ident = prot.read_list_begin()?; + let mut page_locations = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + page_locations.push(read_page_location(prot)?); + } + + let mut unencoded_byte_array_data_bytes: Option> = None; + + // read second field...if it's Stop we're done + let (mut field_type, delta) = prot.read_field_header()?; + if field_type == FieldType::List as u8 { + if delta != 1 { + return Err(general_err!( + "encountered unknown field while reading OffsetIndex" + )); + } + let list_ident = prot.read_list_begin()?; + let mut vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + vec.push(prot.read_i64()?); + } + unencoded_byte_array_data_bytes = Some(vec); + + // this one should be Stop + (field_type, _) = prot.read_field_header()?; + } + + if field_type != FieldType::Stop as u8 { + return Err(general_err!( + "encountered unknown field while reading OffsetIndex" + )); + } + + Ok(OffsetIndexMetaData { + page_locations, + unencoded_byte_array_data_bytes, + }) +} From 09d71e179ffe4273f95ae6493d260d22a5346c22 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Sat, 23 Aug 2025 18:50:36 -0700 Subject: [PATCH 17/46] refactor a bit --- parquet/src/file/page_index/index_reader.rs | 8 +- parquet/src/file/page_index/offset_index.rs | 104 ++++++++++---------- 2 files changed, 55 insertions(+), 57 deletions(-) diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index 27ad753a0c24..fb3519b5cbb2 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -22,7 +22,7 @@ use crate::data_type::Int96; use crate::errors::{ParquetError, Result}; use crate::file::metadata::ColumnChunkMetaData; use crate::file::page_index::index::{Index, NativeIndex}; -use crate::file::page_index::offset_index::{read_offset_index, OffsetIndexMetaData}; +use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::reader::ChunkReader; use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol}; use crate::thrift_struct; @@ -132,9 +132,9 @@ pub fn read_offset_indexes( pub(crate) fn decode_offset_index(data: &[u8]) -> Result { let mut prot = ThriftCompactInputProtocol::new(data); - // Try to read fast-path index first. If that fails, fall back to slower but more robust - // reader - match read_offset_index(&mut prot) { + // Try to read fast-path first. If that fails, fall back to slower but more robust + // decoder. + match OffsetIndexMetaData::try_from_fast(&mut prot) { Ok(offset_index) => Ok(offset_index), Err(_) => { prot = ThriftCompactInputProtocol::new(data); diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs index 2ec51c5d6ee4..d6baa1e44892 100644 --- a/parquet/src/file/page_index/offset_index.rs +++ b/parquet/src/file/page_index/offset_index.rs @@ -104,6 +104,57 @@ impl OffsetIndexMetaData { self.unencoded_byte_array_data_bytes.clone(), ) } + + // Fast-path read of offset index. This works because we expect all field deltas to be 1, + // and there's no nesting beyond PageLocation, so no need to save the last field id. Like + // read_page_locations(), this will fail if absolute field id's are used. 
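// For illustration, assuming the standard compact-protocol type codes (I32 = 5, I64 = 6,
// STOP = 0): a typical writer emits each PageLocation with short-form headers that pack
// the field-id delta into the high nibble and the type into the low nibble. For
// PageLocation { offset: 4, compressed_page_size: 100, first_row_index: 0 } that is:
//
//   0x16        field 1 (delta 1, I64)
//   0x08        zigzag(4) as a varint
//   0x15        field 2 (delta 1, I32)
//   0xC8 0x01   zigzag(100) = 200 as a varint
//   0x16        field 3 (delta 1, I64)
//   0x00        zigzag(0)
//   0x00        STOP
//
// An encoder that writes long-form headers instead (delta nibble of 0 followed by a
// zigzag field id) trips these delta checks, and decode_offset_index() then falls back
// to the thrift_struct!-generated decoder.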
+ pub(super) fn try_from_fast<'a>(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { + // Offset index is a struct with 2 fields. First field is an array of PageLocations, + // the second an optional array of i64. + + // read field 1 header, then list header, then vec of PageLocations + let (field_type, delta) = prot.read_field_header()?; + if delta != 1 || field_type != FieldType::List as u8 { + return Err(general_err!("error reading OffsetIndex::page_locations")); + } + let list_ident = prot.read_list_begin()?; + let mut page_locations = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + page_locations.push(read_page_location(prot)?); + } + + let mut unencoded_byte_array_data_bytes: Option> = None; + + // read second field...if it's Stop we're done + let (mut field_type, delta) = prot.read_field_header()?; + if field_type == FieldType::List as u8 { + if delta != 1 { + return Err(general_err!( + "encountered unknown field while reading OffsetIndex" + )); + } + let list_ident = prot.read_list_begin()?; + let mut vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + vec.push(prot.read_i64()?); + } + unencoded_byte_array_data_bytes = Some(vec); + + // this one should be Stop + (field_type, _) = prot.read_field_header()?; + } + + if field_type != FieldType::Stop as u8 { + return Err(general_err!( + "encountered unknown field while reading OffsetIndex" + )); + } + + Ok(Self { + page_locations, + unencoded_byte_array_data_bytes, + }) + } } // hand coding this one because it is very time critical @@ -144,56 +195,3 @@ fn read_page_location<'a>(prot: &mut ThriftCompactInputProtocol<'a>) -> Result

( - prot: &mut ThriftCompactInputProtocol<'a>, -) -> Result { - // Offset index is a struct with 2 fields. First field is an array of PageLocations, - // the second an optional array of i64. - - // read field 1 header, then list header, then vec of PageLocations - let (field_type, delta) = prot.read_field_header()?; - if delta != 1 || field_type != FieldType::List as u8 { - return Err(general_err!("error reading OffsetIndex::page_locations")); - } - let list_ident = prot.read_list_begin()?; - let mut page_locations = Vec::with_capacity(list_ident.size as usize); - for _ in 0..list_ident.size { - page_locations.push(read_page_location(prot)?); - } - - let mut unencoded_byte_array_data_bytes: Option> = None; - - // read second field...if it's Stop we're done - let (mut field_type, delta) = prot.read_field_header()?; - if field_type == FieldType::List as u8 { - if delta != 1 { - return Err(general_err!( - "encountered unknown field while reading OffsetIndex" - )); - } - let list_ident = prot.read_list_begin()?; - let mut vec = Vec::with_capacity(list_ident.size as usize); - for _ in 0..list_ident.size { - vec.push(prot.read_i64()?); - } - unencoded_byte_array_data_bytes = Some(vec); - - // this one should be Stop - (field_type, _) = prot.read_field_header()?; - } - - if field_type != FieldType::Stop as u8 { - return Err(general_err!( - "encountered unknown field while reading OffsetIndex" - )); - } - - Ok(OffsetIndexMetaData { - page_locations, - unencoded_byte_array_data_bytes, - }) -} From 1ddaa35b7b7deb8779c84af1da4a8a77fb790aaa Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Sat, 23 Aug 2025 19:17:25 -0700 Subject: [PATCH 18/46] simplify reading of int array --- parquet/src/file/page_index/offset_index.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs index d6baa1e44892..6cb7539cb573 100644 --- a/parquet/src/file/page_index/offset_index.rs +++ b/parquet/src/file/page_index/offset_index.rs @@ -117,6 +117,8 @@ impl OffsetIndexMetaData { if delta != 1 || field_type != FieldType::List as u8 { return Err(general_err!("error reading OffsetIndex::page_locations")); } + + // we have to do this manually because we want to use the fast PageLocation decoder let list_ident = prot.read_list_begin()?; let mut page_locations = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { @@ -133,11 +135,7 @@ impl OffsetIndexMetaData { "encountered unknown field while reading OffsetIndex" )); } - let list_ident = prot.read_list_begin()?; - let mut vec = Vec::with_capacity(list_ident.size as usize); - for _ in 0..list_ident.size { - vec.push(prot.read_i64()?); - } + let vec = Vec::::try_from(&mut *prot)?; unencoded_byte_array_data_bytes = Some(vec); // this one should be Stop From c271085fb1153ce0617df2148fe95fda6065853e Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 08:48:04 -0700 Subject: [PATCH 19/46] get write working for enum and some unions --- parquet/src/basic.rs | 5 +- parquet/src/parquet_macros.rs | 18 ++++ parquet/src/parquet_thrift.rs | 149 +++++++++++++++++++++++++++++++++- 3 files changed, 170 insertions(+), 2 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index c325cf5dbf2b..79891822a242 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -20,11 +20,14 @@ //! Refer to [`parquet.thrift`](https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift) //! file to see raw definitions. 
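The writer added in this patch mirrors the reader's integer encoding: values are zigzag-mapped and then written as unsigned LEB128 varints, so enum discriminants and small field values stay one or two bytes on the wire. A minimal standalone sketch of that mapping follows; the helper names are illustrative only, but the output is intended to match what write_zig_zag and write_vlq produce.

fn zigzag(v: i64) -> u64 {
    // fold negatives onto the odd numbers so small magnitudes stay small
    ((v << 1) ^ (v >> 63)) as u64
}

fn uleb128(mut v: u64) -> Vec<u8> {
    // 7 data bits per byte, continuation bit set on every byte but the last
    let mut out = Vec::new();
    while v > 0x7f {
        out.push((v as u8 & 0x7f) | 0x80);
        v >>= 7;
    }
    out.push(v as u8);
    out
}

// zigzag(0)   == 0   -> [0x00]
// zigzag(-1)  == 1   -> [0x01]
// zigzag(1)   == 2   -> [0x02]
// zigzag(100) == 200 -> [0xC8, 0x01]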
+use std::io::Write; use std::str::FromStr; use std::{fmt, str}; pub use crate::compression::{BrotliLevel, GzipLevel, ZstdLevel}; -use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol}; +use crate::parquet_thrift::{ + FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, +}; use crate::{thrift_enum, thrift_struct, thrift_union_all_empty}; use crate::errors::{ParquetError, Result}; diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index 2d1ccd819b37..9a8a9ae4e4f9 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -51,6 +51,12 @@ macro_rules! thrift_enum { } } + impl WriteThrift for $identifier { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + (*self as i32).write_thrift(writer) + } + } + // TODO: remove when we finally get rid of the format module impl TryFrom for $identifier { type Error = ParquetError; @@ -119,6 +125,18 @@ macro_rules! thrift_union_all_empty { } } + impl WriteThrift for $identifier { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + match *self { + $(Self::$field_name => writer.write_field_begin(FieldType::Struct, $field_id, 0)?,)* + } + // write end of struct for empty struct + writer.write_struct_end()?; + // write end of struct for this union + writer.write_struct_end() + } + } + // TODO: remove when we finally get rid of the format module impl From for $identifier { fn from(value: crate::format::$identifier) -> Self { diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index 2dff498372f0..f156eed31bb4 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -20,7 +20,7 @@ // to not allocate byte arrays or strings. #![allow(dead_code)] -use std::cmp::Ordering; +use std::{cmp::Ordering, io::Write}; use crate::errors::{ParquetError, Result}; @@ -539,3 +539,150 @@ where Ok(res) } } + +///////////////////////// +// thrift compact output + +pub(crate) struct ThriftCompactOutputProtocol { + writer: W, +} + +impl ThriftCompactOutputProtocol { + pub(crate) fn new(writer: W) -> Self { + Self { writer } + } + + pub(crate) fn inner(&self) -> &W { + &self.writer + } + + fn write_byte(&mut self, b: u8) -> Result<()> { + self.writer.write_all(&[b])?; + Ok(()) + } + + fn write_vlq(&mut self, val: u64) -> Result<()> { + let mut v = val; + while v > 0x7f { + self.write_byte(v as u8 | 0x80)?; + v >>= 7; + } + self.write_byte(v as u8) + } + + fn write_zig_zag(&mut self, val: i64) -> Result<()> { + let s = (val < 0) as i64; + self.write_vlq((((val ^ -s) << 1) + s) as u64) + } + + pub(crate) fn write_field_begin( + &mut self, + field_type: FieldType, + field_id: i16, + last_field_id: i16, + ) -> Result<()> { + let mut delta = field_id - last_field_id; + if delta > 0xf || delta < 0 { + delta = 0; + } + if delta > 0 { + self.write_byte((delta as u8) << 4 | field_type as u8) + } else { + self.write_byte(field_type as u8)?; + self.write_i16(delta) + } + } + + pub(crate) fn write_struct_end(&mut self) -> Result<()> { + self.write_byte(0) + } + + pub(crate) fn write_i8(&mut self, val: i8) -> Result<()> { + self.write_byte(val as u8) + } + + pub(crate) fn write_i16(&mut self, val: i16) -> Result<()> { + self.write_zig_zag(val as _) + } + + pub(crate) fn write_i32(&mut self, val: i32) -> Result<()> { + self.write_zig_zag(val as _) + } + + pub(crate) fn write_i64(&mut self, val: i64) -> Result<()> { + self.write_zig_zag(val as _) + } +} + +pub(crate) trait WriteThrift { + fn 
write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()>; +} + +impl WriteThrift for i8 { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_i8(*self) + } +} + +impl WriteThrift for i16 { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_i16(*self) + } +} + +impl WriteThrift for i32 { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_i32(*self) + } +} + +impl WriteThrift for i64 { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_i64(*self) + } +} + +#[cfg(test)] +#[allow(deprecated)] // allow BIT_PACKED encoding for the whole test module +mod tests { + use crate::basic::{TimeUnit, Type}; + + use super::*; + use std::fmt::Debug; + + fn test_roundtrip(val: T) + where + T: for<'a> TryFrom<&'a mut ThriftCompactInputProtocol<'a>> + + WriteThrift> + + PartialEq + + Debug, + for<'a> >>::Error: Debug, + { + let buf = Vec::::new(); + let mut writer = ThriftCompactOutputProtocol::new(buf); + val.write_thrift(&mut writer).unwrap(); + + let mut prot = ThriftCompactInputProtocol::new(writer.inner()); + let read_val = T::try_from(&mut prot).unwrap(); + assert_eq!(val, read_val); + } + + #[test] + fn test_enum_roundtrip() { + test_roundtrip(Type::BOOLEAN); + test_roundtrip(Type::INT32); + test_roundtrip(Type::INT64); + test_roundtrip(Type::INT96); + test_roundtrip(Type::FLOAT); + test_roundtrip(Type::DOUBLE); + test_roundtrip(Type::BYTE_ARRAY); + test_roundtrip(Type::FIXED_LEN_BYTE_ARRAY); + } + + #[test] + fn test_union_all_empty_roundtrip() { + test_roundtrip(TimeUnit::MILLIS); + test_roundtrip(TimeUnit::MICROS); + test_roundtrip(TimeUnit::NANOS); + } +} From 34cdaf2df90e5995b029fcd8acbd6123d2f8fdce Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 10:25:33 -0700 Subject: [PATCH 20/46] make test_roundtrip visible --- parquet/src/parquet_thrift.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index f156eed31bb4..fef0b2faecda 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -644,13 +644,13 @@ impl WriteThrift for i64 { #[cfg(test)] #[allow(deprecated)] // allow BIT_PACKED encoding for the whole test module -mod tests { +pub(crate) mod tests { use crate::basic::{TimeUnit, Type}; use super::*; use std::fmt::Debug; - fn test_roundtrip(val: T) + pub(crate) fn test_roundtrip(val: T) where T: for<'a> TryFrom<&'a mut ThriftCompactInputProtocol<'a>> + WriteThrift> From c9be57047d8095442e7248be0a49ef98e9ce5f3f Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 10:28:02 -0700 Subject: [PATCH 21/46] add test for converted_type, start on logical_type --- parquet/src/basic.rs | 83 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 79891822a242..788a38743db9 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -196,6 +196,13 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ConvertedType { } } +impl WriteThrift for ConvertedType { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + // because we've added NONE, the variant values are off by 1, so correct that here + writer.write_i32(*self as i32 - 1) + } +} + // ---------------------------------------------------------------------- // Mirrors thrift union 
`crate::format::TimeUnit` @@ -453,6 +460,35 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for LogicalType { } } +impl WriteThrift for LogicalType { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + match *self { + Self::String => { + writer.write_field_begin(FieldType::Struct, 1, 0)?; + writer.write_struct_end()?; + } + Self::Map => { + writer.write_field_begin(FieldType::Struct, 2, 0)?; + writer.write_struct_end()?; + } + Self::List => { + writer.write_field_begin(FieldType::Struct, 3, 0)?; + writer.write_struct_end()?; + } + Self::Enum => { + writer.write_field_begin(FieldType::Struct, 4, 0)?; + writer.write_struct_end()?; + } + Self::Decimal { scale, precision } => { + writer.write_field_begin(FieldType::Struct, 4, 0)?; + DecimalType { scale, precision }.write_thrift(writer)?; + } + _ => return Err(nyi_err!("logical type")), + } + writer.write_struct_end() + } +} + // ---------------------------------------------------------------------- // Mirrors thrift enum `crate::format::FieldRepetitionType` // @@ -996,6 +1032,20 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ColumnOrder { } } +impl WriteThrift for ColumnOrder { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + match *self { + Self::TYPE_DEFINED_ORDER(_) => { + writer.write_field_begin(FieldType::Struct, 1, 0)?; + writer.write_struct_end()?; + } + _ => return Err(general_err!("Attempt to write undefined ColumnOrder")), + } + // write end of struct for this union + writer.write_struct_end() + } +} + // ---------------------------------------------------------------------- // Display handlers @@ -1445,6 +1495,7 @@ impl str::FromStr for LogicalType { #[allow(deprecated)] // allow BIT_PACKED encoding for the whole test module mod tests { use super::*; + use crate::parquet_thrift::tests::test_roundtrip; #[test] fn test_display_type() { @@ -1552,6 +1603,32 @@ mod tests { ); } + #[test] + fn test_converted_type_roundtrip() { + test_roundtrip(ConvertedType::UTF8); + test_roundtrip(ConvertedType::MAP); + test_roundtrip(ConvertedType::MAP_KEY_VALUE); + test_roundtrip(ConvertedType::LIST); + test_roundtrip(ConvertedType::ENUM); + test_roundtrip(ConvertedType::DECIMAL); + test_roundtrip(ConvertedType::DATE); + test_roundtrip(ConvertedType::TIME_MILLIS); + test_roundtrip(ConvertedType::TIME_MICROS); + test_roundtrip(ConvertedType::TIMESTAMP_MILLIS); + test_roundtrip(ConvertedType::TIMESTAMP_MICROS); + test_roundtrip(ConvertedType::UINT_8); + test_roundtrip(ConvertedType::UINT_16); + test_roundtrip(ConvertedType::UINT_32); + test_roundtrip(ConvertedType::UINT_64); + test_roundtrip(ConvertedType::INT_8); + test_roundtrip(ConvertedType::INT_16); + test_roundtrip(ConvertedType::INT_32); + test_roundtrip(ConvertedType::INT_64); + test_roundtrip(ConvertedType::JSON); + test_roundtrip(ConvertedType::BSON); + test_roundtrip(ConvertedType::INTERVAL); + } + #[test] fn test_display_converted_type() { assert_eq!(ConvertedType::NONE.to_string(), "NONE"); @@ -2414,6 +2491,12 @@ mod tests { assert_eq!(ColumnOrder::UNDEFINED.to_string(), "UNDEFINED"); } + #[test] + fn test_column_order_roundtrip() { + // SortOrder::SIGNED is the default on read. + test_roundtrip(ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED)) + } + #[test] fn test_column_order_get_logical_type_sort_order() { // Helper to check the order in a list of values. 
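The union writers above all follow the same convention: the selected variant is written as a struct-typed field, an empty variant's struct body is just its stop byte, and the union then writes its own stop byte. Assuming FieldType::Struct carries the standard compact-protocol code 12 (0xC), LogicalType::String should therefore serialize to exactly three bytes. A minimal sketch in the style of the round-trip tests (the test name here is hypothetical):

#[test]
fn test_logical_type_string_bytes() {
    let mut writer = ThriftCompactOutputProtocol::new(Vec::new());
    LogicalType::String.write_thrift(&mut writer).unwrap();
    // 0x1C: field id 1 (delta 1) in the high nibble, struct type code 0xC in the low nibble
    // 0x00: end of the empty StringType struct
    // 0x00: end of the LogicalType union itself
    assert_eq!(writer.inner().as_slice(), &[0x1C, 0x00, 0x00]);
}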
From a9cd09dc49b62dd763043d110265cda82ee10e66 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 12:26:47 -0700 Subject: [PATCH 22/46] checkpoint struct field writing --- parquet/src/basic.rs | 262 +++++++++++++++++++++++++++++++++- parquet/src/parquet_macros.rs | 18 ++- parquet/src/parquet_thrift.rs | 106 ++++++++++++-- 3 files changed, 370 insertions(+), 16 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 788a38743db9..0371cc638b8f 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -27,6 +27,7 @@ use std::{fmt, str}; pub use crate::compression::{BrotliLevel, GzipLevel, ZstdLevel}; use crate::parquet_thrift::{ FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, + WriteThriftField, }; use crate::{thrift_enum, thrift_struct, thrift_union_all_empty}; @@ -227,6 +228,31 @@ struct DecimalType { } ); +impl WriteThrift for DecimalType { + #[allow(unused_assignments)] + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + let mut last_field_id = 0i16; + last_field_id = self.scale.write_thrift_field(writer, 1, last_field_id)?; + last_field_id = self + .precision + .write_thrift_field(writer, 2, last_field_id)?; + writer.write_struct_end() + } +} + +impl WriteThriftField for DecimalType { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } +} + thrift_struct!( struct TimestampType { 1: required bool is_adjusted_to_u_t_c @@ -234,6 +260,31 @@ struct TimestampType { } ); +impl WriteThrift for TimestampType { + #[allow(unused_assignments)] + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + let mut last_field_id = 0i16; + last_field_id = self + .is_adjusted_to_u_t_c + .write_thrift_field(writer, 1, last_field_id)?; + last_field_id = self.unit.write_thrift_field(writer, 2, last_field_id)?; + writer.write_struct_end() + } +} + +impl WriteThriftField for TimestampType { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } +} + // they are identical use TimestampType as TimeType; @@ -244,6 +295,33 @@ struct IntType { } ); +impl WriteThrift for IntType { + #[allow(unused_assignments)] + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + let mut last_field_id = 0i16; + last_field_id = self + .bit_width + .write_thrift_field(writer, 1, last_field_id)?; + last_field_id = self + .is_signed + .write_thrift_field(writer, 2, last_field_id)?; + writer.write_struct_end() + } +} + +impl WriteThriftField for IntType { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } +} + thrift_struct!( struct VariantType { // The version of the variant specification that the variant was @@ -252,12 +330,66 @@ struct VariantType { } ); +impl WriteThrift for VariantType { + #[allow(unused_assignments)] + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + let mut last_field_id = 0i16; + if self.specification_version.is_some() { + 
last_field_id = + self.specification_version + .unwrap() + .write_thrift_field(writer, 1, last_field_id)?; + } + writer.write_struct_end() + } +} + +impl WriteThriftField for VariantType { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } +} + thrift_struct!( struct GeometryType<'a> { 1: optional string<'a> crs; } ); +impl<'a, W: Write> WriteThrift for GeometryType<'a> { + #[allow(unused_assignments)] + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + let mut last_field_id = 0i16; + if self.crs.is_some() { + last_field_id = self + .crs + .unwrap() + .write_thrift_field(writer, 1, last_field_id)?; + } + writer.write_struct_end() + } +} + +impl<'a, W: Write> WriteThriftField for GeometryType<'a> { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } +} + thrift_struct!( struct GeographyType<'a> { 1: optional string<'a> crs; @@ -265,6 +397,40 @@ struct GeographyType<'a> { } ); +impl<'a, W: Write> WriteThrift for GeographyType<'a> { + #[allow(unused_assignments)] + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + let mut last_field_id = 0i16; + if self.crs.is_some() { + last_field_id = self + .crs + .unwrap() + .write_thrift_field(writer, 1, last_field_id)?; + } + if self.algorithm.is_some() { + last_field_id = + self.algorithm + .as_ref() + .unwrap() + .write_thrift_field(writer, 2, last_field_id)?; + } + writer.write_struct_end() + } +} + +impl<'a, W: Write> WriteThriftField for GeographyType<'a> { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } +} + /// Logical types used by version 2.4.0+ of the Parquet format. 
/// /// This is an *entirely new* struct as of version @@ -462,7 +628,7 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for LogicalType { impl WriteThrift for LogicalType { fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - match *self { + match self { Self::String => { writer.write_field_begin(FieldType::Struct, 1, 0)?; writer.write_struct_end()?; @@ -480,8 +646,86 @@ impl WriteThrift for LogicalType { writer.write_struct_end()?; } Self::Decimal { scale, precision } => { - writer.write_field_begin(FieldType::Struct, 4, 0)?; - DecimalType { scale, precision }.write_thrift(writer)?; + DecimalType { + scale: *scale, + precision: *precision, + } + .write_thrift_field(writer, 5, 0)?; + } + Self::Date => { + writer.write_field_begin(FieldType::Struct, 6, 0)?; + writer.write_struct_end()?; + } + Self::Time { + is_adjusted_to_u_t_c, + unit, + } => { + TimeType { + is_adjusted_to_u_t_c: *is_adjusted_to_u_t_c, + unit: *unit, + } + .write_thrift_field(writer, 7, 0)?; + } + Self::Timestamp { + is_adjusted_to_u_t_c, + unit, + } => { + TimestampType { + is_adjusted_to_u_t_c: *is_adjusted_to_u_t_c, + unit: *unit, + } + .write_thrift_field(writer, 8, 0)?; + } + Self::Integer { + bit_width, + is_signed, + } => { + IntType { + bit_width: *bit_width, + is_signed: *is_signed, + } + .write_thrift_field(writer, 10, 0)?; + } + Self::Unknown => { + writer.write_field_begin(FieldType::Struct, 11, 0)?; + writer.write_struct_end()?; + } + Self::Json => { + writer.write_field_begin(FieldType::Struct, 12, 0)?; + writer.write_struct_end()?; + } + Self::Bson => { + writer.write_field_begin(FieldType::Struct, 13, 0)?; + writer.write_struct_end()?; + } + Self::Uuid => { + writer.write_field_begin(FieldType::Struct, 14, 0)?; + writer.write_struct_end()?; + } + Self::Float16 => { + writer.write_field_begin(FieldType::Struct, 15, 0)?; + writer.write_struct_end()?; + } + Self::Variant { + specification_version, + } => { + VariantType { + specification_version: *specification_version, + } + .write_thrift_field(writer, 16, 0)?; + } + Self::Geometry { crs } => { + GeometryType { + crs: crs.as_ref().map(|s| s.as_str()), + } + .write_thrift_field(writer, 17, 0)?; + } + Self::Geography { crs, algorithm } => { + GeographyType { + crs: crs.as_ref().map(|s| s.as_str()), + algorithm: *algorithm, + } + .write_thrift_field(writer, 18, 0)?; } _ => return Err(nyi_err!("logical type")), } @@ -2186,6 +2430,18 @@ mod tests { ); } + #[test] + fn test_logical_type_roundtrip() { + test_roundtrip(LogicalType::String); + test_roundtrip(LogicalType::Map); + test_roundtrip(LogicalType::List); + test_roundtrip(LogicalType::Enum); + test_roundtrip(LogicalType::Decimal { + scale: 0, + precision: 20, + }); + } + #[test] fn test_display_repetition() { assert_eq!(Repetition::REQUIRED.to_string(), "REQUIRED"); diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index 9a8a9ae4e4f9..e6d35dedca01 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -53,7 +53,15 @@ macro_rules! 
thrift_enum { impl WriteThrift for $identifier { fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - (*self as i32).write_thrift(writer) + writer.write_i32(*self as i32) + } + } + + impl WriteThriftField for $identifier { + fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { + writer.write_field_begin(FieldType::I32, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) } } @@ -137,6 +145,14 @@ macro_rules! thrift_union_all_empty { } } + impl WriteThriftField for $identifier { + fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { + writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } + } + // TODO: remove when we finally get rid of the format module impl From for $identifier { fn from(value: crate::format::$identifier) -> Self { diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index fef0b2faecda..d332565a2f59 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -597,6 +597,19 @@ impl ThriftCompactOutputProtocol { self.write_byte(0) } + pub(crate) fn write_bytes(&mut self, val: &[u8]) -> Result<()> { + self.write_vlq(val.len() as u64)?; + self.writer.write_all(val)?; + Ok(()) + } + + pub(crate) fn write_bool(&mut self, val: bool) -> Result<()> { + match val { + true => self.write_byte(1), + false => self.write_byte(2), + } + } + pub(crate) fn write_i8(&mut self, val: i8) -> Result<()> { self.write_byte(val as u8) } @@ -615,30 +628,99 @@ impl ThriftCompactOutputProtocol { } pub(crate) trait WriteThrift { + // used to write generated enums and structs fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()>; } -impl WriteThrift for i8 { - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - writer.write_i8(*self) +pub(crate) trait WriteThriftField { + // used to write struct fields (which may be basic types or generated types). + // write the field header and field value. returns `field_id`. 
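// For illustration: callers thread the returned id into the next call so the compact
// protocol's field-id deltas come out right. A struct with fields at ids 1 and 3 would
// be written roughly as:
//
//   let mut last_field_id = 0i16;
//   last_field_id = self.a.write_thrift_field(writer, 1, last_field_id)?; // header delta 1
//   last_field_id = self.b.write_thrift_field(writer, 3, last_field_id)?; // header delta 2
//   writer.write_struct_end()
//
// (`a` and `b` are hypothetical fields; the DecimalType and TimestampType impls in
// basic.rs follow this same pattern with consecutive ids.)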
+ fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result; +} + +impl WriteThriftField for bool { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + // boolean only writes the field header + match *self { + true => writer.write_field_begin(FieldType::BooleanTrue, field_id, last_field_id)?, + false => writer.write_field_begin(FieldType::BooleanFalse, field_id, last_field_id)?, + } + Ok(field_id) } } -impl WriteThrift for i16 { - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - writer.write_i16(*self) +impl WriteThriftField for i8 { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Byte, field_id, last_field_id)?; + writer.write_i8(*self)?; + Ok(field_id) } } -impl WriteThrift for i32 { - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - writer.write_i32(*self) +impl WriteThriftField for i16 { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::I16, field_id, last_field_id)?; + writer.write_i16(*self)?; + Ok(field_id) + } +} + +impl WriteThriftField for i32 { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::I32, field_id, last_field_id)?; + writer.write_i32(*self)?; + Ok(field_id) } } -impl WriteThrift for i64 { - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - writer.write_i64(*self) +impl WriteThriftField for i64 { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::I64, field_id, last_field_id)?; + writer.write_i64(*self)?; + Ok(field_id) + } +} + +impl WriteThriftField for &str { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Binary, field_id, last_field_id)?; + writer.write_bytes(self.as_bytes())?; + Ok(field_id) } } From ae65167a8862fd1259cbbb6569373f967110e29b Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 15:03:37 -0700 Subject: [PATCH 23/46] get some struct examples and lists working --- parquet/src/basic.rs | 91 +++++++++++++++++++- parquet/src/file/page_index/offset_index.rs | 80 +++++++++++++++++- parquet/src/parquet_macros.rs | 4 + parquet/src/parquet_thrift.rs | 93 +++++++++++++++++++-- 4 files changed, 260 insertions(+), 8 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 0371cc638b8f..50b920401646 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -26,7 +26,7 @@ use std::{fmt, str}; pub use crate::compression::{BrotliLevel, GzipLevel, ZstdLevel}; use crate::parquet_thrift::{ - FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, + ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, WriteThriftField, }; use crate::{thrift_enum, thrift_struct, thrift_union_all_empty}; @@ -198,6 +198,8 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ConvertedType { } impl WriteThrift for ConvertedType { + const 
ELEMENT_TYPE: ElementType = ElementType::I32; + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { // because we've added NONE, the variant values are off by 1, so correct that here writer.write_i32(*self as i32 - 1) @@ -229,6 +231,8 @@ struct DecimalType { ); impl WriteThrift for DecimalType { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + #[allow(unused_assignments)] fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { let mut last_field_id = 0i16; @@ -261,6 +265,8 @@ struct TimestampType { ); impl WriteThrift for TimestampType { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + #[allow(unused_assignments)] fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { let mut last_field_id = 0i16; @@ -296,6 +302,8 @@ struct IntType { ); impl WriteThrift for IntType { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + #[allow(unused_assignments)] fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { let mut last_field_id = 0i16; @@ -331,6 +339,8 @@ struct VariantType { ); impl WriteThrift for VariantType { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + #[allow(unused_assignments)] fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { let mut last_field_id = 0i16; @@ -364,6 +374,8 @@ struct GeometryType<'a> { ); impl<'a, W: Write> WriteThrift for GeometryType<'a> { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + #[allow(unused_assignments)] fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { let mut last_field_id = 0i16; @@ -398,6 +410,8 @@ struct GeographyType<'a> { ); impl<'a, W: Write> WriteThrift for GeographyType<'a> { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + #[allow(unused_assignments)] fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { let mut last_field_id = 0i16; @@ -627,6 +641,8 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for LogicalType { } impl WriteThrift for LogicalType { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { match self { Self::String => { @@ -1277,6 +1293,8 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ColumnOrder { } impl WriteThrift for ColumnOrder { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { match *self { Self::TYPE_DEFINED_ORDER(_) => { @@ -2440,6 +2458,77 @@ mod tests { scale: 0, precision: 20, }); + test_roundtrip(LogicalType::Date); + test_roundtrip(LogicalType::Time { + is_adjusted_to_u_t_c: true, + unit: TimeUnit::MICROS, + }); + test_roundtrip(LogicalType::Time { + is_adjusted_to_u_t_c: false, + unit: TimeUnit::MILLIS, + }); + test_roundtrip(LogicalType::Time { + is_adjusted_to_u_t_c: false, + unit: TimeUnit::NANOS, + }); + test_roundtrip(LogicalType::Timestamp { + is_adjusted_to_u_t_c: false, + unit: TimeUnit::MICROS, + }); + test_roundtrip(LogicalType::Timestamp { + is_adjusted_to_u_t_c: true, + unit: TimeUnit::MILLIS, + }); + test_roundtrip(LogicalType::Timestamp { + is_adjusted_to_u_t_c: true, + unit: TimeUnit::NANOS, + }); + test_roundtrip(LogicalType::Integer { + bit_width: 8, + is_signed: true, + }); + test_roundtrip(LogicalType::Integer { + bit_width: 16, + is_signed: false, + }); + test_roundtrip(LogicalType::Integer { + bit_width: 32, + is_signed: true, + }); + 
test_roundtrip(LogicalType::Integer { + bit_width: 64, + is_signed: false, + }); + test_roundtrip(LogicalType::Json); + test_roundtrip(LogicalType::Bson); + test_roundtrip(LogicalType::Uuid); + test_roundtrip(LogicalType::Float16); + test_roundtrip(LogicalType::Variant { + specification_version: Some(1), + }); + test_roundtrip(LogicalType::Variant { + specification_version: None, + }); + test_roundtrip(LogicalType::Geometry { + crs: Some("foo".to_owned()), + }); + test_roundtrip(LogicalType::Geometry { crs: None }); + test_roundtrip(LogicalType::Geography { + crs: Some("foo".to_owned()), + algorithm: Some(EdgeInterpolationAlgorithm::ANDOYER), + }); + test_roundtrip(LogicalType::Geography { + crs: None, + algorithm: Some(EdgeInterpolationAlgorithm::KARNEY), + }); + test_roundtrip(LogicalType::Geography { + crs: Some("foo".to_owned()), + algorithm: None, + }); + test_roundtrip(LogicalType::Geography { + crs: None, + algorithm: None, + }); } #[test] diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs index 6cb7539cb573..8217fa7878c8 100644 --- a/parquet/src/file/page_index/offset_index.rs +++ b/parquet/src/file/page_index/offset_index.rs @@ -19,7 +19,12 @@ //! //! [`OffsetIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md -use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol}; +use std::io::Write; + +use crate::parquet_thrift::{ + ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, + WriteThriftField, +}; use crate::{ errors::{ParquetError, Result}, thrift_struct, @@ -39,6 +44,23 @@ pub struct PageLocation { } ); +impl WriteThrift for PageLocation { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + + #[allow(unused_assignments)] + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + let mut last_field_id = 0i16; + last_field_id = self.offset.write_thrift_field(writer, 1, last_field_id)?; + last_field_id = self + .compressed_page_size + .write_thrift_field(writer, 2, last_field_id)?; + last_field_id = self + .first_row_index + .write_thrift_field(writer, 3, last_field_id)?; + writer.write_struct_end() + } +} + impl From<&crate::format::PageLocation> for PageLocation { fn from(value: &crate::format::PageLocation) -> Self { Self { @@ -73,6 +95,29 @@ pub struct OffsetIndexMetaData { } ); +impl WriteThrift for OffsetIndexMetaData { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + + #[allow(unused_assignments)] + fn write_thrift( + &self, + writer: &mut crate::parquet_thrift::ThriftCompactOutputProtocol, + ) -> Result<()> { + let mut last_field_id = 0i16; + last_field_id = self + .page_locations + .write_thrift_field(writer, 1, last_field_id)?; + if self.unencoded_byte_array_data_bytes.is_some() { + last_field_id = self + .unencoded_byte_array_data_bytes + .as_ref() + .unwrap() + .write_thrift_field(writer, 2, last_field_id)?; + } + writer.write_struct_end() + } +} + impl OffsetIndexMetaData { /// Creates a new [`OffsetIndexMetaData`] from an [`OffsetIndex`]. /// @@ -193,3 +238,36 @@ fn read_page_location<'a>(prot: &mut ThriftCompactInputProtocol<'a>) -> Result

WriteThrift for $identifier { + const ELEMENT_TYPE: ElementType = ElementType::I32; + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_i32(*self as i32) } @@ -134,6 +136,8 @@ macro_rules! thrift_union_all_empty { } impl WriteThrift for $identifier { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { match *self { $(Self::$field_name => writer.write_field_begin(FieldType::Struct, $field_id, 0)?,)* diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index d332565a2f59..80427ddf1359 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -581,15 +581,21 @@ impl ThriftCompactOutputProtocol { field_id: i16, last_field_id: i16, ) -> Result<()> { - let mut delta = field_id - last_field_id; - if delta > 0xf || delta < 0 { - delta = 0; - } - if delta > 0 { + let delta = field_id.wrapping_sub(last_field_id); + if delta > 0 && delta <= 0xf { self.write_byte((delta as u8) << 4 | field_type as u8) } else { self.write_byte(field_type as u8)?; - self.write_i16(delta) + self.write_i16(field_id) + } + } + + pub(crate) fn write_list_begin(&mut self, element_type: ElementType, len: usize) -> Result<()> { + if len < 15 { + self.write_byte((len as u8) << 4 | element_type as u8) + } else { + self.write_byte(0xf0u8 | element_type as u8)?; + self.write_vlq(len as _) } } @@ -628,10 +634,67 @@ impl ThriftCompactOutputProtocol { } pub(crate) trait WriteThrift { + const ELEMENT_TYPE: ElementType; + // used to write generated enums and structs fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()>; } +impl WriteThrift for Vec +where + T: WriteThrift, +{ + const ELEMENT_TYPE: ElementType = ElementType::List; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_list_begin(T::ELEMENT_TYPE, self.len())?; + for i in 0..self.len() { + self[i].write_thrift(writer)?; + } + Ok(()) + } +} + +impl WriteThrift for bool { + const ELEMENT_TYPE: ElementType = ElementType::Bool; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_bool(*self) + } +} + +impl WriteThrift for i8 { + const ELEMENT_TYPE: ElementType = ElementType::Byte; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_i8(*self) + } +} + +impl WriteThrift for i16 { + const ELEMENT_TYPE: ElementType = ElementType::I16; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_i16(*self) + } +} + +impl WriteThrift for i32 { + const ELEMENT_TYPE: ElementType = ElementType::I32; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_i32(*self) + } +} + +impl WriteThrift for i64 { + const ELEMENT_TYPE: ElementType = ElementType::I64; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_i64(*self) + } +} + pub(crate) trait WriteThriftField { // used to write struct fields (which may be basic types or generated types). // write the field header and field value. returns `field_id`. 
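// Illustrative, self-contained sketch of the short-form header bytes the two helpers
// above emit (assuming the standard Thrift compact-protocol type codes, e.g. I32 = 5,
// I64 = 6); the long forms fall back to a type byte (with 0xF in the size nibble for
// lists) followed by a varint field id / length.

fn short_field_header(field_delta: u8, field_type: u8) -> u8 {
    // high nibble: delta from the previous field id (1..=15); low nibble: type code
    (field_delta << 4) | field_type
}

fn short_list_header(len: u8, element_type: u8) -> u8 {
    // high nibble: element count (0..=14); low nibble: element type code
    (len << 4) | element_type
}

fn main() {
    // field id 2 written immediately after field id 1, type I32 (5) -> 0x15
    assert_eq!(short_field_header(1, 5), 0x15);
    // a 3-element list of I64 (6) elements -> 0x36
    assert_eq!(short_list_header(3, 6), 0x36);
}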
@@ -724,6 +787,22 @@ impl WriteThriftField for &str { } } +impl WriteThriftField for Vec +where + T: WriteThrift, +{ + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::List, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } +} + #[cfg(test)] #[allow(deprecated)] // allow BIT_PACKED encoding for the whole test module pub(crate) mod tests { @@ -744,6 +823,8 @@ pub(crate) mod tests { let mut writer = ThriftCompactOutputProtocol::new(buf); val.write_thrift(&mut writer).unwrap(); + //println!("serialized: {:x?}", writer.inner()); + let mut prot = ThriftCompactInputProtocol::new(writer.inner()); let read_val = T::try_from(&mut prot).unwrap(); assert_eq!(val, read_val); From 272a013dd263c2b31555743f1fadad9b4221ccae Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 15:05:12 -0700 Subject: [PATCH 24/46] get rid of copied allow --- parquet/src/parquet_thrift.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index 80427ddf1359..c8ff863c15a4 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -804,7 +804,6 @@ where } #[cfg(test)] -#[allow(deprecated)] // allow BIT_PACKED encoding for the whole test module pub(crate) mod tests { use crate::basic::{TimeUnit, Type}; From 632e17127bfa747a064ad9c7900287f4a8f56874 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 17:12:19 -0700 Subject: [PATCH 25/46] get writer macros for structs working --- parquet/src/basic.rs | 334 +++++--------------- parquet/src/file/column_crypto_metadata.rs | 37 ++- parquet/src/file/metadata/mod.rs | 6 +- parquet/src/file/metadata/thrift_gen.rs | 6 +- parquet/src/file/page_encoding_stats.rs | 7 +- parquet/src/file/page_index/index_reader.rs | 6 +- parquet/src/file/page_index/offset_index.rs | 8 +- parquet/src/parquet_macros.rs | 86 ++++- parquet/src/parquet_thrift.rs | 82 +++++ parquet/tests/arrow_reader/io/mod.rs | 6 +- 10 files changed, 305 insertions(+), 273 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 50b920401646..3d774861c2a8 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -206,6 +206,19 @@ impl WriteThrift for ConvertedType { } } +impl WriteThriftField for ConvertedType { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::I32, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } +} + // ---------------------------------------------------------------------- // Mirrors thrift union `crate::format::TimeUnit` @@ -230,33 +243,6 @@ struct DecimalType { } ); -impl WriteThrift for DecimalType { - const ELEMENT_TYPE: ElementType = ElementType::Struct; - - #[allow(unused_assignments)] - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - let mut last_field_id = 0i16; - last_field_id = self.scale.write_thrift_field(writer, 1, last_field_id)?; - last_field_id = self - .precision - .write_thrift_field(writer, 2, last_field_id)?; - writer.write_struct_end() - } -} - -impl WriteThriftField for DecimalType { - fn write_thrift_field( - &self, - writer: &mut ThriftCompactOutputProtocol, - field_id: i16, - last_field_id: i16, - ) -> Result { - writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; - self.write_thrift(writer)?; - Ok(field_id) - } -} - 
thrift_struct!( struct TimestampType { 1: required bool is_adjusted_to_u_t_c @@ -264,33 +250,6 @@ struct TimestampType { } ); -impl WriteThrift for TimestampType { - const ELEMENT_TYPE: ElementType = ElementType::Struct; - - #[allow(unused_assignments)] - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - let mut last_field_id = 0i16; - last_field_id = self - .is_adjusted_to_u_t_c - .write_thrift_field(writer, 1, last_field_id)?; - last_field_id = self.unit.write_thrift_field(writer, 2, last_field_id)?; - writer.write_struct_end() - } -} - -impl WriteThriftField for TimestampType { - fn write_thrift_field( - &self, - writer: &mut ThriftCompactOutputProtocol, - field_id: i16, - last_field_id: i16, - ) -> Result { - writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; - self.write_thrift(writer)?; - Ok(field_id) - } -} - // they are identical use TimestampType as TimeType; @@ -301,35 +260,6 @@ struct IntType { } ); -impl WriteThrift for IntType { - const ELEMENT_TYPE: ElementType = ElementType::Struct; - - #[allow(unused_assignments)] - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - let mut last_field_id = 0i16; - last_field_id = self - .bit_width - .write_thrift_field(writer, 1, last_field_id)?; - last_field_id = self - .is_signed - .write_thrift_field(writer, 2, last_field_id)?; - writer.write_struct_end() - } -} - -impl WriteThriftField for IntType { - fn write_thrift_field( - &self, - writer: &mut ThriftCompactOutputProtocol, - field_id: i16, - last_field_id: i16, - ) -> Result { - writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; - self.write_thrift(writer)?; - Ok(field_id) - } -} - thrift_struct!( struct VariantType { // The version of the variant specification that the variant was @@ -338,70 +268,12 @@ struct VariantType { } ); -impl WriteThrift for VariantType { - const ELEMENT_TYPE: ElementType = ElementType::Struct; - - #[allow(unused_assignments)] - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - let mut last_field_id = 0i16; - if self.specification_version.is_some() { - last_field_id = - self.specification_version - .unwrap() - .write_thrift_field(writer, 1, last_field_id)?; - } - writer.write_struct_end() - } -} - -impl WriteThriftField for VariantType { - fn write_thrift_field( - &self, - writer: &mut ThriftCompactOutputProtocol, - field_id: i16, - last_field_id: i16, - ) -> Result { - writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; - self.write_thrift(writer)?; - Ok(field_id) - } -} - thrift_struct!( struct GeometryType<'a> { 1: optional string<'a> crs; } ); -impl<'a, W: Write> WriteThrift for GeometryType<'a> { - const ELEMENT_TYPE: ElementType = ElementType::Struct; - - #[allow(unused_assignments)] - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - let mut last_field_id = 0i16; - if self.crs.is_some() { - last_field_id = self - .crs - .unwrap() - .write_thrift_field(writer, 1, last_field_id)?; - } - writer.write_struct_end() - } -} - -impl<'a, W: Write> WriteThriftField for GeometryType<'a> { - fn write_thrift_field( - &self, - writer: &mut ThriftCompactOutputProtocol, - field_id: i16, - last_field_id: i16, - ) -> Result { - writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; - self.write_thrift(writer)?; - Ok(field_id) - } -} - thrift_struct!( struct GeographyType<'a> { 1: optional string<'a> crs; @@ -409,42 +281,6 @@ struct GeographyType<'a> { } ); -impl<'a, 
W: Write> WriteThrift for GeographyType<'a> { - const ELEMENT_TYPE: ElementType = ElementType::Struct; - - #[allow(unused_assignments)] - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - let mut last_field_id = 0i16; - if self.crs.is_some() { - last_field_id = self - .crs - .unwrap() - .write_thrift_field(writer, 1, last_field_id)?; - } - if self.algorithm.is_some() { - last_field_id = - self.algorithm - .as_ref() - .unwrap() - .write_thrift_field(writer, 2, last_field_id)?; - } - writer.write_struct_end() - } -} - -impl<'a, W: Write> WriteThriftField for GeographyType<'a> { - fn write_thrift_field( - &self, - writer: &mut ThriftCompactOutputProtocol, - field_id: i16, - last_field_id: i16, - ) -> Result { - writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; - self.write_thrift(writer)?; - Ok(field_id) - } -} - /// Logical types used by version 2.4.0+ of the Parquet format. /// /// This is an *entirely new* struct as of version @@ -646,20 +482,16 @@ impl WriteThrift for LogicalType { fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { match self { Self::String => { - writer.write_field_begin(FieldType::Struct, 1, 0)?; - writer.write_struct_end()?; + writer.write_empty_struct(1, 0)?; } Self::Map => { - writer.write_field_begin(FieldType::Struct, 2, 0)?; - writer.write_struct_end()?; + writer.write_empty_struct(2, 0)?; } Self::List => { - writer.write_field_begin(FieldType::Struct, 3, 0)?; - writer.write_struct_end()?; + writer.write_empty_struct(3, 0)?; } Self::Enum => { - writer.write_field_begin(FieldType::Struct, 4, 0)?; - writer.write_struct_end()?; + writer.write_empty_struct(4, 0)?; } Self::Decimal { scale, precision } => { DecimalType { @@ -669,8 +501,7 @@ impl WriteThrift for LogicalType { .write_thrift_field(writer, 5, 0)?; } Self::Date => { - writer.write_field_begin(FieldType::Struct, 6, 0)?; - writer.write_struct_end()?; + writer.write_empty_struct(6, 0)?; } Self::Time { is_adjusted_to_u_t_c, @@ -703,24 +534,19 @@ impl WriteThrift for LogicalType { .write_thrift_field(writer, 10, 0)?; } Self::Unknown => { - writer.write_field_begin(FieldType::Struct, 11, 0)?; - writer.write_struct_end()?; + writer.write_empty_struct(11, 0)?; } Self::Json => { - writer.write_field_begin(FieldType::Struct, 12, 0)?; - writer.write_struct_end()?; + writer.write_empty_struct(12, 0)?; } Self::Bson => { - writer.write_field_begin(FieldType::Struct, 13, 0)?; - writer.write_struct_end()?; + writer.write_empty_struct(13, 0)?; } Self::Uuid => { - writer.write_field_begin(FieldType::Struct, 14, 0)?; - writer.write_struct_end()?; + writer.write_empty_struct(14, 0)?; } Self::Float16 => { - writer.write_field_begin(FieldType::Struct, 15, 0)?; - writer.write_struct_end()?; + writer.write_empty_struct(15, 0)?; } Self::Variant { specification_version, @@ -749,35 +575,37 @@ impl WriteThrift for LogicalType { } } +impl WriteThriftField for LogicalType { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } +} + // ---------------------------------------------------------------------- // Mirrors thrift enum `crate::format::FieldRepetitionType` // // Cannot use macro since the name is changed +thrift_enum!( /// Representation of field types in schema. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq)] -#[allow(non_camel_case_types)] -pub enum Repetition { - /// Field is required (can not be null) and each record has exactly 1 value. - REQUIRED, - /// Field is optional (can be null) and each record has 0 or 1 values. - OPTIONAL, - /// Field is repeated and can contain 0 or more values. - REPEATED, +enum FieldRepetitionType { + /// This field is required (can not be null) and each row has exactly 1 value. + REQUIRED = 0; + /// The field is optional (can be null) and each row has 0 or 1 values. + OPTIONAL = 1; + /// The field is repeated and can contain 0 or more values. + REPEATED = 2; } +); -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for Repetition { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { - let val = prot.read_i32()?; - Ok(match val { - 0 => Self::REQUIRED, - 1 => Self::OPTIONAL, - 2 => Self::REPEATED, - _ => return Err(general_err!("Unexpected FieldRepetitionType {}", val)), - }) - } -} +pub(crate) type Repetition = FieldRepetitionType; // ---------------------------------------------------------------------- // Mirrors thrift enum `crate::format::Encoding` @@ -945,6 +773,39 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for Compression { } } +// FIXME +// ugh...why did we add compression level to some variants if we don't use them???? +impl WriteThrift for Compression { + const ELEMENT_TYPE: ElementType = ElementType::I32; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + let id: i32 = match *self { + Self::UNCOMPRESSED => 0, + Self::SNAPPY => 1, + Self::GZIP(_) => 2, + Self::LZO => 3, + Self::BROTLI(_) => 4, + Self::LZ4 => 5, + Self::ZSTD(_) => 6, + Self::LZ4_RAW => 7, + }; + writer.write_i32(id) + } +} + +impl WriteThriftField for Compression { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::I32, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } +} + impl Compression { /// Returns the codec type of this compression setting as a string, without the compression /// level. 
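// Note on the FIXME above: only the codec id reaches the wire, so any level carried
// by GZIP/BROTLI/ZSTD is dropped on write and cannot be recovered on read. A
// self-contained sketch of that round-trip behaviour using a stand-in enum (not the
// crate's `Compression` type):

#[derive(Debug, PartialEq)]
enum Codec {
    Uncompressed,
    Zstd(i32), // compression level, never part of the thrift encoding
}

fn to_wire_id(codec: &Codec) -> i32 {
    match codec {
        Codec::Uncompressed => 0,
        Codec::Zstd(_) => 6, // same id regardless of level
    }
}

fn from_wire_id(id: i32) -> Option<Codec> {
    match id {
        0 => Some(Codec::Uncompressed),
        6 => Some(Codec::Zstd(1)), // some default level; the original one is gone
        _ => None,
    }
}

fn main() {
    let written = Codec::Zstd(9);
    let read_back = from_wire_id(to_wire_id(&written)).unwrap();
    assert_ne!(written, read_back); // the level did not survive the round trip
}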
@@ -1317,12 +1178,6 @@ impl fmt::Display for ConvertedType { } } -impl fmt::Display for Repetition { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{self:?}") - } -} - impl fmt::Display for Compression { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{self:?}") @@ -1574,37 +1429,6 @@ impl From> for ConvertedType { } } -// ---------------------------------------------------------------------- -// crate::format::FieldRepetitionType <=> Repetition conversion - -impl TryFrom for Repetition { - type Error = ParquetError; - - fn try_from(value: crate::format::FieldRepetitionType) -> Result { - Ok(match value { - crate::format::FieldRepetitionType::REQUIRED => Repetition::REQUIRED, - crate::format::FieldRepetitionType::OPTIONAL => Repetition::OPTIONAL, - crate::format::FieldRepetitionType::REPEATED => Repetition::REPEATED, - _ => { - return Err(general_err!( - "unexpected parquet repetition type: {}", - value.0 - )) - } - }) - } -} - -impl From for crate::format::FieldRepetitionType { - fn from(value: Repetition) -> Self { - match value { - Repetition::REQUIRED => crate::format::FieldRepetitionType::REQUIRED, - Repetition::OPTIONAL => crate::format::FieldRepetitionType::OPTIONAL, - Repetition::REPEATED => crate::format::FieldRepetitionType::REPEATED, - } - } -} - // ---------------------------------------------------------------------- // crate::format::CompressionCodec <=> Compression conversion diff --git a/parquet/src/file/column_crypto_metadata.rs b/parquet/src/file/column_crypto_metadata.rs index 95cbc65cf716..ec2e91beaa54 100644 --- a/parquet/src/file/column_crypto_metadata.rs +++ b/parquet/src/file/column_crypto_metadata.rs @@ -17,13 +17,18 @@ //! Column chunk encryption metadata +use std::io::Write; + use crate::errors::{ParquetError, Result}; use crate::format::{ ColumnCryptoMetaData as TColumnCryptoMetaData, EncryptionWithColumnKey as TEncryptionWithColumnKey, EncryptionWithFooterKey as TEncryptionWithFooterKey, }; -use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol}; +use crate::parquet_thrift::{ + ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, + WriteThriftField, +}; use crate::{thrift_struct, thrift_union}; // define this and ColumnCryptoMetadata here so they're only defined when @@ -48,6 +53,36 @@ union ColumnCryptoMetaData { } ); +// TODO: need to get this into the thrift_union macro +impl WriteThrift for ColumnCryptoMetaData { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + match self { + Self::ENCRYPTION_WITH_FOOTER_KEY => { + writer.write_empty_struct(1, 0)?; + } + Self::ENCRYPTION_WITH_COLUMN_KEY(key) => { + key.write_thrift_field(writer, 2, 0)?; + } + } + writer.write_struct_end() + } +} + +impl WriteThriftField for ColumnCryptoMetaData { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } +} + /// Converts Thrift definition into `ColumnCryptoMetadata`. 
pub fn try_from_thrift( thrift_column_crypto_metadata: &TColumnCryptoMetaData, diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 69cdf8f10714..8b06fe676308 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -125,7 +125,10 @@ use crate::{ }; use crate::{ basic::{ColumnOrder, Compression, Encoding, Type}, - parquet_thrift::{FieldType, ThriftCompactInputProtocol}, + parquet_thrift::{ + ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, + WriteThrift, WriteThriftField, + }, }; use crate::{ data_type::private::ParquetValueType, file::page_index::offset_index::OffsetIndexMetaData, @@ -135,6 +138,7 @@ use crate::{ thrift_struct, }; pub use reader::{FooterTail, PageIndexPolicy, ParquetMetaDataReader}; +use std::io::Write; use std::ops::Range; use std::sync::Arc; pub use writer::ParquetMetaDataWriter; diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 3888d247df1c..bc3914112d0d 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -17,6 +17,7 @@ // a collection of generated structs used to parse thrift metadata +use std::io::Write; use std::sync::Arc; #[cfg(feature = "encryption")] @@ -33,7 +34,10 @@ use crate::{ page_encoding_stats::PageEncodingStats, statistics::ValueStatistics, }, - parquet_thrift::{FieldType, ThriftCompactInputProtocol}, + parquet_thrift::{ + ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, + WriteThrift, WriteThriftField, + }, schema::types::{parquet_schema_from_array, ColumnDescriptor, SchemaDescriptor}, thrift_struct, util::bit_util::FromBytes, diff --git a/parquet/src/file/page_encoding_stats.rs b/parquet/src/file/page_encoding_stats.rs index 281954d939dd..2d433dc9b3f1 100644 --- a/parquet/src/file/page_encoding_stats.rs +++ b/parquet/src/file/page_encoding_stats.rs @@ -17,9 +17,14 @@ //! Per-page encoding information. 
+use std::io::Write; + use crate::basic::{Encoding, PageType}; use crate::errors::{ParquetError, Result}; -use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol}; +use crate::parquet_thrift::{ + ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, + WriteThriftField, +}; use crate::thrift_struct; // TODO: This should probably all be moved to thrift_gen diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index 99e5963b290e..e9cf119224c9 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -26,8 +26,12 @@ use crate::file::page_index::column_index::{ }; use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::reader::ChunkReader; -use crate::parquet_thrift::{FieldType, ThriftCompactInputProtocol}; +use crate::parquet_thrift::{ + ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, + WriteThriftField, +}; use crate::thrift_struct; +use std::io::Write; use std::ops::Range; /// Computes the covering range of two optional ranges diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs index 8217fa7878c8..b2842a897ebf 100644 --- a/parquet/src/file/page_index/offset_index.rs +++ b/parquet/src/file/page_index/offset_index.rs @@ -44,7 +44,7 @@ pub struct PageLocation { } ); -impl WriteThrift for PageLocation { +/*impl WriteThrift for PageLocation { const ELEMENT_TYPE: ElementType = ElementType::Struct; #[allow(unused_assignments)] @@ -59,7 +59,7 @@ impl WriteThrift for PageLocation { .write_thrift_field(writer, 3, last_field_id)?; writer.write_struct_end() } -} +}*/ impl From<&crate::format::PageLocation> for PageLocation { fn from(value: &crate::format::PageLocation) -> Self { @@ -95,7 +95,7 @@ pub struct OffsetIndexMetaData { } ); -impl WriteThrift for OffsetIndexMetaData { +/*impl WriteThrift for OffsetIndexMetaData { const ELEMENT_TYPE: ElementType = ElementType::Struct; #[allow(unused_assignments)] @@ -116,7 +116,7 @@ impl WriteThrift for OffsetIndexMetaData { } writer.write_struct_end() } -} +}*/ impl OffsetIndexMetaData { /// Creates a new [`OffsetIndexMetaData`] from an [`OffsetIndex`]. diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index 40aadad98fb1..c015a8685651 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -140,10 +140,8 @@ macro_rules! thrift_union_all_empty { fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { match *self { - $(Self::$field_name => writer.write_field_begin(FieldType::Struct, $field_id, 0)?,)* - } - // write end of struct for empty struct - writer.write_struct_end()?; + $(Self::$field_name => writer.write_empty_struct($field_id, 0)?,)* + }; // write end of struct for this union writer.write_struct_end() } @@ -266,9 +264,89 @@ macro_rules! thrift_struct { }) } } + + impl<$($lt,)? W: Write> WriteThrift for $identifier $(<$lt>)? { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + + #[allow(unused_assignments)] + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + let mut last_field_id = 0i16; + $($crate::__thrift_write_required_or_optional_field!($required_or_optional $field_name, $field_id, $field_type, self, writer, last_field_id);)* + Ok(()) + } + } + + impl<$($lt,)? W: Write> WriteThriftField for $identifier $(<$lt>)? 
{ + fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { + writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } + } } } +#[doc(hidden)] +#[macro_export] +macro_rules! __thrift_write_required_or_optional_field { + (required $field_name:ident, $field_id:literal, $field_type:ident, $self:tt, $writer:tt, $last_id:tt) => { + $crate::__thrift_write_required_field!( + $field_type, + $field_name, + $field_id, + $self, + $writer, + $last_id + ) + }; + (optional $field_name:ident, $field_id:literal, $field_type:ident, $self:tt, $writer:tt, $last_id:tt) => { + $crate::__thrift_write_optional_field!( + $field_type, + $field_name, + $field_id, + $self, + $writer, + $last_id + ) + }; +} + +#[doc(hidden)] +#[macro_export] +macro_rules! __thrift_write_required_field { + (binary, $field_name:ident, $field_id:literal, $self:ident, $writer:ident, $last_id:ident) => { + $writer.write_field_begin(FieldType::Binary, $field_id, $last_id)?; + $writer.write_bytes($self.$field_name)?; + $last_id = $field_id; + }; + ($field_type:ident, $field_name:ident, $field_id:literal, $self:ident, $writer:ident, $last_id:ident) => { + $last_id = $self + .$field_name + .write_thrift_field($writer, $field_id, $last_id)?; + }; +} + +#[doc(hidden)] +#[macro_export] +macro_rules! __thrift_write_optional_field { + (binary, $field_name:ident, $field_id:literal, $self:ident, $writer:tt, $last_id:tt) => { + if $self.$field_name.is_some() { + $writer.write_field_begin(FieldType::Binary, $field_id, $last_id)?; + $writer.write_bytes($self.$field_name.as_ref().unwrap())?; + $last_id = $field_id; + } + }; + ($field_type:ident, $field_name:ident, $field_id:literal, $self:ident, $writer:tt, $last_id:tt) => { + if $self.$field_name.is_some() { + $last_id = $self + .$field_name + .as_ref() + .unwrap() + .write_thrift_field($writer, $field_id, $last_id)?; + } + }; +} + /// macro to use when decoding struct fields #[macro_export] macro_rules! 
thrift_read_field { diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index c8ff863c15a4..b2cb7bf54597 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -609,6 +609,12 @@ impl ThriftCompactOutputProtocol { Ok(()) } + pub(crate) fn write_empty_struct(&mut self, field_id: i16, last_field_id: i16) -> Result { + self.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_struct_end()?; + Ok(last_field_id) + } + pub(crate) fn write_bool(&mut self, val: bool) -> Result<()> { match val { true => self.write_byte(1), @@ -631,6 +637,11 @@ impl ThriftCompactOutputProtocol { pub(crate) fn write_i64(&mut self, val: i64) -> Result<()> { self.write_zig_zag(val as _) } + + pub(crate) fn write_double(&mut self, val: f64) -> Result<()> { + self.writer.write_all(&val.to_le_bytes())?; + Ok(()) + } } pub(crate) trait WriteThrift { @@ -695,6 +706,38 @@ impl WriteThrift for i64 { } } +impl WriteThrift for OrderedF64 { + const ELEMENT_TYPE: ElementType = ElementType::Double; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_double(self.0) + } +} + +impl WriteThrift for &[u8] { + const ELEMENT_TYPE: ElementType = ElementType::Binary; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_bytes(self) + } +} + +impl WriteThrift for &str { + const ELEMENT_TYPE: ElementType = ElementType::Binary; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_bytes(self.as_bytes()) + } +} + +impl WriteThrift for String { + const ELEMENT_TYPE: ElementType = ElementType::Binary; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + writer.write_bytes(self.as_bytes()) + } +} + pub(crate) trait WriteThriftField { // used to write struct fields (which may be basic types or generated types). // write the field header and field value. returns `field_id`. 
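// Self-contained sketch of the layout `write_bytes` produces for the binary/string
// impls above, assuming `write_vlq` is the usual unsigned LEB128 varint: a varint
// byte length followed by the raw bytes.

fn uleb128(mut v: u64, out: &mut Vec<u8>) {
    loop {
        let byte = (v & 0x7f) as u8;
        v >>= 7;
        if v == 0 {
            out.push(byte);
            break;
        }
        out.push(byte | 0x80);
    }
}

fn encode_binary(data: &[u8]) -> Vec<u8> {
    let mut out = Vec::new();
    uleb128(data.len() as u64, &mut out);
    out.extend_from_slice(data);
    out
}

fn main() {
    // "foo" -> one length byte (3) followed by the three payload bytes
    assert_eq!(encode_binary(b"foo"), vec![0x03, b'f', b'o', b'o']);
    // a 300-byte payload needs a two-byte varint length (0xac, 0x02)
    let enc = encode_binary(&[0u8; 300]);
    assert_eq!(&enc[..2], &[0xac, 0x02]);
    assert_eq!(enc.len(), 302);
}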
@@ -774,6 +817,32 @@ impl WriteThriftField for i64 { } } +impl WriteThriftField for OrderedF64 { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Double, field_id, last_field_id)?; + writer.write_double(self.0)?; + Ok(field_id) + } +} + +impl WriteThriftField for &[u8] { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Binary, field_id, last_field_id)?; + writer.write_bytes(self)?; + Ok(field_id) + } +} + impl WriteThriftField for &str { fn write_thrift_field( &self, @@ -787,6 +856,19 @@ impl WriteThriftField for &str { } } +impl WriteThriftField for String { + fn write_thrift_field( + &self, + writer: &mut ThriftCompactOutputProtocol, + field_id: i16, + last_field_id: i16, + ) -> Result { + writer.write_field_begin(FieldType::Binary, field_id, last_field_id)?; + writer.write_bytes(self.as_bytes())?; + Ok(field_id) + } +} + impl WriteThriftField for Vec where T: WriteThrift, diff --git a/parquet/tests/arrow_reader/io/mod.rs b/parquet/tests/arrow_reader/io/mod.rs index bfdb9467e20c..051a61de5075 100644 --- a/parquet/tests/arrow_reader/io/mod.rs +++ b/parquet/tests/arrow_reader/io/mod.rs @@ -286,11 +286,7 @@ impl TestRowGroups { .enumerate() .map(|(col_idx, col_meta)| { let column_name = col_meta.column_descr().name().to_string(); - let page_locations = offset_index[rg_index][col_idx] - .page_locations() - .iter() - .map(parquet::format::PageLocation::from) - .collect(); + let page_locations = offset_index[rg_index][col_idx].page_locations(); let dictionary_page_location = col_meta.dictionary_page_offset(); // We can find the byte range of the entire column chunk From 9f01b6076782a8f98f532291d10c1d3a7e0fcbd1 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 17:21:17 -0700 Subject: [PATCH 26/46] fix bug in struct macro --- parquet/src/file/page_index/offset_index.rs | 40 --------------------- parquet/src/parquet_macros.rs | 2 +- 2 files changed, 1 insertion(+), 41 deletions(-) diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs index b2842a897ebf..ac2620af09d8 100644 --- a/parquet/src/file/page_index/offset_index.rs +++ b/parquet/src/file/page_index/offset_index.rs @@ -44,23 +44,6 @@ pub struct PageLocation { } ); -/*impl WriteThrift for PageLocation { - const ELEMENT_TYPE: ElementType = ElementType::Struct; - - #[allow(unused_assignments)] - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - let mut last_field_id = 0i16; - last_field_id = self.offset.write_thrift_field(writer, 1, last_field_id)?; - last_field_id = self - .compressed_page_size - .write_thrift_field(writer, 2, last_field_id)?; - last_field_id = self - .first_row_index - .write_thrift_field(writer, 3, last_field_id)?; - writer.write_struct_end() - } -}*/ - impl From<&crate::format::PageLocation> for PageLocation { fn from(value: &crate::format::PageLocation) -> Self { Self { @@ -95,29 +78,6 @@ pub struct OffsetIndexMetaData { } ); -/*impl WriteThrift for OffsetIndexMetaData { - const ELEMENT_TYPE: ElementType = ElementType::Struct; - - #[allow(unused_assignments)] - fn write_thrift( - &self, - writer: &mut crate::parquet_thrift::ThriftCompactOutputProtocol, - ) -> Result<()> { - let mut last_field_id = 0i16; - last_field_id = self - .page_locations - .write_thrift_field(writer, 1, 
last_field_id)?; - if self.unencoded_byte_array_data_bytes.is_some() { - last_field_id = self - .unencoded_byte_array_data_bytes - .as_ref() - .unwrap() - .write_thrift_field(writer, 2, last_field_id)?; - } - writer.write_struct_end() - } -}*/ - impl OffsetIndexMetaData { /// Creates a new [`OffsetIndexMetaData`] from an [`OffsetIndex`]. /// diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index c015a8685651..5573c5812946 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -272,7 +272,7 @@ macro_rules! thrift_struct { fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { let mut last_field_id = 0i16; $($crate::__thrift_write_required_or_optional_field!($required_or_optional $field_name, $field_id, $field_type, self, writer, last_field_id);)* - Ok(()) + writer.write_struct_end() } } From 2511f8fe0f8a4d598fe2ec5f519aa31447b8859d Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 18:19:40 -0700 Subject: [PATCH 27/46] make Repetition public --- parquet/src/basic.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 3d774861c2a8..cf451b961f69 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -605,7 +605,8 @@ enum FieldRepetitionType { } ); -pub(crate) type Repetition = FieldRepetitionType; +/// Type alias for thrift `FieldRepetitionType` +pub type Repetition = FieldRepetitionType; // ---------------------------------------------------------------------- // Mirrors thrift enum `crate::format::Encoding` From 61e9e07655e16cd34e39ab3226b37fbaa2c61e10 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 22:32:36 -0700 Subject: [PATCH 28/46] get union working for writes --- parquet/src/file/column_crypto_metadata.rs | 51 +++++++++------------- parquet/src/parquet_macros.rs | 50 +++++++++++++++++---- 2 files changed, 63 insertions(+), 38 deletions(-) diff --git a/parquet/src/file/column_crypto_metadata.rs b/parquet/src/file/column_crypto_metadata.rs index ec2e91beaa54..5bba07357947 100644 --- a/parquet/src/file/column_crypto_metadata.rs +++ b/parquet/src/file/column_crypto_metadata.rs @@ -53,36 +53,6 @@ union ColumnCryptoMetaData { } ); -// TODO: need to get this into the thrift_union macro -impl WriteThrift for ColumnCryptoMetaData { - const ELEMENT_TYPE: ElementType = ElementType::Struct; - - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { - match self { - Self::ENCRYPTION_WITH_FOOTER_KEY => { - writer.write_empty_struct(1, 0)?; - } - Self::ENCRYPTION_WITH_COLUMN_KEY(key) => { - key.write_thrift_field(writer, 2, 0)?; - } - } - writer.write_struct_end() - } -} - -impl WriteThriftField for ColumnCryptoMetaData { - fn write_thrift_field( - &self, - writer: &mut ThriftCompactOutputProtocol, - field_id: i16, - last_field_id: i16, - ) -> Result { - writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; - self.write_thrift(writer)?; - Ok(field_id) - } -} - /// Converts Thrift definition into `ColumnCryptoMetadata`. 
pub fn try_from_thrift( thrift_column_crypto_metadata: &TColumnCryptoMetaData, @@ -119,6 +89,7 @@ pub fn to_thrift(column_crypto_metadata: &ColumnCryptoMetaData) -> TColumnCrypto #[cfg(test)] mod tests { use super::*; + use crate::parquet_thrift::tests::test_roundtrip; #[test] fn test_encryption_with_footer_key_from_thrift() { @@ -136,4 +107,24 @@ mod tests { assert_eq!(try_from_thrift(&to_thrift(&metadata)).unwrap(), metadata); } + + #[test] + fn test_column_crypto_roundtrip() { + test_roundtrip(ColumnCryptoMetaData::ENCRYPTION_WITH_FOOTER_KEY); + + let path_in_schema = vec!["foo".to_owned(), "bar".to_owned(), "really".to_owned()]; + let key_metadata = vec![1u8; 32]; + test_roundtrip(ColumnCryptoMetaData::ENCRYPTION_WITH_COLUMN_KEY( + EncryptionWithColumnKey { + path_in_schema: path_in_schema.clone(), + key_metadata: None, + }, + )); + test_roundtrip(ColumnCryptoMetaData::ENCRYPTION_WITH_COLUMN_KEY( + EncryptionWithColumnKey { + path_in_schema, + key_metadata: Some(key_metadata), + }, + )); + } } diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index 5573c5812946..41a5bf3b43f9 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -219,9 +219,51 @@ macro_rules! thrift_union { Ok(ret) } } + + impl WriteThrift for $identifier { + const ELEMENT_TYPE: ElementType = ElementType::Struct; + + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + match self { + $($crate::__thrift_write_variant_lhs!($field_name $($field_type)?, variant_val) => + $crate::__thrift_write_variant_rhs!($field_id $($field_type)?, writer, variant_val),)* + }; + writer.write_struct_end() + } + } + + impl WriteThriftField for $identifier { + fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { + writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; + self.write_thrift(writer)?; + Ok(field_id) + } + } } } +#[doc(hidden)] +#[macro_export] +macro_rules! __thrift_write_variant_lhs { + ($field_name:ident $field_type:ident, $val:tt) => { + Self::$field_name($val) + }; + ($field_name:ident, $val:tt) => { + Self::$field_name + }; +} + +#[doc(hidden)] +#[macro_export] +macro_rules! __thrift_write_variant_rhs { + ($field_id:literal $field_type:ident, $writer:tt, $val:ident) => { + $val.write_thrift_field($writer, $field_id, 0)? + }; + ($field_id:literal, $writer:tt, $val:tt) => { + $writer.write_empty_struct($field_id, 0)? + }; +} + /// macro to generate rust structs from a thrift struct definition /// unlike enum and union, this macro will allow for visibility specifier /// can also take optional lifetime for struct and elements within it (need e.g.) @@ -347,14 +389,6 @@ macro_rules! __thrift_write_optional_field { }; } -/// macro to use when decoding struct fields -#[macro_export] -macro_rules! thrift_read_field { - ($field_name:ident, $prot:tt, $field_type:ident) => { - $field_name = Some($crate::__thrift_read_field!($prot, $field_type)); - }; -} - #[doc(hidden)] #[macro_export] macro_rules! 
__thrift_required_or_optional { From e39f119d30fca1ec7b87acfc56a1aa5c4ef41626 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 22:49:34 -0700 Subject: [PATCH 29/46] add some tests --- parquet/src/file/metadata/thrift_gen.rs | 42 +++++++++++++++++++++++++ parquet/src/parquet_thrift.rs | 2 +- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index bc3914112d0d..161f792084f7 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -511,3 +511,45 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ParquetMetaData { Ok(ParquetMetaData::new(fmd, row_groups)) } } + +#[cfg(test)] +mod tests { + use crate::file::metadata::thrift_gen::BoundingBox; + use crate::parquet_thrift::{tests::test_roundtrip, OrderedF64}; + + #[test] + fn test_bounding_box_roundtrip() { + test_roundtrip(BoundingBox { + xmin: OrderedF64(0.1), + xmax: OrderedF64(10.3), + ymin: OrderedF64(0.001), + ymax: OrderedF64(128.5), + zmin: None, + zmax: None, + mmin: None, + mmax: None, + }); + + test_roundtrip(BoundingBox { + xmin: OrderedF64(0.1), + xmax: OrderedF64(10.3), + ymin: OrderedF64(0.001), + ymax: OrderedF64(128.5), + zmin: Some(OrderedF64(11.0)), + zmax: Some(OrderedF64(1300.0)), + mmin: None, + mmax: None, + }); + + test_roundtrip(BoundingBox { + xmin: OrderedF64(0.1), + xmax: OrderedF64(10.3), + ymin: OrderedF64(0.001), + ymax: OrderedF64(128.5), + zmin: Some(OrderedF64(11.0)), + zmax: Some(OrderedF64(1300.0)), + mmin: Some(OrderedF64(3.14)), + mmax: Some(OrderedF64(42.0)), + }); + } +} diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index b2cb7bf54597..935965b64abd 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -29,7 +29,7 @@ use crate::errors::{ParquetError, Result}; // wrappers out there that should probably be used instead. 
// thrift seems to re-export an impl from ordered-float #[derive(Debug, Clone, Copy, PartialEq)] -pub struct OrderedF64(f64); +pub struct OrderedF64(pub f64); impl From for f64 { fn from(value: OrderedF64) -> Self { From def3d07fa516bfadbf5d7bd35ffa6a4c0b427994 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 22:55:55 -0700 Subject: [PATCH 30/46] redo OrderedF64 initialization --- parquet/src/file/metadata/thrift_gen.rs | 36 ++++++++++++------------- parquet/src/parquet_thrift.rs | 8 +++++- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 161f792084f7..c553d8f5f572 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -520,10 +520,10 @@ mod tests { #[test] fn test_bounding_box_roundtrip() { test_roundtrip(BoundingBox { - xmin: OrderedF64(0.1), - xmax: OrderedF64(10.3), - ymin: OrderedF64(0.001), - ymax: OrderedF64(128.5), + xmin: 0.1.into(), + xmax: 10.3.into(), + ymin: 0.001.into(), + ymax: 128.5.into(), zmin: None, zmax: None, mmin: None, @@ -531,25 +531,25 @@ mod tests { }); test_roundtrip(BoundingBox { - xmin: OrderedF64(0.1), - xmax: OrderedF64(10.3), - ymin: OrderedF64(0.001), - ymax: OrderedF64(128.5), - zmin: Some(OrderedF64(11.0)), - zmax: Some(OrderedF64(1300.0)), + xmin: 0.1.into(), + xmax: 10.3.into(), + ymin: 0.001.into(), + ymax: 128.5.into(), + zmin: Some(11.0.into()), + zmax: Some(1300.0.into()), mmin: None, mmax: None, }); test_roundtrip(BoundingBox { - xmin: OrderedF64(0.1), - xmax: OrderedF64(10.3), - ymin: OrderedF64(0.001), - ymax: OrderedF64(128.5), - zmin: Some(OrderedF64(11.0)), - zmax: Some(OrderedF64(1300.0)), - mmin: Some(OrderedF64(3.14)), - mmax: Some(OrderedF64(42.0)), + xmin: 0.1.into(), + xmax: 10.3.into(), + ymin: 0.001.into(), + ymax: 128.5.into(), + zmin: Some(11.0.into()), + zmax: Some(1300.0.into()), + mmin: Some(3.14.into()), + mmax: Some(42.0.into()), }); } } diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index 935965b64abd..4f04d990860e 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -29,7 +29,13 @@ use crate::errors::{ParquetError, Result}; // wrappers out there that should probably be used instead. 
// thrift seems to re-export an impl from ordered-float #[derive(Debug, Clone, Copy, PartialEq)] -pub struct OrderedF64(pub f64); +pub struct OrderedF64(f64); + +impl From for OrderedF64 { + fn from(value: f64) -> Self { + Self(value) + } +} impl From for f64 { fn from(value: OrderedF64) -> Self { From 386f222f79fdee635c1f696942fca6c969d2365b Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 25 Aug 2025 22:57:09 -0700 Subject: [PATCH 31/46] unused import --- parquet/src/file/metadata/thrift_gen.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index c553d8f5f572..1dc829e5cfe2 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -515,7 +515,7 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ParquetMetaData { #[cfg(test)] mod tests { use crate::file::metadata::thrift_gen::BoundingBox; - use crate::parquet_thrift::{tests::test_roundtrip, OrderedF64}; + use crate::parquet_thrift::tests::test_roundtrip; #[test] fn test_bounding_box_roundtrip() { From 6beb79d4fbe11fd570c4144c5bd592f434c7785d Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 26 Aug 2025 12:12:21 -0700 Subject: [PATCH 32/46] get decryption working --- parquet/src/file/metadata/mod.rs | 94 +--------- parquet/src/file/metadata/reader.rs | 217 +++------------------- parquet/src/file/metadata/thrift_gen.rs | 234 +++++++++++++++++++++++- parquet/src/file/serialized_reader.rs | 2 +- parquet/src/parquet_macros.rs | 15 +- parquet/tests/arrow_reader/bad_data.rs | 2 +- 6 files changed, 268 insertions(+), 296 deletions(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 8b06fe676308..0c4372e38683 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -97,10 +97,7 @@ pub(crate) mod thrift_gen; mod writer; #[cfg(feature = "encryption")] -use crate::encryption::{ - decrypt::FileDecryptor, - modules::{create_module_aad, ModuleType}, -}; +use crate::encryption::decrypt::FileDecryptor; #[cfg(feature = "encryption")] use crate::file::column_crypto_metadata::{self, ColumnCryptoMetaData}; pub(crate) use crate::file::metadata::memory::HeapSize; @@ -117,8 +114,6 @@ use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor, Type as SchemaType, }; -#[cfg(feature = "encryption")] -use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; use crate::{ basic::BoundaryOrder, errors::{ParquetError, Result}, @@ -684,93 +679,6 @@ impl RowGroupMetaData { self.file_offset } - /// Method to convert from encrypted Thrift. - #[cfg(feature = "encryption")] - fn from_encrypted_thrift( - schema_descr: SchemaDescPtr, - mut rg: crate::format::RowGroup, - decryptor: Option<&FileDecryptor>, - ) -> Result { - if schema_descr.num_columns() != rg.columns.len() { - return Err(general_err!( - "Column count mismatch. Schema has {} columns while Row Group has {}", - schema_descr.num_columns(), - rg.columns.len() - )); - } - let total_byte_size = rg.total_byte_size; - let num_rows = rg.num_rows; - let mut columns = vec![]; - - for (i, (mut c, d)) in rg - .columns - .drain(0..) - .zip(schema_descr.columns()) - .enumerate() - { - // Read encrypted metadata if it's present and we have a decryptor. 
- if let (true, Some(decryptor)) = (c.encrypted_column_metadata.is_some(), decryptor) { - let column_decryptor = match c.crypto_metadata.as_ref() { - None => { - return Err(general_err!( - "No crypto_metadata is set for column '{}', which has encrypted metadata", - d.path().string() - )); - } - Some(TColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(crypto_metadata)) => { - let column_name = crypto_metadata.path_in_schema.join("."); - decryptor.get_column_metadata_decryptor( - column_name.as_str(), - crypto_metadata.key_metadata.as_deref(), - )? - } - Some(TColumnCryptoMetaData::ENCRYPTIONWITHFOOTERKEY(_)) => { - decryptor.get_footer_decryptor()? - } - }; - - let column_aad = create_module_aad( - decryptor.file_aad(), - ModuleType::ColumnMetaData, - rg.ordinal.unwrap() as usize, - i, - None, - )?; - - let buf = c.encrypted_column_metadata.clone().unwrap(); - let decrypted_cc_buf = column_decryptor - .decrypt(buf.as_slice(), column_aad.as_ref()) - .map_err(|_| { - general_err!( - "Unable to decrypt column '{}', perhaps the column key is wrong?", - d.path().string() - ) - })?; - - let mut prot = TCompactSliceInputProtocol::new(decrypted_cc_buf.as_slice()); - c.meta_data = Some(crate::format::ColumnMetaData::read_from_in_protocol( - &mut prot, - )?); - } - columns.push(ColumnChunkMetaData::from_thrift(d.clone(), c)?); - } - - let sorting_columns = rg.sorting_columns.map(|scs| { - scs.iter() - .map(|sc| sc.into()) - .collect::>() - }); - Ok(RowGroupMetaData { - columns, - num_rows, - sorting_columns, - total_byte_size, - schema_descr, - file_offset: rg.file_offset, - ordinal: rg.ordinal, - }) - } - /// Method to convert from Thrift. pub fn from_thrift( schema_descr: SchemaDescPtr, diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 57cc7c57ac66..ddccf39703bc 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -15,32 +15,25 @@ // specific language governing permissions and limitations // under the License. 
-use std::{io::Read, ops::Range, sync::Arc}; +use std::{io::Read, ops::Range}; -use crate::{ - basic::ColumnOrder, - file::metadata::{FileMetaData, KeyValue}, - parquet_thrift::ThriftCompactInputProtocol, -}; +use crate::parquet_thrift::ThriftCompactInputProtocol; #[cfg(feature = "encryption")] use crate::{ encryption::{ decrypt::{CryptoContext, FileDecryptionProperties, FileDecryptor}, modules::create_footer_aad, }, - format::{EncryptionAlgorithm, FileCryptoMetaData as TFileCryptoMetaData}, + file::metadata::thrift_gen::EncryptionAlgorithm, }; use bytes::Bytes; use crate::errors::{ParquetError, Result}; -use crate::file::metadata::{ColumnChunkMetaData, ParquetMetaData, RowGroupMetaData}; +use crate::file::metadata::{ColumnChunkMetaData, ParquetMetaData}; use crate::file::page_index::column_index::ColumnIndexMetaData; use crate::file::page_index::index_reader::{acc_range, decode_column_index, decode_offset_index}; use crate::file::reader::ChunkReader; use crate::file::{FOOTER_SIZE, PARQUET_MAGIC, PARQUET_MAGIC_ENCR_FOOTER}; -use crate::schema::types; -use crate::schema::types::SchemaDescriptor; -use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; #[cfg(all(feature = "async", feature = "arrow"))] use crate::arrow::async_reader::{MetadataFetch, MetadataSuffixFetch}; @@ -960,17 +953,21 @@ impl ParquetMetaDataReader { encrypted_footer: bool, file_decryption_properties: Option<&FileDecryptionProperties>, ) -> Result { - let mut prot = TCompactSliceInputProtocol::new(buf); + use crate::file::metadata::thrift_gen::parquet_metadata_with_encryption; + + let mut prot = ThriftCompactInputProtocol::new(buf); let mut file_decryptor = None; let decrypted_fmd_buf; if encrypted_footer { if let Some(file_decryption_properties) = file_decryption_properties { - let t_file_crypto_metadata: TFileCryptoMetaData = - TFileCryptoMetaData::read_from_in_protocol(&mut prot) + use crate::file::metadata::thrift_gen::{EncryptionAlgorithm, FileCryptoMetaData}; + + let t_file_crypto_metadata: FileCryptoMetaData = + FileCryptoMetaData::try_from(&mut prot) .map_err(|e| general_err!("Could not parse crypto metadata: {}", e))?; let supply_aad_prefix = match &t_file_crypto_metadata.encryption_algorithm { - EncryptionAlgorithm::AESGCMV1(algo) => algo.supply_aad_prefix, + EncryptionAlgorithm::AES_GCM_V1(algo) => algo.supply_aad_prefix, _ => Some(false), } .unwrap_or(false); @@ -995,7 +992,7 @@ impl ParquetMetaDataReader { "Provided footer key and AAD were unable to decrypt parquet footer" ) })?; - prot = TCompactSliceInputProtocol::new(decrypted_fmd_buf.as_ref()); + prot = ThriftCompactInputProtocol::new(decrypted_fmd_buf.as_ref()); file_decryptor = Some(decryptor); } else { @@ -1003,58 +1000,13 @@ impl ParquetMetaDataReader { } } - let t_file_metadata = crate::format::FileMetaData::read_from_in_protocol(&mut prot) - .map_err(|e| general_err!("Could not parse metadata: {}", e))?; - let schema = types::from_thrift(&t_file_metadata.schema)?; - let schema_descr = Arc::new(SchemaDescriptor::new(schema)); - - if let (Some(algo), Some(file_decryption_properties)) = ( - t_file_metadata.encryption_algorithm, + parquet_metadata_with_encryption( + &mut prot, + file_decryptor, file_decryption_properties, - ) { - // File has a plaintext footer but encryption algorithm is set - let file_decryptor_value = get_file_decryptor( - algo, - t_file_metadata.footer_signing_key_metadata.as_deref(), - file_decryption_properties, - )?; - if file_decryption_properties.check_plaintext_footer_integrity() && !encrypted_footer { - 
file_decryptor_value.verify_plaintext_footer_signature(buf)?; - } - file_decryptor = Some(file_decryptor_value); - } - - let mut row_groups = Vec::new(); - for rg in t_file_metadata.row_groups { - let r = RowGroupMetaData::from_encrypted_thrift( - schema_descr.clone(), - rg, - file_decryptor.as_ref(), - )?; - row_groups.push(r); - } - let column_orders = - Self::parse_column_orders(t_file_metadata.column_orders, &schema_descr)?; - - let key_value_metadata = t_file_metadata.key_value_metadata.map(|vkv| { - vkv.into_iter() - .map(|kv| KeyValue::new(kv.key, kv.value)) - .collect::>() - }); - - let file_metadata = FileMetaData::new( - t_file_metadata.version, - t_file_metadata.num_rows, - t_file_metadata.created_by, - key_value_metadata, - schema_descr, - column_orders, - ); - let mut metadata = ParquetMetaData::new(file_metadata, row_groups); - - metadata.with_file_decryptor(file_decryptor); - - Ok(metadata) + encrypted_footer, + buf, + ) } /// Decodes [`ParquetMetaData`] from the provided bytes. @@ -1065,36 +1017,8 @@ impl ParquetMetaDataReader { /// /// [Parquet Spec]: https://github.com/apache/parquet-format#metadata pub fn decode_metadata(buf: &[u8]) -> Result { - let mut prot = TCompactSliceInputProtocol::new(buf); - - let t_file_metadata = crate::format::FileMetaData::read_from_in_protocol(&mut prot) - .map_err(|e| general_err!("Could not parse metadata: {}", e))?; - let schema = types::from_thrift(&t_file_metadata.schema)?; - let schema_descr = Arc::new(SchemaDescriptor::new(schema)); - - let mut row_groups = Vec::new(); - for rg in t_file_metadata.row_groups { - row_groups.push(RowGroupMetaData::from_thrift(schema_descr.clone(), rg)?); - } - let column_orders = - Self::parse_column_orders(t_file_metadata.column_orders, &schema_descr)?; - - let key_value_metadata = t_file_metadata.key_value_metadata.map(|vkv| { - vkv.into_iter() - .map(|kv| KeyValue::new(kv.key, kv.value)) - .collect::>() - }); - - let file_metadata = FileMetaData::new( - t_file_metadata.version, - t_file_metadata.num_rows, - t_file_metadata.created_by, - key_value_metadata, - schema_descr, - column_orders, - ); - - Ok(ParquetMetaData::new(file_metadata, row_groups)) + let mut prot = ThriftCompactInputProtocol::new(buf); + ParquetMetaData::try_from(&mut prot) } /// create meta data from thrift encoded bytes @@ -1102,55 +1026,25 @@ impl ParquetMetaDataReader { let mut prot = ThriftCompactInputProtocol::new(buf); ParquetMetaData::try_from(&mut prot) } - - /// Parses column orders from Thrift definition. - /// If no column orders are defined, returns `None`. 
- fn parse_column_orders( - t_column_orders: Option>, - schema_descr: &SchemaDescriptor, - ) -> Result>> { - match t_column_orders { - Some(orders) => { - // Should always be the case - if orders.len() != schema_descr.num_columns() { - return Err(general_err!("Column order length mismatch")); - }; - let mut res = Vec::new(); - for (i, column) in schema_descr.columns().iter().enumerate() { - match orders[i] { - crate::format::ColumnOrder::TYPEORDER(_) => { - let sort_order = ColumnOrder::get_sort_order( - column.logical_type(), - column.converted_type(), - column.physical_type(), - ); - res.push(ColumnOrder::TYPE_DEFINED_ORDER(sort_order)); - } - } - } - Ok(Some(res)) - } - None => Ok(None), - } - } } #[cfg(feature = "encryption")] -fn get_file_decryptor( +pub(super) fn get_file_decryptor( encryption_algorithm: EncryptionAlgorithm, footer_key_metadata: Option<&[u8]>, file_decryption_properties: &FileDecryptionProperties, ) -> Result { match encryption_algorithm { - EncryptionAlgorithm::AESGCMV1(algo) => { + EncryptionAlgorithm::AES_GCM_V1(algo) => { let aad_file_unique = algo .aad_file_unique .ok_or_else(|| general_err!("AAD unique file identifier is not set"))?; let aad_prefix = if let Some(aad_prefix) = file_decryption_properties.aad_prefix() { aad_prefix.clone() } else { - algo.aad_prefix.unwrap_or_default() + algo.aad_prefix.map(|v| v.to_vec()).unwrap_or_default() }; + let aad_file_unique = aad_file_unique.to_vec(); FileDecryptor::new( file_decryption_properties, @@ -1159,7 +1053,7 @@ fn get_file_decryptor( aad_prefix, ) } - EncryptionAlgorithm::AESGCMCTRV1(_) => Err(nyi_err!( + EncryptionAlgorithm::AES_GCM_CTR_V1(_) => Err(nyi_err!( "The AES_GCM_CTR_V1 encryption algorithm is not yet supported" )), } @@ -1171,10 +1065,7 @@ mod tests { use bytes::Bytes; use zstd::zstd_safe::WriteBuf; - use crate::basic::SortOrder; - use crate::basic::Type; use crate::file::reader::Length; - use crate::schema::types::Type as SchemaType; use crate::util::test_common::file_util::get_test_file; #[test] @@ -1205,61 +1096,6 @@ mod tests { assert!(matches!(err, ParquetError::NeedMoreData(263))); } - #[test] - fn test_metadata_column_orders_parse() { - // Define simple schema, we do not need to provide logical types. - let fields = vec![ - Arc::new( - SchemaType::primitive_type_builder("col1", Type::INT32) - .build() - .unwrap(), - ), - Arc::new( - SchemaType::primitive_type_builder("col2", Type::FLOAT) - .build() - .unwrap(), - ), - ]; - let schema = SchemaType::group_type_builder("schema") - .with_fields(fields) - .build() - .unwrap(); - let schema_descr = SchemaDescriptor::new(Arc::new(schema)); - - let t_column_orders = Some(vec![ - crate::format::ColumnOrder::TYPEORDER(Default::default()), - crate::format::ColumnOrder::TYPEORDER(Default::default()), - ]); - - assert_eq!( - ParquetMetaDataReader::parse_column_orders(t_column_orders, &schema_descr).unwrap(), - Some(vec![ - ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED), - ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED) - ]) - ); - - // Test when no column orders are defined. 
- assert_eq!( - ParquetMetaDataReader::parse_column_orders(None, &schema_descr).unwrap(), - None - ); - } - - #[test] - fn test_metadata_column_orders_len_mismatch() { - let schema = SchemaType::group_type_builder("schema").build().unwrap(); - let schema_descr = SchemaDescriptor::new(Arc::new(schema)); - - let t_column_orders = Some(vec![crate::format::ColumnOrder::TYPEORDER( - Default::default(), - )]); - - let res = ParquetMetaDataReader::parse_column_orders(t_column_orders, &schema_descr); - assert!(res.is_err()); - assert!(format!("{:?}", res.unwrap_err()).contains("Column order length mismatch")); - } - #[test] #[allow(deprecated)] fn test_try_parse() { @@ -1412,6 +1248,7 @@ mod async_tests { use std::io::{Read, Seek, SeekFrom}; use std::ops::Range; use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::Arc; use tempfile::NamedTempFile; use crate::arrow::ArrowWriter; diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 1dc829e5cfe2..60e9b5036916 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -20,8 +20,6 @@ use std::io::Write; use std::sync::Arc; -#[cfg(feature = "encryption")] -use crate::file::column_crypto_metadata::ColumnCryptoMetaData; use crate::{ basic::{ColumnOrder, Compression, ConvertedType, Encoding, LogicalType, Repetition, Type}, data_type::{ByteArray, FixedLenByteArray, Int96}, @@ -39,9 +37,15 @@ use crate::{ WriteThrift, WriteThriftField, }, schema::types::{parquet_schema_from_array, ColumnDescriptor, SchemaDescriptor}, - thrift_struct, + thrift_struct, thrift_union, util::bit_util::FromBytes, }; +#[cfg(feature = "encryption")] +use crate::{ + encryption::decrypt::{FileDecryptionProperties, FileDecryptor}, + file::column_crypto_metadata::ColumnCryptoMetaData, + schema::types::SchemaDescPtr, +}; // this needs to be visible to the schema conversion code thrift_struct!( @@ -60,6 +64,56 @@ pub(crate) struct SchemaElement<'a> { } ); +thrift_struct!( +pub(crate) struct AesGcmV1<'a> { + /// AAD prefix + 1: optional binary<'a> aad_prefix + + /// Unique file identifier part of AAD suffix + 2: optional binary<'a> aad_file_unique + + /// In files encrypted with AAD prefix without storing it, + /// readers must supply the prefix + 3: optional bool supply_aad_prefix +} +); + +thrift_struct!( +pub(crate) struct AesGcmCtrV1<'a> { + /// AAD prefix + 1: optional binary<'a> aad_prefix + + /// Unique file identifier part of AAD suffix + 2: optional binary<'a> aad_file_unique + + /// In files encrypted with AAD prefix without storing it, + /// readers must supply the prefix + 3: optional bool supply_aad_prefix +} +); + +thrift_union!( +union EncryptionAlgorithm<'a> { + 1: (AesGcmV1<'a>) AES_GCM_V1 + 2: (AesGcmCtrV1<'a>) AES_GCM_CTR_V1 +} +); + +#[cfg(feature = "encryption")] +thrift_struct!( +/// Crypto metadata for files with encrypted footer +pub(crate) struct FileCryptoMetaData<'a> { + /// Encryption algorithm. This field is only used for files + /// with encrypted footer. Files with plaintext footer store algorithm id + /// inside footer (FileMetaData structure). 
+ 1: required EncryptionAlgorithm<'a> encryption_algorithm + + /** Retrieval metadata of key used for encryption of footer, + * and (possibly) columns **/ + 2: optional binary<'a> key_metadata +} +); + // the following are only used internally so are private thrift_struct!( struct FileMetaData<'a> { @@ -71,8 +125,8 @@ struct FileMetaData<'a> { 5: optional list key_value_metadata 6: optional string created_by 7: optional list column_orders; - //8: optional EncryptionAlgorithm encryption_algorithm - //9: optional binary footer_signing_key_metadata + 8: optional EncryptionAlgorithm<'a> encryption_algorithm + 9: optional binary<'a> footer_signing_key_metadata } ); @@ -453,6 +507,176 @@ fn convert_stats( }) } +#[cfg(feature = "encryption")] +fn row_group_from_encrypted_thrift( + mut rg: RowGroup, + schema_descr: SchemaDescPtr, + decryptor: Option<&FileDecryptor>, +) -> Result { + if schema_descr.num_columns() != rg.columns.len() { + return Err(general_err!( + "Column count mismatch. Schema has {} columns while Row Group has {}", + schema_descr.num_columns(), + rg.columns.len() + )); + } + let total_byte_size = rg.total_byte_size; + let num_rows = rg.num_rows; + let mut columns = vec![]; + + for (i, (mut c, d)) in rg + .columns + .drain(0..) + .zip(schema_descr.columns()) + .enumerate() + { + // Read encrypted metadata if it's present and we have a decryptor. + if let (true, Some(decryptor)) = (c.encrypted_column_metadata.is_some(), decryptor) { + let column_decryptor = match c.crypto_metadata.as_ref() { + None => { + return Err(general_err!( + "No crypto_metadata is set for column '{}', which has encrypted metadata", + d.path().string() + )); + } + Some(ColumnCryptoMetaData::ENCRYPTION_WITH_COLUMN_KEY(crypto_metadata)) => { + let column_name = crypto_metadata.path_in_schema.join("."); + decryptor.get_column_metadata_decryptor( + column_name.as_str(), + crypto_metadata.key_metadata.as_deref(), + )? + } + Some(ColumnCryptoMetaData::ENCRYPTION_WITH_FOOTER_KEY) => { + decryptor.get_footer_decryptor()? 
+ } + }; + + let column_aad = crate::encryption::modules::create_module_aad( + decryptor.file_aad(), + crate::encryption::modules::ModuleType::ColumnMetaData, + rg.ordinal.unwrap() as usize, + i, + None, + )?; + + let buf = c.encrypted_column_metadata.unwrap(); + let decrypted_cc_buf = + column_decryptor + .decrypt(buf, column_aad.as_ref()) + .map_err(|_| { + general_err!( + "Unable to decrypt column '{}', perhaps the column key is wrong?", + d.path().string() + ) + })?; + + let mut prot = ThriftCompactInputProtocol::new(decrypted_cc_buf.as_slice()); + let col_meta = ColumnMetaData::try_from(&mut prot)?; + c.meta_data = Some(col_meta); + columns.push(convert_column(c, d.clone())?); + } else { + columns.push(convert_column(c, d.clone())?); + } + } + + let sorting_columns = rg.sorting_columns; + let file_offset = rg.file_offset; + let ordinal = rg.ordinal; + + Ok(RowGroupMetaData { + columns, + num_rows, + sorting_columns, + total_byte_size, + schema_descr, + file_offset, + ordinal, + }) +} + +#[cfg(feature = "encryption")] +pub(crate) fn parquet_metadata_with_encryption<'a>( + prot: &mut ThriftCompactInputProtocol<'a>, + mut file_decryptor: Option, + file_decryption_properties: Option<&FileDecryptionProperties>, + encrypted_footer: bool, + buf: &[u8], +) -> Result { + let file_meta = super::thrift_gen::FileMetaData::try_from(prot) + .map_err(|e| general_err!("Could not parse metadata: {}", e))?; + + let version = file_meta.version; + let num_rows = file_meta.num_rows; + let created_by = file_meta.created_by.map(|c| c.to_owned()); + let key_value_metadata = file_meta.key_value_metadata; + + let val = parquet_schema_from_array(file_meta.schema)?; + let schema_descr = Arc::new(SchemaDescriptor::new(val)); + + if let (Some(algo), Some(file_decryption_properties)) = + (file_meta.encryption_algorithm, file_decryption_properties) + { + // File has a plaintext footer but encryption algorithm is set + let file_decryptor_value = crate::file::metadata::reader::get_file_decryptor( + algo, + file_meta.footer_signing_key_metadata.as_deref(), + file_decryption_properties, + )?; + if file_decryption_properties.check_plaintext_footer_integrity() && !encrypted_footer { + file_decryptor_value.verify_plaintext_footer_signature(buf)?; + } + file_decryptor = Some(file_decryptor_value); + } + + // decrypt column chunk info + let mut row_groups = Vec::with_capacity(file_meta.row_groups.len()); + for rg in file_meta.row_groups { + let r = row_group_from_encrypted_thrift(rg, schema_descr.clone(), file_decryptor.as_ref())?; + row_groups.push(r); + } + + // need to map read column orders to actual values based on the schema + if file_meta + .column_orders + .as_ref() + .is_some_and(|cos| cos.len() != schema_descr.num_columns()) + { + return Err(general_err!("Column order length mismatch")); + } + + let column_orders = file_meta.column_orders.map(|cos| { + let mut res = Vec::with_capacity(cos.len()); + for (i, column) in schema_descr.columns().iter().enumerate() { + match cos[i] { + ColumnOrder::TYPE_DEFINED_ORDER(_) => { + let sort_order = ColumnOrder::get_sort_order( + column.logical_type(), + column.converted_type(), + column.physical_type(), + ); + res.push(ColumnOrder::TYPE_DEFINED_ORDER(sort_order)); + } + _ => res.push(cos[i]), + } + } + res + }); + + let fmd = crate::file::metadata::FileMetaData::new( + version, + num_rows, + created_by, + key_value_metadata, + schema_descr, + column_orders, + ); + let mut metadata = ParquetMetaData::new(fmd, row_groups); + + metadata.with_file_decryptor(file_decryptor); + + 
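    // (The decryptor is retained on the returned metadata, presumably so that
    // readers of the still-encrypted column data can reuse it later.)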
Ok(metadata) +} + /// Create ParquetMetaData from thrift input. Note that this only decodes the file metadata in /// the Parquet footer. Page indexes will need to be added later. impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ParquetMetaData { diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 5308825b0976..335f0bc3601b 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -1877,7 +1877,7 @@ mod tests { let ret = SerializedFileReader::new(Bytes::copy_from_slice(&data)); assert_eq!( ret.err().unwrap().to_string(), - "Parquet error: Could not parse metadata: bad data" + "Parquet error: Could not parse metadata: Parquet error: Received empty union from remote ColumnOrder" ); } diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index 41a5bf3b43f9..bbce3918b74c 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -181,17 +181,17 @@ macro_rules! thrift_union_all_empty { #[macro_export] #[allow(clippy::crate_in_macro_def)] macro_rules! thrift_union { - ($(#[$($def_attrs:tt)*])* union $identifier:ident { $($(#[$($field_attrs:tt)*])* $field_id:literal : $( ( $field_type:ident $(< $element_type:ident >)? ) )? $field_name:ident $(;)?)* }) => { + ($(#[$($def_attrs:tt)*])* union $identifier:ident $(< $lt:lifetime >)? { $($(#[$($field_attrs:tt)*])* $field_id:literal : $( ( $field_type:ident $(< $element_type:ident >)? $(< $field_lt:lifetime >)?) )? $field_name:ident $(;)?)* }) => { $(#[cfg_attr(not(doctest), $($def_attrs)*)])* #[derive(Clone, Debug, Eq, PartialEq)] #[allow(non_camel_case_types)] #[allow(non_snake_case)] #[allow(missing_docs)] - pub enum $identifier { - $($(#[cfg_attr(not(doctest), $($field_attrs)*)])* $field_name $( ( $crate::__thrift_union_type!{$field_type $($element_type)?} ) )?),* + pub enum $identifier $(<$lt>)? { + $($(#[cfg_attr(not(doctest), $($field_attrs)*)])* $field_name $( ( $crate::__thrift_union_type!{$field_type $($field_lt)? $($element_type)?} ) )?),* } - impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for $identifier { + impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for $identifier $(<$lt>)? { type Error = ParquetError; fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { @@ -220,7 +220,7 @@ macro_rules! thrift_union { } } - impl WriteThrift for $identifier { + impl<$($lt,)? W: Write> WriteThrift for $identifier $(<$lt>)? { const ELEMENT_TYPE: ElementType = ElementType::Struct; fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { @@ -232,7 +232,7 @@ macro_rules! thrift_union { } } - impl WriteThriftField for $identifier { + impl<$($lt,)? W: Write> WriteThriftField for $identifier $(<$lt>)? { fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; self.write_thrift(writer)?; @@ -466,6 +466,9 @@ macro_rules! __thrift_field_type { #[doc(hidden)] #[macro_export] macro_rules! 
__thrift_union_type { + (binary $lt:lifetime) => { &$lt [u8] }; + (string $lt:lifetime) => { &$lt str }; + ($field_type:ident $lt:lifetime) => { $field_type<$lt> }; ($field_type:ident) => { $field_type }; (list $field_type:ident) => { Vec<$field_type> }; } diff --git a/parquet/tests/arrow_reader/bad_data.rs b/parquet/tests/arrow_reader/bad_data.rs index 619bbb862fe1..58e342ab39d1 100644 --- a/parquet/tests/arrow_reader/bad_data.rs +++ b/parquet/tests/arrow_reader/bad_data.rs @@ -82,7 +82,7 @@ fn test_parquet_1481() { let err = read_file("PARQUET-1481.parquet").unwrap_err(); assert_eq!( err.to_string(), - "Parquet error: Unexpected parquet Type: -7" + "Parquet error: Could not parse metadata: Parquet error: Unexpected Type -7" ); } From 1eaa17b1bd629748eeebd17319c295f2f71f6bb4 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 26 Aug 2025 12:53:42 -0700 Subject: [PATCH 33/46] refactor and clippy fixes --- parquet/benches/metadata.rs | 12 --- parquet/src/file/metadata/reader.rs | 120 +----------------------- parquet/src/file/metadata/thrift_gen.rs | 88 +++++++++++++++-- parquet/src/parquet_thrift.rs | 4 +- 4 files changed, 86 insertions(+), 138 deletions(-) diff --git a/parquet/benches/metadata.rs b/parquet/benches/metadata.rs index 3c293462a157..151d928957ff 100644 --- a/parquet/benches/metadata.rs +++ b/parquet/benches/metadata.rs @@ -211,12 +211,6 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); - c.bench_function("decode parquet metadata new", |b| { - b.iter(|| { - ParquetMetaDataReader::decode_file_metadata(&meta_data).unwrap(); - }) - }); - let buf: Bytes = black_box(encoded_meta()).into(); c.bench_function("decode parquet metadata (wide)", |b| { b.iter(|| { @@ -230,12 +224,6 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); - c.bench_function("decode parquet metadata new (wide)", |b| { - b.iter(|| { - ParquetMetaDataReader::decode_file_metadata(&buf).unwrap(); - }) - }); - // rewrite file with page statistics. then read page headers. 
#[cfg(feature = "arrow")] let (file_bytes, metadata) = rewrite_file(data.clone()); diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index ddccf39703bc..7ab2db2f7ff3 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -17,15 +17,9 @@ use std::{io::Read, ops::Range}; -use crate::parquet_thrift::ThriftCompactInputProtocol; #[cfg(feature = "encryption")] -use crate::{ - encryption::{ - decrypt::{CryptoContext, FileDecryptionProperties, FileDecryptor}, - modules::create_footer_aad, - }, - file::metadata::thrift_gen::EncryptionAlgorithm, -}; +use crate::encryption::decrypt::{CryptoContext, FileDecryptionProperties}; +use crate::parquet_thrift::ThriftCompactInputProtocol; use bytes::Bytes; use crate::errors::{ParquetError, Result}; @@ -953,56 +947,7 @@ impl ParquetMetaDataReader { encrypted_footer: bool, file_decryption_properties: Option<&FileDecryptionProperties>, ) -> Result { - use crate::file::metadata::thrift_gen::parquet_metadata_with_encryption; - - let mut prot = ThriftCompactInputProtocol::new(buf); - let mut file_decryptor = None; - let decrypted_fmd_buf; - - if encrypted_footer { - if let Some(file_decryption_properties) = file_decryption_properties { - use crate::file::metadata::thrift_gen::{EncryptionAlgorithm, FileCryptoMetaData}; - - let t_file_crypto_metadata: FileCryptoMetaData = - FileCryptoMetaData::try_from(&mut prot) - .map_err(|e| general_err!("Could not parse crypto metadata: {}", e))?; - let supply_aad_prefix = match &t_file_crypto_metadata.encryption_algorithm { - EncryptionAlgorithm::AES_GCM_V1(algo) => algo.supply_aad_prefix, - _ => Some(false), - } - .unwrap_or(false); - if supply_aad_prefix && file_decryption_properties.aad_prefix().is_none() { - return Err(general_err!( - "Parquet file was encrypted with an AAD prefix that is not stored in the file, \ - but no AAD prefix was provided in the file decryption properties" - )); - } - let decryptor = get_file_decryptor( - t_file_crypto_metadata.encryption_algorithm, - t_file_crypto_metadata.key_metadata.as_deref(), - file_decryption_properties, - )?; - let footer_decryptor = decryptor.get_footer_decryptor(); - let aad_footer = create_footer_aad(decryptor.file_aad())?; - - decrypted_fmd_buf = footer_decryptor? 
- .decrypt(prot.as_slice().as_ref(), aad_footer.as_ref()) - .map_err(|_| { - general_err!( - "Provided footer key and AAD were unable to decrypt parquet footer" - ) - })?; - prot = ThriftCompactInputProtocol::new(decrypted_fmd_buf.as_ref()); - - file_decryptor = Some(decryptor); - } else { - return Err(general_err!("Parquet file has an encrypted footer but decryption properties were not provided")); - } - } - - parquet_metadata_with_encryption( - &mut prot, - file_decryptor, + super::thrift_gen::parquet_metadata_with_encryption( file_decryption_properties, encrypted_footer, buf, @@ -1020,50 +965,12 @@ impl ParquetMetaDataReader { let mut prot = ThriftCompactInputProtocol::new(buf); ParquetMetaData::try_from(&mut prot) } - - /// create meta data from thrift encoded bytes - pub fn decode_file_metadata(buf: &[u8]) -> Result { - let mut prot = ThriftCompactInputProtocol::new(buf); - ParquetMetaData::try_from(&mut prot) - } -} - -#[cfg(feature = "encryption")] -pub(super) fn get_file_decryptor( - encryption_algorithm: EncryptionAlgorithm, - footer_key_metadata: Option<&[u8]>, - file_decryption_properties: &FileDecryptionProperties, -) -> Result { - match encryption_algorithm { - EncryptionAlgorithm::AES_GCM_V1(algo) => { - let aad_file_unique = algo - .aad_file_unique - .ok_or_else(|| general_err!("AAD unique file identifier is not set"))?; - let aad_prefix = if let Some(aad_prefix) = file_decryption_properties.aad_prefix() { - aad_prefix.clone() - } else { - algo.aad_prefix.map(|v| v.to_vec()).unwrap_or_default() - }; - let aad_file_unique = aad_file_unique.to_vec(); - - FileDecryptor::new( - file_decryption_properties, - footer_key_metadata, - aad_file_unique, - aad_prefix, - ) - } - EncryptionAlgorithm::AES_GCM_CTR_V1(_) => Err(nyi_err!( - "The AES_GCM_CTR_V1 encryption algorithm is not yet supported" - )), - } } #[cfg(test)] mod tests { use super::*; use bytes::Bytes; - use zstd::zstd_safe::WriteBuf; use crate::file::reader::Length; use crate::util::test_common::file_util::get_test_file; @@ -1210,27 +1117,6 @@ mod tests { "EOF: Parquet file too small. 
Size is 1728 but need 1729" ); } - - #[test] - fn test_new_decoder() { - let file = get_test_file("alltypes_tiny_pages.parquet"); - let len = file.len(); - - // read entire file - let bytes = file.get_bytes(0, len as usize).unwrap(); - let mut footer = [0u8; FOOTER_SIZE]; - footer.copy_from_slice(bytes.slice(len as usize - FOOTER_SIZE..).as_slice()); - let tail = ParquetMetaDataReader::decode_footer_tail(&footer).unwrap(); - let meta_len = tail.metadata_length(); - let metadata_bytes = bytes.slice(len as usize - FOOTER_SIZE - meta_len..); - - // get ParquetMetaData - let m = ParquetMetaDataReader::decode_file_metadata(&metadata_bytes).unwrap(); - let m2 = ParquetMetaDataReader::decode_metadata(&metadata_bytes).unwrap(); - - // check that metadatas are equivalent - assert_eq!(m, m2); - } } #[cfg(all(feature = "async", feature = "arrow", test))] diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 60e9b5036916..869bdbd20ac8 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -595,14 +595,57 @@ fn row_group_from_encrypted_thrift( } #[cfg(feature = "encryption")] -pub(crate) fn parquet_metadata_with_encryption<'a>( - prot: &mut ThriftCompactInputProtocol<'a>, - mut file_decryptor: Option, +pub(crate) fn parquet_metadata_with_encryption( file_decryption_properties: Option<&FileDecryptionProperties>, encrypted_footer: bool, buf: &[u8], ) -> Result { - let file_meta = super::thrift_gen::FileMetaData::try_from(prot) + let mut prot = ThriftCompactInputProtocol::new(buf); + let mut file_decryptor = None; + let decrypted_fmd_buf; + + if encrypted_footer { + if let Some(file_decryption_properties) = file_decryption_properties { + let t_file_crypto_metadata: FileCryptoMetaData = + FileCryptoMetaData::try_from(&mut prot) + .map_err(|e| general_err!("Could not parse crypto metadata: {}", e))?; + let supply_aad_prefix = match &t_file_crypto_metadata.encryption_algorithm { + EncryptionAlgorithm::AES_GCM_V1(algo) => algo.supply_aad_prefix, + _ => Some(false), + } + .unwrap_or(false); + if supply_aad_prefix && file_decryption_properties.aad_prefix().is_none() { + return Err(general_err!( + "Parquet file was encrypted with an AAD prefix that is not stored in the file, \ + but no AAD prefix was provided in the file decryption properties" + )); + } + let decryptor = get_file_decryptor( + t_file_crypto_metadata.encryption_algorithm, + t_file_crypto_metadata.key_metadata, + file_decryption_properties, + )?; + let footer_decryptor = decryptor.get_footer_decryptor(); + let aad_footer = crate::encryption::modules::create_footer_aad(decryptor.file_aad())?; + + decrypted_fmd_buf = footer_decryptor? 
+ .decrypt(prot.as_slice().as_ref(), aad_footer.as_ref()) + .map_err(|_| { + general_err!( + "Provided footer key and AAD were unable to decrypt parquet footer" + ) + })?; + prot = ThriftCompactInputProtocol::new(decrypted_fmd_buf.as_ref()); + + file_decryptor = Some(decryptor); + } else { + return Err(general_err!( + "Parquet file has an encrypted footer but decryption properties were not provided" + )); + } + } + + let file_meta = super::thrift_gen::FileMetaData::try_from(&mut prot) .map_err(|e| general_err!("Could not parse metadata: {}", e))?; let version = file_meta.version; @@ -617,9 +660,9 @@ pub(crate) fn parquet_metadata_with_encryption<'a>( (file_meta.encryption_algorithm, file_decryption_properties) { // File has a plaintext footer but encryption algorithm is set - let file_decryptor_value = crate::file::metadata::reader::get_file_decryptor( + let file_decryptor_value = get_file_decryptor( algo, - file_meta.footer_signing_key_metadata.as_deref(), + file_meta.footer_signing_key_metadata, file_decryption_properties, )?; if file_decryption_properties.check_plaintext_footer_integrity() && !encrypted_footer { @@ -677,6 +720,37 @@ pub(crate) fn parquet_metadata_with_encryption<'a>( Ok(metadata) } +#[cfg(feature = "encryption")] +pub(super) fn get_file_decryptor( + encryption_algorithm: EncryptionAlgorithm, + footer_key_metadata: Option<&[u8]>, + file_decryption_properties: &FileDecryptionProperties, +) -> Result { + match encryption_algorithm { + EncryptionAlgorithm::AES_GCM_V1(algo) => { + let aad_file_unique = algo + .aad_file_unique + .ok_or_else(|| general_err!("AAD unique file identifier is not set"))?; + let aad_prefix = if let Some(aad_prefix) = file_decryption_properties.aad_prefix() { + aad_prefix.clone() + } else { + algo.aad_prefix.map(|v| v.to_vec()).unwrap_or_default() + }; + let aad_file_unique = aad_file_unique.to_vec(); + + FileDecryptor::new( + file_decryption_properties, + footer_key_metadata, + aad_file_unique, + aad_prefix, + ) + } + EncryptionAlgorithm::AES_GCM_CTR_V1(_) => Err(nyi_err!( + "The AES_GCM_CTR_V1 encryption algorithm is not yet supported" + )), + } +} + /// Create ParquetMetaData from thrift input. Note that this only decodes the file metadata in /// the Parquet footer. Page indexes will need to be added later. 
impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ParquetMetaData { @@ -772,7 +846,7 @@ mod tests { ymax: 128.5.into(), zmin: Some(11.0.into()), zmax: Some(1300.0.into()), - mmin: Some(3.14.into()), + mmin: Some(3.7.into()), mmax: Some(42.0.into()), }); } diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index 4f04d990860e..ac5d72ecdd69 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -665,8 +665,8 @@ where fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_list_begin(T::ELEMENT_TYPE, self.len())?; - for i in 0..self.len() { - self[i].write_thrift(writer)?; + for item in self { + item.write_thrift(writer)?; } Ok(()) } From 713e38abb2ef6b2e406e96fd38d3845b0d2f9084 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 26 Aug 2025 14:49:55 -0700 Subject: [PATCH 34/46] add page header defs --- parquet/src/file/metadata/thrift_gen.rs | 104 +++++++++++++++++++++++- parquet/src/parquet_macros.rs | 1 + 2 files changed, 102 insertions(+), 3 deletions(-) diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 869bdbd20ac8..630126ea8be8 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -21,7 +21,9 @@ use std::io::Write; use std::sync::Arc; use crate::{ - basic::{ColumnOrder, Compression, ConvertedType, Encoding, LogicalType, Repetition, Type}, + basic::{ + ColumnOrder, Compression, ConvertedType, Encoding, LogicalType, PageType, Repetition, Type, + }, data_type::{ByteArray, FixedLenByteArray, Int96}, errors::{ParquetError, Result}, file::{ @@ -64,6 +66,102 @@ pub(crate) struct SchemaElement<'a> { } ); +thrift_struct!( +pub(crate) struct DataPageHeader { + /// Number of values, including NULLs, in this data page. + /// + /// If a OffsetIndex is present, a page must begin at a row + /// boundary (repetition_level = 0). Otherwise, pages may begin + /// within a row (repetition_level > 0). + 1: required i32 num_values + + /// Encoding used for this data page + 2: required Encoding encoding + + /// Encoding used for definition levels + 3: required Encoding definition_level_encoding; + + /// Encoding used for repetition levels + 4: required Encoding repetition_level_encoding; + + // Optional statistics for the data in this page + // page stats are pretty useless...lets ignore them + //5: optional Statistics statistics; +} +); + +thrift_struct!( + pub(crate) struct IndexPageHeader {} +); + +thrift_struct!( +pub(crate) struct DictionaryPageHeader { + /// Number of values in the dictionary + 1: required i32 num_values; + + /// Encoding using this dictionary page + 2: required Encoding encoding + + /// If true, the entries in the dictionary are sorted in ascending order + 3: optional bool is_sorted; +} +); + +thrift_struct!( +pub(crate) struct DataPageHeaderV2 { + /// Number of values, including NULLs, in this data page. + 1: required i32 num_values + /// Number of NULL values, in this data page. + /// Number of non-null = num_values - num_nulls which is also the number of values in the data section + 2: required i32 num_nulls + /// Number of rows in this data page. Every page must begin at a + /// row boundary (repetition_level = 0): rows must **not** be + /// split across page boundaries when using V2 data pages. 
+ 3: required i32 num_rows + /// Encoding used for data in this page + 4: required Encoding encoding + + // repetition levels and definition levels are always using RLE (without size in it) + + /// Length of the definition levels + 5: required i32 definition_levels_byte_length; + /// Length of the repetition levels + 6: required i32 repetition_levels_byte_length; + + /// Whether the values are compressed. + /// Which means the section of the page between + /// definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) + /// is compressed with the compression_codec. + /// If missing it is considered compressed + 7: optional bool is_compressed = true; + + // Optional statistics for the data in this page + //8: optional Statistics statistics; +} +); + +thrift_struct!( +pub(crate) struct PageHeader { + /// the type of the page: indicates which of the *_header fields is set + 1: required PageType type_ + + /// Uncompressed page size in bytes (not including this header) + 2: required i32 uncompressed_page_size + + /// Compressed (and potentially encrypted) page size in bytes, not including this header + 3: required i32 compressed_page_size + + /// The 32-bit CRC checksum for the page, to be be calculated as follows: + 4: optional i32 crc + + // Headers for page specific data. One only will be set. + 5: optional DataPageHeader data_page_header; + 6: optional IndexPageHeader index_page_header; + 7: optional DictionaryPageHeader dictionary_page_header; + 8: optional DataPageHeaderV2 data_page_header_v2; +} +); + thrift_struct!( pub(crate) struct AesGcmV1<'a> { /// AAD prefix @@ -226,7 +324,7 @@ struct SizeStatistics { ); thrift_struct!( -struct Statistics<'a> { +pub(crate) struct Statistics<'a> { 1: optional binary<'a> max; 2: optional binary<'a> min; 3: optional i64 null_count; @@ -358,7 +456,7 @@ fn convert_column( Ok(result) } -fn convert_stats( +pub(crate) fn convert_stats( physical_type: Type, thrift_stats: Option, ) -> Result> { diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index bbce3918b74c..eba279f47c0e 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -312,6 +312,7 @@ macro_rules! 
thrift_struct { #[allow(unused_assignments)] fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + #[allow(unused_mut, unused_variables)] let mut last_field_id = 0i16; $($crate::__thrift_write_required_or_optional_field!($required_or_optional $field_name, $field_id, $field_type, self, writer, last_field_id);)* writer.write_struct_end() From 79e8f85cd9b7a584f74f6639d272767acaeaeaf1 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 27 Aug 2025 08:47:50 -0700 Subject: [PATCH 35/46] totally rework the input side add ReadThrift trait and make ThriftCompactInputProtocol a trait --- parquet/src/basic.rs | 39 +- parquet/src/file/column_crypto_metadata.rs | 4 +- parquet/src/file/metadata/mod.rs | 4 +- parquet/src/file/metadata/reader.rs | 6 +- parquet/src/file/metadata/thrift_gen.rs | 25 +- parquet/src/file/page_encoding_stats.rs | 6 +- parquet/src/file/page_index/index_reader.rs | 14 +- parquet/src/file/page_index/offset_index.rs | 12 +- parquet/src/parquet_macros.rs | 46 +-- parquet/src/parquet_thrift.rs | 409 ++++++++++---------- 10 files changed, 279 insertions(+), 286 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index cf451b961f69..8cf6b5f85b8b 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -26,8 +26,8 @@ use std::{fmt, str}; pub use crate::compression::{BrotliLevel, GzipLevel, ZstdLevel}; use crate::parquet_thrift::{ - ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, - WriteThriftField, + ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, + WriteThrift, WriteThriftField, }; use crate::{thrift_enum, thrift_struct, thrift_union_all_empty}; @@ -165,9 +165,8 @@ pub enum ConvertedType { INTERVAL, } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ConvertedType { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ConvertedType { + fn read_thrift(prot: &mut R) -> Result { let val = prot.read_i32()?; Ok(match val { 0 => Self::UTF8, @@ -361,9 +360,8 @@ pub enum LogicalType { }, } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for LogicalType { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for LogicalType { + fn read_thrift(prot: &mut R) -> Result { prot.read_struct_begin()?; let field_ident = prot.read_field_begin()?; @@ -388,7 +386,7 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for LogicalType { Self::Enum } 5 => { - let val = DecimalType::try_from(&mut *prot)?; + let val = DecimalType::read_thrift(&mut *prot)?; Self::Decimal { scale: val.scale, precision: val.precision, @@ -399,21 +397,21 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for LogicalType { Self::Date } 7 => { - let val = TimeType::try_from(&mut *prot)?; + let val = TimeType::read_thrift(&mut *prot)?; Self::Time { is_adjusted_to_u_t_c: val.is_adjusted_to_u_t_c, unit: val.unit, } } 8 => { - let val = TimestampType::try_from(&mut *prot)?; + let val = TimestampType::read_thrift(&mut *prot)?; Self::Timestamp { is_adjusted_to_u_t_c: val.is_adjusted_to_u_t_c, unit: val.unit, } } 10 => { - let val = IntType::try_from(&mut *prot)?; + let val = IntType::read_thrift(&mut *prot)?; Self::Integer { is_signed: val.is_signed, bit_width: val.bit_width, @@ -440,19 +438,19 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for 
LogicalType { Self::Float16 } 16 => { - let val = VariantType::try_from(&mut *prot)?; + let val = VariantType::read_thrift(&mut *prot)?; Self::Variant { specification_version: val.specification_version, } } 17 => { - let val = GeometryType::try_from(&mut *prot)?; + let val = GeometryType::read_thrift(&mut *prot)?; Self::Geometry { crs: val.crs.map(|s| s.to_owned()), } } 18 => { - let val = GeographyType::try_from(&mut *prot)?; + let val = GeographyType::read_thrift(&mut *prot)?; Self::Geography { crs: val.crs.map(|s| s.to_owned()), algorithm: val.algorithm, @@ -756,9 +754,8 @@ pub enum Compression { LZ4_RAW, } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for Compression { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for Compression { + fn read_thrift(prot: &mut R) -> Result { let val = prot.read_i32()?; Ok(match val { 0 => Self::UNCOMPRESSED, @@ -1123,10 +1120,8 @@ impl ColumnOrder { } } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ColumnOrder { - type Error = ParquetError; - - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ColumnOrder { + fn read_thrift(prot: &mut R) -> Result { prot.read_struct_begin()?; let field_ident = prot.read_field_begin()?; if field_ident.field_type == FieldType::Stop { diff --git a/parquet/src/file/column_crypto_metadata.rs b/parquet/src/file/column_crypto_metadata.rs index 5bba07357947..6a538bd42bc0 100644 --- a/parquet/src/file/column_crypto_metadata.rs +++ b/parquet/src/file/column_crypto_metadata.rs @@ -26,8 +26,8 @@ use crate::format::{ EncryptionWithFooterKey as TEncryptionWithFooterKey, }; use crate::parquet_thrift::{ - ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, - WriteThriftField, + read_thrift_vec, ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, + ThriftCompactOutputProtocol, WriteThrift, WriteThriftField, }; use crate::{thrift_struct, thrift_union}; diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 0c4372e38683..6f3a842d0985 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -121,8 +121,8 @@ use crate::{ use crate::{ basic::{ColumnOrder, Compression, Encoding, Type}, parquet_thrift::{ - ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, - WriteThrift, WriteThriftField, + ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, + ThriftCompactOutputProtocol, WriteThrift, WriteThriftField, }, }; use crate::{ diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 7ab2db2f7ff3..f5661f6d0cf3 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -19,7 +19,7 @@ use std::{io::Read, ops::Range}; #[cfg(feature = "encryption")] use crate::encryption::decrypt::{CryptoContext, FileDecryptionProperties}; -use crate::parquet_thrift::ThriftCompactInputProtocol; +use crate::parquet_thrift::{ReadThrift, ThriftSliceInputProtocol}; use bytes::Bytes; use crate::errors::{ParquetError, Result}; @@ -962,8 +962,8 @@ impl ParquetMetaDataReader { /// /// [Parquet Spec]: https://github.com/apache/parquet-format#metadata pub fn decode_metadata(buf: &[u8]) -> Result { - let mut prot = ThriftCompactInputProtocol::new(buf); - ParquetMetaData::try_from(&mut prot) + let mut prot = 
ThriftSliceInputProtocol::new(buf); + ParquetMetaData::read_thrift(&mut prot) } } diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 630126ea8be8..b656bacc8c7d 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -35,8 +35,8 @@ use crate::{ statistics::ValueStatistics, }, parquet_thrift::{ - ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, - WriteThrift, WriteThriftField, + read_thrift_vec, ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, + ThriftCompactOutputProtocol, WriteThrift, WriteThriftField, }, schema::types::{parquet_schema_from_array, ColumnDescriptor, SchemaDescriptor}, thrift_struct, thrift_union, @@ -46,6 +46,7 @@ use crate::{ use crate::{ encryption::decrypt::{FileDecryptionProperties, FileDecryptor}, file::column_crypto_metadata::ColumnCryptoMetaData, + parquet_thrift::ThriftSliceInputProtocol, schema::types::SchemaDescPtr, }; @@ -141,6 +142,7 @@ pub(crate) struct DataPageHeaderV2 { ); thrift_struct!( +#[allow(dead_code)] pub(crate) struct PageHeader { /// the type of the page: indicates which of the *_header fields is set 1: required PageType type_ @@ -668,8 +670,8 @@ fn row_group_from_encrypted_thrift( ) })?; - let mut prot = ThriftCompactInputProtocol::new(decrypted_cc_buf.as_slice()); - let col_meta = ColumnMetaData::try_from(&mut prot)?; + let mut prot = ThriftSliceInputProtocol::new(decrypted_cc_buf.as_slice()); + let col_meta = ColumnMetaData::read_thrift(&mut prot)?; c.meta_data = Some(col_meta); columns.push(convert_column(c, d.clone())?); } else { @@ -698,14 +700,14 @@ pub(crate) fn parquet_metadata_with_encryption( encrypted_footer: bool, buf: &[u8], ) -> Result { - let mut prot = ThriftCompactInputProtocol::new(buf); + let mut prot = ThriftSliceInputProtocol::new(buf); let mut file_decryptor = None; let decrypted_fmd_buf; if encrypted_footer { if let Some(file_decryption_properties) = file_decryption_properties { let t_file_crypto_metadata: FileCryptoMetaData = - FileCryptoMetaData::try_from(&mut prot) + FileCryptoMetaData::read_thrift(&mut prot) .map_err(|e| general_err!("Could not parse crypto metadata: {}", e))?; let supply_aad_prefix = match &t_file_crypto_metadata.encryption_algorithm { EncryptionAlgorithm::AES_GCM_V1(algo) => algo.supply_aad_prefix, @@ -733,7 +735,7 @@ pub(crate) fn parquet_metadata_with_encryption( "Provided footer key and AAD were unable to decrypt parquet footer" ) })?; - prot = ThriftCompactInputProtocol::new(decrypted_fmd_buf.as_ref()); + prot = ThriftSliceInputProtocol::new(decrypted_fmd_buf.as_ref()); file_decryptor = Some(decryptor); } else { @@ -743,7 +745,7 @@ pub(crate) fn parquet_metadata_with_encryption( } } - let file_meta = super::thrift_gen::FileMetaData::try_from(&mut prot) + let file_meta = super::thrift_gen::FileMetaData::read_thrift(&mut prot) .map_err(|e| general_err!("Could not parse metadata: {}", e))?; let version = file_meta.version; @@ -851,10 +853,9 @@ pub(super) fn get_file_decryptor( /// Create ParquetMetaData from thrift input. Note that this only decodes the file metadata in /// the Parquet footer. Page indexes will need to be added later. 
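/// (Offset and column indexes are decoded separately; see `decode_offset_index`
/// and `decode_column_index` in `page_index::index_reader`.)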
-impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ParquetMetaData { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { - let file_meta = super::thrift_gen::FileMetaData::try_from(prot)?; +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ParquetMetaData { + fn read_thrift(prot: &mut R) -> Result { + let file_meta = super::thrift_gen::FileMetaData::read_thrift(prot)?; let version = file_meta.version; let num_rows = file_meta.num_rows; diff --git a/parquet/src/file/page_encoding_stats.rs b/parquet/src/file/page_encoding_stats.rs index 2d433dc9b3f1..934e177de0da 100644 --- a/parquet/src/file/page_encoding_stats.rs +++ b/parquet/src/file/page_encoding_stats.rs @@ -20,10 +20,10 @@ use std::io::Write; use crate::basic::{Encoding, PageType}; -use crate::errors::{ParquetError, Result}; +use crate::errors::Result; use crate::parquet_thrift::{ - ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, - WriteThriftField, + ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, + WriteThrift, WriteThriftField, }; use crate::thrift_struct; diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index e9cf119224c9..3db597954e6c 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -27,8 +27,8 @@ use crate::file::page_index::column_index::{ use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::reader::ChunkReader; use crate::parquet_thrift::{ - ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, - WriteThriftField, + read_thrift_vec, ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, + ThriftCompactOutputProtocol, ThriftSliceInputProtocol, WriteThrift, WriteThriftField, }; use crate::thrift_struct; use std::io::Write; @@ -136,15 +136,15 @@ pub fn read_offset_indexes( } pub(crate) fn decode_offset_index(data: &[u8]) -> Result { - let mut prot = ThriftCompactInputProtocol::new(data); + let mut prot = ThriftSliceInputProtocol::new(data); // Try to read fast-path first. If that fails, fall back to slower but more robust // decoder. 
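    // (The fast path makes the same assumptions documented on try_from_fast: every
    // field delta is 1 and no absolute field ids were written. If that does not
    // hold it returns an error, and the generic ReadThrift decoder below simply
    // re-parses the same byte slice.)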
match OffsetIndexMetaData::try_from_fast(&mut prot) { Ok(offset_index) => Ok(offset_index), Err(_) => { - prot = ThriftCompactInputProtocol::new(data); - OffsetIndexMetaData::try_from(&mut prot) + prot = ThriftSliceInputProtocol::new(data); + OffsetIndexMetaData::read_thrift(&mut prot) } } } @@ -166,8 +166,8 @@ pub(crate) fn decode_column_index( data: &[u8], column_type: Type, ) -> Result { - let mut prot = ThriftCompactInputProtocol::new(data); - let index = ThriftColumnIndex::try_from(&mut prot)?; + let mut prot = ThriftSliceInputProtocol::new(data); + let index = ThriftColumnIndex::read_thrift(&mut prot)?; let index = match column_type { Type::BOOLEAN => { diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs index ac2620af09d8..2153b8ed3009 100644 --- a/parquet/src/file/page_index/offset_index.rs +++ b/parquet/src/file/page_index/offset_index.rs @@ -22,8 +22,8 @@ use std::io::Write; use crate::parquet_thrift::{ - ElementType, FieldType, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, WriteThrift, - WriteThriftField, + read_thrift_vec, ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, + ThriftCompactOutputProtocol, WriteThrift, WriteThriftField, }; use crate::{ errors::{ParquetError, Result}, @@ -113,7 +113,9 @@ impl OffsetIndexMetaData { // Fast-path read of offset index. This works because we expect all field deltas to be 1, // and there's no nesting beyond PageLocation, so no need to save the last field id. Like // read_page_locations(), this will fail if absolute field id's are used. - pub(super) fn try_from_fast<'a>(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { + pub(super) fn try_from_fast<'a, R: ThriftCompactInputProtocol<'a>>( + prot: &mut R, + ) -> Result { // Offset index is a struct with 2 fields. First field is an array of PageLocations, // the second an optional array of i64. @@ -140,7 +142,7 @@ impl OffsetIndexMetaData { "encountered unknown field while reading OffsetIndex" )); } - let vec = Vec::::try_from(&mut *prot)?; + let vec = read_thrift_vec::(&mut *prot)?; unencoded_byte_array_data_bytes = Some(vec); // this one should be Stop @@ -164,7 +166,7 @@ impl OffsetIndexMetaData { // Note: this will fail if the fields are either out of order, or if a suboptimal // encoder doesn't use field deltas. -fn read_page_location<'a>(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +fn read_page_location<'a, R: ThriftCompactInputProtocol<'a>>(prot: &mut R) -> Result { // there are 3 fields, all mandatory, so all field deltas should be 1 let (field_type, delta) = prot.read_field_header()?; if delta != 1 || field_type != FieldType::I64 as u8 { diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index eba279f47c0e..60e2f452f4f2 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -33,10 +33,9 @@ macro_rules! thrift_enum { $($(#[cfg_attr(not(doctest), $($field_attrs)*)])* $field_name = $field_value,)* } - impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for $identifier { - type Error = ParquetError; + impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for $identifier { #[allow(deprecated)] - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { + fn read_thrift(prot: &mut R) -> Result { let val = prot.read_i32()?; match val { $($field_value => Ok(Self::$field_name),)* @@ -105,10 +104,8 @@ macro_rules! 
thrift_union_all_empty { $($(#[cfg_attr(not(doctest), $($field_attrs)*)])* $field_name),* } - impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for $identifier { - type Error = ParquetError; - - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { + impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for $identifier { + fn read_thrift(prot: &mut R) -> Result { prot.read_struct_begin()?; let field_ident = prot.read_field_begin()?; if field_ident.field_type == FieldType::Stop { @@ -191,10 +188,8 @@ macro_rules! thrift_union { $($(#[cfg_attr(not(doctest), $($field_attrs)*)])* $field_name $( ( $crate::__thrift_union_type!{$field_type $($field_lt)? $($element_type)?} ) )?),* } - impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for $identifier $(<$lt>)? { - type Error = ParquetError; - - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { + impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for $identifier $(<$lt>)? { + fn read_thrift(prot: &mut R) -> Result { prot.read_struct_begin()?; let field_ident = prot.read_field_begin()?; if field_ident.field_type == FieldType::Stop { @@ -279,9 +274,8 @@ macro_rules! thrift_struct { $($(#[cfg_attr(not(doctest), $($field_attrs)*)])* $vis $field_name: $crate::__thrift_required_or_optional!($required_or_optional $crate::__thrift_field_type!($field_type $($field_lt)? $($element_type)?))),* } - impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for $identifier $(<$lt>)? { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { + impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for $identifier $(<$lt>)? { + fn read_thrift(prot: &mut R) -> Result { $(let mut $field_name: Option<$crate::__thrift_field_type!($field_type $($field_lt)? $($element_type)?)> = None;)* prot.read_struct_begin()?; loop { @@ -414,38 +408,38 @@ macro_rules! __thrift_result_required_or_optional { #[macro_export] macro_rules! __thrift_read_field { ($prot:tt, list $lt:lifetime binary) => { - Vec::<&'a [u8]>::try_from(&mut *$prot)? + read_thrift_vec::<&'a [u8], R>(&mut *$prot)? }; ($prot:tt, list $lt:lifetime $element_type:ident) => { - Vec::<$element_type>::try_from(&mut *$prot)? + read_thrift_vec::<$element_type, R>(&mut *$prot)? }; ($prot:tt, list string) => { - Vec::::try_from(&mut *$prot)? + read_thrift_vec::(&mut *$prot)? }; ($prot:tt, list $element_type:ident) => { - Vec::<$element_type>::try_from(&mut *$prot)? + read_thrift_vec::<$element_type, R>(&mut *$prot)? }; ($prot:tt, string $lt:lifetime) => { - <&$lt str>::try_from(&mut *$prot)? + <&$lt str>::read_thrift(&mut *$prot)? }; ($prot:tt, binary $lt:lifetime) => { - <&$lt [u8]>::try_from(&mut *$prot)? + <&$lt [u8]>::read_thrift(&mut *$prot)? }; ($prot:tt, $field_type:ident $lt:lifetime) => { - $field_type::try_from(&mut *$prot)? + $field_type::read_thrift(&mut *$prot)? }; ($prot:tt, string) => { - String::try_from(&mut *$prot)? + String::read_thrift(&mut *$prot)? }; ($prot:tt, binary) => { // this one needs to not conflict with `list` $prot.read_bytes()?.to_vec() }; ($prot:tt, double) => { - $crate::parquet_thrift::OrderedF64::try_from(&mut *$prot)? + $crate::parquet_thrift::OrderedF64::read_thrift(&mut *$prot)? }; ($prot:tt, $field_type:ident) => { - $field_type::try_from(&mut *$prot)? + $field_type::read_thrift(&mut *$prot)? }; } @@ -478,10 +472,10 @@ macro_rules! __thrift_union_type { #[macro_export] macro_rules! 
__thrift_read_variant { ($prot:tt, $field_name:ident $field_type:ident) => { - Self::$field_name($field_type::try_from(&mut *$prot)?) + Self::$field_name($field_type::read_thrift(&mut *$prot)?) }; ($prot:tt, $field_name:ident list $field_type:ident) => { - Self::$field_name(Vec::<$field_type>::try_from(&mut *$prot)?) + Self::$field_name(Vec::<$field_type>::read_thrift(&mut *$prot)?) }; ($prot:tt, $field_name:ident) => {{ $prot.skip_empty_struct()?; diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index ac5d72ecdd69..29e209e2f21f 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -167,41 +167,12 @@ pub(crate) struct ListIdentifier { pub(crate) size: i32, } -/// A more performant implementation of [`TCompactInputProtocol`] that reads a slice -/// -/// [`TCompactInputProtocol`]: thrift::protocol::TCompactInputProtocol -pub(crate) struct ThriftCompactInputProtocol<'a> { - buf: &'a [u8], - // Identifier of the last field deserialized for a struct. - last_read_field_id: i16, - // Stack of the last read field ids (a new entry is added each time a nested struct is read). - read_field_id_stack: Vec, - // Boolean value for a field. - // Saved because boolean fields and their value are encoded in a single byte, - // and reading the field only occurs after the field id is read. - pending_read_bool_value: Option, -} +pub(crate) trait ThriftCompactInputProtocol<'a> { + fn read_byte(&mut self) -> Result; -impl<'b, 'a: 'b> ThriftCompactInputProtocol<'a> { - pub fn new(buf: &'a [u8]) -> Self { - Self { - buf, - last_read_field_id: 0, - read_field_id_stack: Vec::with_capacity(16), - pending_read_bool_value: None, - } - } + fn read_bytes(&mut self) -> Result<&'a [u8]>; - pub fn reset_buffer(&mut self, buf: &'a [u8]) { - self.buf = buf; - self.last_read_field_id = 0; - self.read_field_id_stack.clear(); - self.pending_read_bool_value = None; - } - - pub fn as_slice(&self) -> &'a [u8] { - self.buf - } + fn skip_bytes(&mut self, n: usize) -> Result<()>; fn read_vlq(&mut self) -> Result { let mut in_progress = 0; @@ -221,7 +192,7 @@ impl<'b, 'a: 'b> ThriftCompactInputProtocol<'a> { Ok((val >> 1) as i64 ^ -((val & 1) as i64)) } - fn read_list_set_begin(&mut self) -> Result<(ElementType, i32)> { + fn read_list_begin(&mut self) -> Result { let header = self.read_byte()?; let element_type = ElementType::try_from(header & 0x0f)?; @@ -233,22 +204,17 @@ impl<'b, 'a: 'b> ThriftCompactInputProtocol<'a> { self.read_vlq()? as _ }; - Ok((element_type, element_count)) + Ok(ListIdentifier { + element_type, + size: element_count, + }) } - pub(crate) fn read_struct_begin(&mut self) -> Result<()> { - self.read_field_id_stack.push(self.last_read_field_id); - self.last_read_field_id = 0; - Ok(()) - } + fn read_struct_begin(&mut self) -> Result<()>; - pub(crate) fn read_struct_end(&mut self) -> Result<()> { - self.last_read_field_id = self - .read_field_id_stack - .pop() - .expect("should have previous field ids"); - Ok(()) - } + fn read_struct_end(&mut self) -> Result<()>; + + fn read_field_begin(&mut self) -> Result; // This is a specialized version of read_field_begin, solely for use in parsing // PageLocation structs in the offset index. This function assumes that the delta @@ -256,138 +222,37 @@ impl<'b, 'a: 'b> ThriftCompactInputProtocol<'a> { // will be read. This also skips validation of the field type. 
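// A minimal standalone sketch of the header-byte split this fast path relies on: the
// compact protocol packs the field-id delta into the high nibble and the field type into
// the low nibble, so in-order fields with small ids all decode to a delta of 1. The helper
// name is illustrative only (not the crate's API); type codes 5 (i32) and 6 (i64) come
// from the thrift compact protocol spec.
fn split_field_header(header: u8) -> (u8, u8) {
    let field_type = header & 0x0f;
    let field_delta = header >> 4;
    (field_type, field_delta)
}

fn main() {
    // an i64 field that immediately follows the previous field id: delta 1, type 6
    assert_eq!(split_field_header(0x16), (6, 1));
    // an i32 field in the same position: delta 1, type 5
    assert_eq!(split_field_header(0x15), (5, 1));
    // a stop byte decodes to type 0, delta 0
    assert_eq!(split_field_header(0x00), (0, 0));
}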
// // Returns a tuple of (field_type, field_delta) - pub(crate) fn read_field_header(&mut self) -> Result<(u8, u8)> { + fn read_field_header(&mut self) -> Result<(u8, u8)> { let field_type = self.read_byte()?; let field_delta = (field_type & 0xf0) >> 4; let field_type = field_type & 0xf; Ok((field_type, field_delta)) } - pub(crate) fn read_field_begin(&mut self) -> Result { - // we can read at least one byte, which is: - // - the type - // - the field delta and the type - let field_type = self.read_byte()?; - let field_delta = (field_type & 0xf0) >> 4; - let field_type = FieldType::try_from(field_type & 0xf)?; + fn read_bool(&mut self) -> Result; - match field_type { - FieldType::Stop => Ok(FieldIdentifier { - field_type: FieldType::Stop, - id: 0, - }), - _ => { - // special handling for bools - if field_type == FieldType::BooleanFalse { - self.pending_read_bool_value = Some(false); - } else if field_type == FieldType::BooleanTrue { - self.pending_read_bool_value = Some(true); - } - if field_delta != 0 { - self.last_read_field_id = self - .last_read_field_id - .checked_add(field_delta as i16) - .map_or_else( - || { - Err(general_err!(format!( - "cannot add {} to {}", - field_delta, self.last_read_field_id - ))) - }, - Ok, - )?; - } else { - self.last_read_field_id = self.read_i16()?; - }; - - Ok(FieldIdentifier { - field_type, - id: self.last_read_field_id, - }) - } - } - } - - pub(crate) fn read_bool(&mut self) -> Result { - match self.pending_read_bool_value.take() { - Some(b) => Ok(b), - None => { - let b = self.read_byte()?; - // Previous versions of the thrift specification said to use 0 and 1 inside collections, - // but that differed from existing implementations. - // The specification was updated in https://github.com/apache/thrift/commit/2c29c5665bc442e703480bb0ee60fe925ffe02e8. - // At least the go implementation seems to have followed the previously documented values. - match b { - 0x01 => Ok(true), - 0x00 | 0x02 => Ok(false), - unkn => Err(general_err!(format!("cannot convert {unkn} into bool"))), - } - } - } - } - - pub(crate) fn read_bytes(&mut self) -> Result<&'b [u8]> { - let len = self.read_vlq()? as usize; - let ret = self.buf.get(..len).ok_or_else(eof_error)?; - self.buf = &self.buf[len..]; - Ok(ret) - } - - pub(crate) fn read_string(&mut self) -> Result<&'b str> { + fn read_string(&mut self) -> Result<&'a str> { let slice = self.read_bytes()?; Ok(std::str::from_utf8(slice)?) } - pub(crate) fn read_i8(&mut self) -> Result { + fn read_i8(&mut self) -> Result { Ok(self.read_byte()? as _) } - pub(crate) fn read_i16(&mut self) -> Result { + fn read_i16(&mut self) -> Result { Ok(self.read_zig_zag()? as _) } - pub(crate) fn read_i32(&mut self) -> Result { + fn read_i32(&mut self) -> Result { Ok(self.read_zig_zag()? 
as _) } - pub(crate) fn read_i64(&mut self) -> Result { + fn read_i64(&mut self) -> Result { self.read_zig_zag() } - pub(crate) fn read_double(&mut self) -> Result { - let slice = self.buf.get(..8).ok_or_else(eof_error)?; - self.buf = &self.buf[8..]; - match slice.try_into() { - Ok(slice) => Ok(f64::from_le_bytes(slice)), - Err(_) => Err(general_err!("Unexpected error converting slice")), - } - } - - pub(crate) fn read_list_begin(&mut self) -> Result { - let (element_type, element_count) = self.read_list_set_begin()?; - Ok(ListIdentifier { - element_type, - size: element_count, - }) - } - - pub(crate) fn read_list_end(&mut self) -> Result<()> { - Ok(()) - } - - #[inline] - fn read_byte(&mut self) -> Result { - let ret = *self.buf.first().ok_or_else(eof_error)?; - self.buf = &self.buf[1..]; - Ok(ret) - } - - #[inline] - fn skip_bytes(&mut self, n: usize) -> Result<()> { - self.buf.get(..n).ok_or_else(eof_error)?; - self.buf = &self.buf[n..]; - Ok(()) - } + fn read_double(&mut self) -> Result; fn skip_vlq(&mut self) -> Result<()> { loop { @@ -405,14 +270,14 @@ impl<'b, 'a: 'b> ThriftCompactInputProtocol<'a> { /// Skip a field with type `field_type` recursively until the default /// maximum skip depth is reached. - pub(crate) fn skip(&mut self, field_type: FieldType) -> Result<()> { + fn skip(&mut self, field_type: FieldType) -> Result<()> { // TODO: magic number self.skip_till_depth(field_type, 64) } /// Empty structs in unions consist of a single byte of 0 for the field stop record. /// This skips that byte without pushing to the field id stack. - pub(crate) fn skip_empty_struct(&mut self) -> Result<()> { + fn skip_empty_struct(&mut self) -> Result<()> { let b = self.read_byte()?; if b != 0 { Err(general_err!("Empty struct has fields")) @@ -452,7 +317,7 @@ impl<'b, 'a: 'b> ThriftCompactInputProtocol<'a> { let element_type = FieldType::try_from(list_ident.element_type)?; self.skip_till_depth(element_type, depth - 1)?; } - self.read_list_end() + Ok(()) } // no list or map types in parquet format u => Err(general_err!(format!("cannot skip field type {:?}", &u))), @@ -460,90 +325,226 @@ impl<'b, 'a: 'b> ThriftCompactInputProtocol<'a> { } } +pub(crate) struct ThriftSliceInputProtocol<'a> { + buf: &'a [u8], + // Identifier of the last field deserialized for a struct. + last_read_field_id: i16, + // Stack of the last read field ids (a new entry is added each time a nested struct is read). + read_field_id_stack: Vec, + // Boolean value for a field. + // Saved because boolean fields and their value are encoded in a single byte, + // and reading the field only occurs after the field id is read. + pending_read_bool_value: Option, +} + +impl<'a> ThriftSliceInputProtocol<'a> { + pub fn new(buf: &'a [u8]) -> Self { + Self { + buf, + last_read_field_id: 0, + read_field_id_stack: Vec::with_capacity(16), + pending_read_bool_value: None, + } + } + + pub fn reset_buffer(&mut self, buf: &'a [u8]) { + self.buf = buf; + self.last_read_field_id = 0; + self.read_field_id_stack.clear(); + self.pending_read_bool_value = None; + } + + pub fn as_slice(&self) -> &'a [u8] { + self.buf + } +} + +impl<'b, 'a: 'b> ThriftCompactInputProtocol<'b> for ThriftSliceInputProtocol<'a> { + #[inline] + fn read_byte(&mut self) -> Result { + let ret = *self.buf.first().ok_or_else(eof_error)?; + self.buf = &self.buf[1..]; + Ok(ret) + } + + fn read_bytes(&mut self) -> Result<&'b [u8]> { + let len = self.read_vlq()? 
as usize; + let ret = self.buf.get(..len).ok_or_else(eof_error)?; + self.buf = &self.buf[len..]; + Ok(ret) + } + + #[inline] + fn skip_bytes(&mut self, n: usize) -> Result<()> { + self.buf.get(..n).ok_or_else(eof_error)?; + self.buf = &self.buf[n..]; + Ok(()) + } + + fn read_double(&mut self) -> Result { + let slice = self.buf.get(..8).ok_or_else(eof_error)?; + self.buf = &self.buf[8..]; + match slice.try_into() { + Ok(slice) => Ok(f64::from_le_bytes(slice)), + Err(_) => Err(general_err!("Unexpected error converting slice")), + } + } + + fn read_struct_begin(&mut self) -> Result<()> { + self.read_field_id_stack.push(self.last_read_field_id); + self.last_read_field_id = 0; + Ok(()) + } + + fn read_struct_end(&mut self) -> Result<()> { + self.last_read_field_id = self + .read_field_id_stack + .pop() + .expect("should have previous field ids"); + Ok(()) + } + + fn read_field_begin(&mut self) -> Result { + // we can read at least one byte, which is: + // - the type + // - the field delta and the type + let field_type = self.read_byte()?; + let field_delta = (field_type & 0xf0) >> 4; + let field_type = FieldType::try_from(field_type & 0xf)?; + + match field_type { + FieldType::Stop => Ok(FieldIdentifier { + field_type: FieldType::Stop, + id: 0, + }), + _ => { + // special handling for bools + if field_type == FieldType::BooleanFalse { + self.pending_read_bool_value = Some(false); + } else if field_type == FieldType::BooleanTrue { + self.pending_read_bool_value = Some(true); + } + if field_delta != 0 { + self.last_read_field_id = self + .last_read_field_id + .checked_add(field_delta as i16) + .map_or_else( + || { + Err(general_err!(format!( + "cannot add {} to {}", + field_delta, self.last_read_field_id + ))) + }, + Ok, + )?; + } else { + self.last_read_field_id = self.read_i16()?; + }; + + Ok(FieldIdentifier { + field_type, + id: self.last_read_field_id, + }) + } + } + } + + fn read_bool(&mut self) -> Result { + match self.pending_read_bool_value.take() { + Some(b) => Ok(b), + None => { + let b = self.read_byte()?; + // Previous versions of the thrift specification said to use 0 and 1 inside collections, + // but that differed from existing implementations. + // The specification was updated in https://github.com/apache/thrift/commit/2c29c5665bc442e703480bb0ee60fe925ffe02e8. + // At least the go implementation seems to have followed the previously documented values. 
+ match b { + 0x01 => Ok(true), + 0x00 | 0x02 => Ok(false), + unkn => Err(general_err!(format!("cannot convert {unkn} into bool"))), + } + } + } + } +} + fn eof_error() -> ParquetError { eof_err!("Unexpected EOF") } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for bool { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +pub(crate) trait ReadThrift<'a, R: ThriftCompactInputProtocol<'a>> { + // used to read generated enums and structs + fn read_thrift(prot: &mut R) -> Result + where + Self: Sized; +} + +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for bool { + fn read_thrift(prot: &mut R) -> Result { prot.read_bool() } } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for i8 { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for i8 { + fn read_thrift(prot: &mut R) -> Result { prot.read_i8() } } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for i16 { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for i16 { + fn read_thrift(prot: &mut R) -> Result { prot.read_i16() } } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for i32 { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for i32 { + fn read_thrift(prot: &mut R) -> Result { prot.read_i32() } } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for i64 { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for i64 { + fn read_thrift(prot: &mut R) -> Result { prot.read_i64() } } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for OrderedF64 { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for OrderedF64 { + fn read_thrift(prot: &mut R) -> Result { Ok(OrderedF64(prot.read_double()?)) } } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for &'a str { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for &'a str { + fn read_thrift(prot: &mut R) -> Result { prot.read_string() } } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for String { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for String { + fn read_thrift(prot: &mut R) -> Result { Ok(prot.read_string()?.to_owned()) } } -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for &'a [u8] { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result { +impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for &'a [u8] { + fn read_thrift(prot: &mut R) -> Result { prot.read_bytes() } } -impl<'a, T> TryFrom<&mut ThriftCompactInputProtocol<'a>> for Vec +pub(crate) fn read_thrift_vec<'a, T, R>(prot: &mut R) -> Result> where - T: for<'b> TryFrom<&'b mut ThriftCompactInputProtocol<'a>>, - ParquetError: for<'b> From<>>::Error>, + R: ThriftCompactInputProtocol<'a>, + T: ReadThrift<'a, R>, { - type Error = ParquetError; - - fn try_from(prot: &mut 
ThriftCompactInputProtocol<'a>) -> Result { - let list_ident = prot.read_list_begin()?; - let mut res = Vec::with_capacity(list_ident.size as usize); - for _ in 0..list_ident.size { - let val = T::try_from(prot)?; - res.push(val); - } - - Ok(res) + let list_ident = prot.read_list_begin()?; + let mut res = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let val = T::read_thrift(prot)?; + res.push(val); } + Ok(res) } ///////////////////////// @@ -900,11 +901,11 @@ pub(crate) mod tests { pub(crate) fn test_roundtrip(val: T) where - T: for<'a> TryFrom<&'a mut ThriftCompactInputProtocol<'a>> + T: for<'a> ReadThrift<'a, ThriftSliceInputProtocol<'a>> + WriteThrift> + PartialEq + Debug, - for<'a> >>::Error: Debug, + //for<'a> >>::Error: Debug, { let buf = Vec::::new(); let mut writer = ThriftCompactOutputProtocol::new(buf); @@ -912,8 +913,8 @@ pub(crate) mod tests { //println!("serialized: {:x?}", writer.inner()); - let mut prot = ThriftCompactInputProtocol::new(writer.inner()); - let read_val = T::try_from(&mut prot).unwrap(); + let mut prot = ThriftSliceInputProtocol::new(writer.inner()); + let read_val = T::read_thrift(&mut prot).unwrap(); assert_eq!(val, read_val); } From b31c9e69c0dcb75cf45a1ec4491bdf0808461293 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 27 Aug 2025 11:54:37 -0700 Subject: [PATCH 36/46] rework struct field reading --- parquet/src/basic.rs | 13 +-- parquet/src/parquet_macros.rs | 45 +++++----- parquet/src/parquet_thrift.rs | 162 +++++++++++++--------------------- 3 files changed, 87 insertions(+), 133 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 8cf6b5f85b8b..4aeca93cfbde 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -362,9 +362,7 @@ pub enum LogicalType { impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for LogicalType { fn read_thrift(prot: &mut R) -> Result { - prot.read_struct_begin()?; - - let field_ident = prot.read_field_begin()?; + let field_ident = prot.read_field_begin(0)?; if field_ident.field_type == FieldType::Stop { return Err(general_err!("received empty union from remote LogicalType")); } @@ -463,13 +461,12 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for LogicalType { } } }; - let field_ident = prot.read_field_begin()?; + let field_ident = prot.read_field_begin(field_ident.id)?; if field_ident.field_type != FieldType::Stop { return Err(general_err!( "Received multiple fields for union from remote LogicalType" )); } - prot.read_struct_end()?; Ok(ret) } } @@ -1122,8 +1119,7 @@ impl ColumnOrder { impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ColumnOrder { fn read_thrift(prot: &mut R) -> Result { - prot.read_struct_begin()?; - let field_ident = prot.read_field_begin()?; + let field_ident = prot.read_field_begin(0)?; if field_ident.field_type == FieldType::Stop { return Err(general_err!("Received empty union from remote ColumnOrder")); } @@ -1138,13 +1134,12 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ColumnOrder { Self::UNKNOWN } }; - let field_ident = prot.read_field_begin()?; + let field_ident = prot.read_field_begin(field_ident.id)?; if field_ident.field_type != FieldType::Stop { return Err(general_err!( "Received multiple fields for union from remote ColumnOrder" )); } - prot.read_struct_end()?; Ok(ret) } } diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index 60e2f452f4f2..3941d84c0dda 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ 
-106,8 +106,7 @@ macro_rules! thrift_union_all_empty { impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for $identifier { fn read_thrift(prot: &mut R) -> Result { - prot.read_struct_begin()?; - let field_ident = prot.read_field_begin()?; + let field_ident = prot.read_field_begin(0)?; if field_ident.field_type == FieldType::Stop { return Err(general_err!("Received empty union from remote {}", stringify!($identifier))); } @@ -121,13 +120,12 @@ macro_rules! thrift_union_all_empty { return Err(general_err!("Unexpected {} {}", stringify!($identifier), field_ident.id)); } }; - let field_ident = prot.read_field_begin()?; + let field_ident = prot.read_field_begin(field_ident.id)?; if field_ident.field_type != FieldType::Stop { return Err(general_err!( "Received multiple fields for union from remote {}", stringify!($identifier) )); } - prot.read_struct_end()?; Ok(ret) } } @@ -190,8 +188,7 @@ macro_rules! thrift_union { impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for $identifier $(<$lt>)? { fn read_thrift(prot: &mut R) -> Result { - prot.read_struct_begin()?; - let field_ident = prot.read_field_begin()?; + let field_ident = prot.read_field_begin(0)?; if field_ident.field_type == FieldType::Stop { return Err(general_err!("Received empty union from remote {}", stringify!($identifier))); } @@ -204,13 +201,12 @@ macro_rules! thrift_union { return Err(general_err!("Unexpected {} {}", stringify!($identifier), field_ident.id)); } }; - let field_ident = prot.read_field_begin()?; + let field_ident = prot.read_field_begin(field_ident.id)?; if field_ident.field_type != FieldType::Stop { return Err(general_err!( concat!("Received multiple fields for union from remote {}", stringify!($identifier)) )); } - prot.read_struct_end()?; Ok(ret) } } @@ -277,23 +273,23 @@ macro_rules! thrift_struct { impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for $identifier $(<$lt>)? { fn read_thrift(prot: &mut R) -> Result { $(let mut $field_name: Option<$crate::__thrift_field_type!($field_type $($field_lt)? $($element_type)?)> = None;)* - prot.read_struct_begin()?; + let mut last_field_id = 0i16; loop { - let field_ident = prot.read_field_begin()?; + let field_ident = prot.read_field_begin(last_field_id)?; if field_ident.field_type == FieldType::Stop { break; } match field_ident.id { $($field_id => { - let val = $crate::__thrift_read_field!(prot, $field_type $($field_lt)? $($element_type)?); + let val = $crate::__thrift_read_field!(prot, field_ident, $field_type $($field_lt)? $($element_type)?); $field_name = Some(val); })* _ => { prot.skip(field_ident.field_type)?; } }; + last_field_id = field_ident.id; } - prot.read_struct_end()?; $($crate::__thrift_result_required_or_optional!($required_or_optional $field_name);)* Ok(Self { $($field_name),* @@ -407,38 +403,41 @@ macro_rules! __thrift_result_required_or_optional { #[doc(hidden)] #[macro_export] macro_rules! __thrift_read_field { - ($prot:tt, list $lt:lifetime binary) => { + ($prot:tt, $field_ident:tt, list $lt:lifetime binary) => { read_thrift_vec::<&'a [u8], R>(&mut *$prot)? }; - ($prot:tt, list $lt:lifetime $element_type:ident) => { + ($prot:tt, $field_ident:tt, list $lt:lifetime $element_type:ident) => { read_thrift_vec::<$element_type, R>(&mut *$prot)? }; - ($prot:tt, list string) => { + ($prot:tt, $field_ident:tt, list string) => { read_thrift_vec::(&mut *$prot)? }; - ($prot:tt, list $element_type:ident) => { + ($prot:tt, $field_ident:tt, list $element_type:ident) => { read_thrift_vec::<$element_type, R>(&mut *$prot)? 
}; - ($prot:tt, string $lt:lifetime) => { + ($prot:tt, $field_ident:tt, string $lt:lifetime) => { <&$lt str>::read_thrift(&mut *$prot)? }; - ($prot:tt, binary $lt:lifetime) => { + ($prot:tt, $field_ident:tt, binary $lt:lifetime) => { <&$lt [u8]>::read_thrift(&mut *$prot)? }; - ($prot:tt, $field_type:ident $lt:lifetime) => { + ($prot:tt, $field_ident:tt, $field_type:ident $lt:lifetime) => { $field_type::read_thrift(&mut *$prot)? }; - ($prot:tt, string) => { + ($prot:tt, $field_ident:tt, string) => { String::read_thrift(&mut *$prot)? }; - ($prot:tt, binary) => { + ($prot:tt, $field_ident:tt, binary) => { // this one needs to not conflict with `list` $prot.read_bytes()?.to_vec() }; - ($prot:tt, double) => { + ($prot:tt, $field_ident:tt, double) => { $crate::parquet_thrift::OrderedF64::read_thrift(&mut *$prot)? }; - ($prot:tt, $field_type:ident) => { + ($prot:tt, $field_ident:tt, bool) => { + $field_ident.bool_val.unwrap() + }; + ($prot:tt, $field_ident:tt, $field_type:ident) => { $field_type::read_thrift(&mut *$prot)? }; } diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index 29e209e2f21f..b38f6780183f 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -159,6 +159,7 @@ impl TryFrom for ElementType { pub(crate) struct FieldIdentifier { pub(crate) field_type: FieldType, pub(crate) id: i16, + pub(crate) bool_val: Option, } #[derive(Clone, Debug, Eq, PartialEq)] @@ -210,11 +211,50 @@ pub(crate) trait ThriftCompactInputProtocol<'a> { }) } - fn read_struct_begin(&mut self) -> Result<()>; + fn read_field_begin(&mut self, last_field_id: i16) -> Result { + // we can read at least one byte, which is: + // - the type + // - the field delta and the type + let field_type = self.read_byte()?; + let field_delta = (field_type & 0xf0) >> 4; + let field_type = FieldType::try_from(field_type & 0xf)?; + let mut bool_val: Option = None; - fn read_struct_end(&mut self) -> Result<()>; + match field_type { + FieldType::Stop => Ok(FieldIdentifier { + field_type: FieldType::Stop, + id: 0, + bool_val, + }), + _ => { + // special handling for bools + if field_type == FieldType::BooleanFalse { + bool_val = Some(false); + } else if field_type == FieldType::BooleanTrue { + bool_val = Some(true); + } + let field_id = if field_delta != 0 { + last_field_id.checked_add(field_delta as i16).map_or_else( + || { + Err(general_err!(format!( + "cannot add {} to {}", + field_delta, last_field_id + ))) + }, + Ok, + )? + } else { + self.read_i16()? + }; - fn read_field_begin(&mut self) -> Result; + Ok(FieldIdentifier { + field_type, + id: field_id, + bool_val, + }) + } + } + } // This is a specialized version of read_field_begin, solely for use in parsing // PageLocation structs in the offset index. This function assumes that the delta @@ -229,7 +269,19 @@ pub(crate) trait ThriftCompactInputProtocol<'a> { Ok((field_type, field_delta)) } - fn read_bool(&mut self) -> Result; + // not to be used for bool struct fields, just for bool arrays + fn read_bool(&mut self) -> Result { + let b = self.read_byte()?; + // Previous versions of the thrift specification said to use 0 and 1 inside collections, + // but that differed from existing implementations. + // The specification was updated in https://github.com/apache/thrift/commit/2c29c5665bc442e703480bb0ee60fe925ffe02e8. + // At least the go implementation seems to have followed the previously documented values. 
+ match b { + 0x01 => Ok(true), + 0x00 | 0x02 => Ok(false), + unkn => Err(general_err!(format!("cannot convert {unkn} into bool"))), + } + } fn read_string(&mut self) -> Result<&'a str> { let slice = self.read_bytes()?; @@ -301,15 +353,16 @@ pub(crate) trait ThriftCompactInputProtocol<'a> { FieldType::Double => self.skip_bytes(8).map(|_| ()), FieldType::Binary => self.skip_binary().map(|_| ()), FieldType::Struct => { - self.read_struct_begin()?; + let mut last_field_id = 0i16; loop { - let field_ident = self.read_field_begin()?; + let field_ident = self.read_field_begin(last_field_id)?; if field_ident.field_type == FieldType::Stop { break; } self.skip_till_depth(field_ident.field_type, depth - 1)?; + last_field_id = field_ident.id; } - self.read_struct_end() + Ok(()) } FieldType::List => { let list_ident = self.read_list_begin()?; @@ -327,31 +380,15 @@ pub(crate) trait ThriftCompactInputProtocol<'a> { pub(crate) struct ThriftSliceInputProtocol<'a> { buf: &'a [u8], - // Identifier of the last field deserialized for a struct. - last_read_field_id: i16, - // Stack of the last read field ids (a new entry is added each time a nested struct is read). - read_field_id_stack: Vec, - // Boolean value for a field. - // Saved because boolean fields and their value are encoded in a single byte, - // and reading the field only occurs after the field id is read. - pending_read_bool_value: Option, } impl<'a> ThriftSliceInputProtocol<'a> { pub fn new(buf: &'a [u8]) -> Self { - Self { - buf, - last_read_field_id: 0, - read_field_id_stack: Vec::with_capacity(16), - pending_read_bool_value: None, - } + Self { buf } } pub fn reset_buffer(&mut self, buf: &'a [u8]) { self.buf = buf; - self.last_read_field_id = 0; - self.read_field_id_stack.clear(); - self.pending_read_bool_value = None; } pub fn as_slice(&self) -> &'a [u8] { @@ -389,83 +426,6 @@ impl<'b, 'a: 'b> ThriftCompactInputProtocol<'b> for ThriftSliceInputProtocol<'a> Err(_) => Err(general_err!("Unexpected error converting slice")), } } - - fn read_struct_begin(&mut self) -> Result<()> { - self.read_field_id_stack.push(self.last_read_field_id); - self.last_read_field_id = 0; - Ok(()) - } - - fn read_struct_end(&mut self) -> Result<()> { - self.last_read_field_id = self - .read_field_id_stack - .pop() - .expect("should have previous field ids"); - Ok(()) - } - - fn read_field_begin(&mut self) -> Result { - // we can read at least one byte, which is: - // - the type - // - the field delta and the type - let field_type = self.read_byte()?; - let field_delta = (field_type & 0xf0) >> 4; - let field_type = FieldType::try_from(field_type & 0xf)?; - - match field_type { - FieldType::Stop => Ok(FieldIdentifier { - field_type: FieldType::Stop, - id: 0, - }), - _ => { - // special handling for bools - if field_type == FieldType::BooleanFalse { - self.pending_read_bool_value = Some(false); - } else if field_type == FieldType::BooleanTrue { - self.pending_read_bool_value = Some(true); - } - if field_delta != 0 { - self.last_read_field_id = self - .last_read_field_id - .checked_add(field_delta as i16) - .map_or_else( - || { - Err(general_err!(format!( - "cannot add {} to {}", - field_delta, self.last_read_field_id - ))) - }, - Ok, - )?; - } else { - self.last_read_field_id = self.read_i16()?; - }; - - Ok(FieldIdentifier { - field_type, - id: self.last_read_field_id, - }) - } - } - } - - fn read_bool(&mut self) -> Result { - match self.pending_read_bool_value.take() { - Some(b) => Ok(b), - None => { - let b = self.read_byte()?; - // Previous versions of the thrift 
specification said to use 0 and 1 inside collections, - // but that differed from existing implementations. - // The specification was updated in https://github.com/apache/thrift/commit/2c29c5665bc442e703480bb0ee60fe925ffe02e8. - // At least the go implementation seems to have followed the previously documented values. - match b { - 0x01 => Ok(true), - 0x00 | 0x02 => Ok(false), - unkn => Err(general_err!(format!("cannot convert {unkn} into bool"))), - } - } - } - } } fn eof_error() -> ParquetError { From 8c4e49df507034c3ccb79d6d519b344a8c5ced27 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 27 Aug 2025 12:26:46 -0700 Subject: [PATCH 37/46] fix skipping bool fields --- parquet/src/parquet_thrift.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index b38f6780183f..29cf4a3d7c88 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -345,7 +345,8 @@ pub(crate) trait ThriftCompactInputProtocol<'a> { } match field_type { - FieldType::BooleanFalse | FieldType::BooleanTrue => self.read_bool().map(|_| ()), + // boolean field has no data + FieldType::BooleanFalse | FieldType::BooleanTrue => Ok(()), FieldType::Byte => self.read_i8().map(|_| ()), FieldType::I16 => self.skip_vlq().map(|_| ()), FieldType::I32 => self.skip_vlq().map(|_| ()), From e0e18529c7d6fce96f9a95f01a16dda2c6b18092 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 27 Aug 2025 12:39:36 -0700 Subject: [PATCH 38/46] remove cruft --- parquet/src/parquet_thrift.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index 29cf4a3d7c88..8b2ab4943a50 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -866,7 +866,6 @@ pub(crate) mod tests { + WriteThrift> + PartialEq + Debug, - //for<'a> >>::Error: Debug, { let buf = Vec::::new(); let mut writer = ThriftCompactOutputProtocol::new(buf); From d8081a9388629fe45089e62d192eab90e6441f58 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 27 Aug 2025 17:35:45 -0700 Subject: [PATCH 39/46] fix clippy issues --- parquet/src/file/metadata/thrift_gen.rs | 2 +- parquet/src/parquet_thrift.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 1dc829e5cfe2..f15a5a6b16d8 100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -548,7 +548,7 @@ mod tests { ymax: 128.5.into(), zmin: Some(11.0.into()), zmax: Some(1300.0.into()), - mmin: Some(3.14.into()), + mmin: Some(3.7.into()), mmax: Some(42.0.into()), }); } diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index 4f04d990860e..ac5d72ecdd69 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -665,8 +665,8 @@ where fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_list_begin(T::ELEMENT_TYPE, self.len())?; - for i in 0..self.len() { - self[i].write_thrift(writer)?; + for item in self { + item.write_thrift(writer)?; } Ok(()) } From 5d6c8b1303ece5f98984d78067796e372b5291bc Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 27 Aug 2025 17:45:01 -0700 Subject: [PATCH 40/46] allow unused page header structs --- parquet/src/file/metadata/thrift_gen.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/parquet/src/file/metadata/thrift_gen.rs b/parquet/src/file/metadata/thrift_gen.rs index 630126ea8be8..06229fb1812f 
100644 --- a/parquet/src/file/metadata/thrift_gen.rs +++ b/parquet/src/file/metadata/thrift_gen.rs @@ -141,6 +141,7 @@ pub(crate) struct DataPageHeaderV2 { ); thrift_struct!( +#[allow(dead_code)] pub(crate) struct PageHeader { /// the type of the page: indicates which of the *_header fields is set 1: required PageType type_ From 709e8130f9f6eda29290dfbd8907e0ab7cd143fc Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 28 Aug 2025 18:50:14 -0700 Subject: [PATCH 41/46] remove Write from WriteThrift --- parquet/src/basic.rs | 28 +++++------ parquet/src/parquet_macros.rs | 32 ++++++------ parquet/src/parquet_thrift.rs | 94 +++++++++++++++++------------------ 3 files changed, 77 insertions(+), 77 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index cf451b961f69..5fd49043731e 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -197,17 +197,17 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ConvertedType { } } -impl WriteThrift for ConvertedType { +impl WriteThrift for ConvertedType { const ELEMENT_TYPE: ElementType = ElementType::I32; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { // because we've added NONE, the variant values are off by 1, so correct that here writer.write_i32(*self as i32 - 1) } } -impl WriteThriftField for ConvertedType { - fn write_thrift_field( +impl WriteThriftField for ConvertedType { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -476,10 +476,10 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for LogicalType { } } -impl WriteThrift for LogicalType { +impl WriteThrift for LogicalType { const ELEMENT_TYPE: ElementType = ElementType::Struct; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { match self { Self::String => { writer.write_empty_struct(1, 0)?; @@ -575,8 +575,8 @@ impl WriteThrift for LogicalType { } } -impl WriteThriftField for LogicalType { - fn write_thrift_field( +impl WriteThriftField for LogicalType { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -776,10 +776,10 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for Compression { // FIXME // ugh...why did we add compression level to some variants if we don't use them???? 
-impl WriteThrift for Compression { +impl WriteThrift for Compression { const ELEMENT_TYPE: ElementType = ElementType::I32; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { let id: i32 = match *self { Self::UNCOMPRESSED => 0, Self::SNAPPY => 1, @@ -794,8 +794,8 @@ impl WriteThrift for Compression { } } -impl WriteThriftField for Compression { - fn write_thrift_field( +impl WriteThriftField for Compression { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -1154,10 +1154,10 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for ColumnOrder { } } -impl WriteThrift for ColumnOrder { +impl WriteThrift for ColumnOrder { const ELEMENT_TYPE: ElementType = ElementType::Struct; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { match *self { Self::TYPE_DEFINED_ORDER(_) => { writer.write_field_begin(FieldType::Struct, 1, 0)?; diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index 41a5bf3b43f9..ae1d772a07cb 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -51,16 +51,16 @@ macro_rules! thrift_enum { } } - impl WriteThrift for $identifier { + impl WriteThrift for $identifier { const ELEMENT_TYPE: ElementType = ElementType::I32; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_i32(*self as i32) } } - impl WriteThriftField for $identifier { - fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { + impl WriteThriftField for $identifier { + fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { writer.write_field_begin(FieldType::I32, field_id, last_field_id)?; self.write_thrift(writer)?; Ok(field_id) @@ -135,10 +135,10 @@ macro_rules! thrift_union_all_empty { } } - impl WriteThrift for $identifier { + impl WriteThrift for $identifier { const ELEMENT_TYPE: ElementType = ElementType::Struct; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { match *self { $(Self::$field_name => writer.write_empty_struct($field_id, 0)?,)* }; @@ -147,8 +147,8 @@ macro_rules! thrift_union_all_empty { } } - impl WriteThriftField for $identifier { - fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { + impl WriteThriftField for $identifier { + fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; self.write_thrift(writer)?; Ok(field_id) @@ -220,10 +220,10 @@ macro_rules! 
thrift_union { } } - impl WriteThrift for $identifier { + impl WriteThrift for $identifier { const ELEMENT_TYPE: ElementType = ElementType::Struct; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { match self { $($crate::__thrift_write_variant_lhs!($field_name $($field_type)?, variant_val) => $crate::__thrift_write_variant_rhs!($field_id $($field_type)?, writer, variant_val),)* @@ -232,8 +232,8 @@ macro_rules! thrift_union { } } - impl WriteThriftField for $identifier { - fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { + impl WriteThriftField for $identifier { + fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; self.write_thrift(writer)?; Ok(field_id) @@ -307,19 +307,19 @@ macro_rules! thrift_struct { } } - impl<$($lt,)? W: Write> WriteThrift for $identifier $(<$lt>)? { + impl $(<$lt>)? WriteThrift for $identifier $(<$lt>)? { const ELEMENT_TYPE: ElementType = ElementType::Struct; #[allow(unused_assignments)] - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { let mut last_field_id = 0i16; $($crate::__thrift_write_required_or_optional_field!($required_or_optional $field_name, $field_id, $field_type, self, writer, last_field_id);)* writer.write_struct_end() } } - impl<$($lt,)? W: Write> WriteThriftField for $identifier $(<$lt>)? { - fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { + impl $(<$lt>)? WriteThriftField for $identifier $(<$lt>)? 
{ + fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; self.write_thrift(writer)?; Ok(field_id) diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index ac5d72ecdd69..593aec4e0f2b 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -650,20 +650,20 @@ impl ThriftCompactOutputProtocol { } } -pub(crate) trait WriteThrift { +pub(crate) trait WriteThrift { const ELEMENT_TYPE: ElementType; // used to write generated enums and structs - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()>; + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()>; } -impl WriteThrift for Vec +impl WriteThrift for Vec where - T: WriteThrift, + T: WriteThrift, { const ELEMENT_TYPE: ElementType = ElementType::List; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_list_begin(T::ELEMENT_TYPE, self.len())?; for item in self { item.write_thrift(writer)?; @@ -672,82 +672,82 @@ where } } -impl WriteThrift for bool { +impl WriteThrift for bool { const ELEMENT_TYPE: ElementType = ElementType::Bool; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_bool(*self) } } -impl WriteThrift for i8 { +impl WriteThrift for i8 { const ELEMENT_TYPE: ElementType = ElementType::Byte; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_i8(*self) } } -impl WriteThrift for i16 { +impl WriteThrift for i16 { const ELEMENT_TYPE: ElementType = ElementType::I16; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_i16(*self) } } -impl WriteThrift for i32 { +impl WriteThrift for i32 { const ELEMENT_TYPE: ElementType = ElementType::I32; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_i32(*self) } } -impl WriteThrift for i64 { +impl WriteThrift for i64 { const ELEMENT_TYPE: ElementType = ElementType::I64; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_i64(*self) } } -impl WriteThrift for OrderedF64 { +impl WriteThrift for OrderedF64 { const ELEMENT_TYPE: ElementType = ElementType::Double; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_double(self.0) } } -impl WriteThrift for &[u8] { +impl WriteThrift for &[u8] { const ELEMENT_TYPE: ElementType = ElementType::Binary; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_bytes(self) } } -impl WriteThrift for &str { +impl WriteThrift for &str { const ELEMENT_TYPE: ElementType = ElementType::Binary; - fn write_thrift(&self, writer: &mut 
ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_bytes(self.as_bytes()) } } -impl WriteThrift for String { +impl WriteThrift for String { const ELEMENT_TYPE: ElementType = ElementType::Binary; - fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { + fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { writer.write_bytes(self.as_bytes()) } } -pub(crate) trait WriteThriftField { +pub(crate) trait WriteThriftField { // used to write struct fields (which may be basic types or generated types). // write the field header and field value. returns `field_id`. - fn write_thrift_field( + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -755,8 +755,8 @@ pub(crate) trait WriteThriftField { ) -> Result; } -impl WriteThriftField for bool { - fn write_thrift_field( +impl WriteThriftField for bool { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -771,8 +771,8 @@ impl WriteThriftField for bool { } } -impl WriteThriftField for i8 { - fn write_thrift_field( +impl WriteThriftField for i8 { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -784,8 +784,8 @@ impl WriteThriftField for i8 { } } -impl WriteThriftField for i16 { - fn write_thrift_field( +impl WriteThriftField for i16 { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -797,8 +797,8 @@ impl WriteThriftField for i16 { } } -impl WriteThriftField for i32 { - fn write_thrift_field( +impl WriteThriftField for i32 { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -810,8 +810,8 @@ impl WriteThriftField for i32 { } } -impl WriteThriftField for i64 { - fn write_thrift_field( +impl WriteThriftField for i64 { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -823,8 +823,8 @@ impl WriteThriftField for i64 { } } -impl WriteThriftField for OrderedF64 { - fn write_thrift_field( +impl WriteThriftField for OrderedF64 { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -836,8 +836,8 @@ impl WriteThriftField for OrderedF64 { } } -impl WriteThriftField for &[u8] { - fn write_thrift_field( +impl WriteThriftField for &[u8] { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -849,8 +849,8 @@ impl WriteThriftField for &[u8] { } } -impl WriteThriftField for &str { - fn write_thrift_field( +impl WriteThriftField for &str { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -862,8 +862,8 @@ impl WriteThriftField for &str { } } -impl WriteThriftField for String { - fn write_thrift_field( +impl WriteThriftField for String { + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -875,11 +875,11 @@ impl WriteThriftField for String { } } -impl WriteThriftField for Vec +impl WriteThriftField for Vec where - T: WriteThrift, + T: WriteThrift, { - fn write_thrift_field( + fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, @@ -901,7 +901,7 @@ pub(crate) mod tests { pub(crate) fn test_roundtrip(val: T) where T: for<'a> TryFrom<&'a mut ThriftCompactInputProtocol<'a>> - + WriteThrift> + + WriteThrift + PartialEq + Debug, for<'a> >>::Error: Debug, From 057945627d5ef81141382e1ffd3d3ab49d13cb6c Mon Sep 
17 00:00:00 2001 From: Ed Seidl Date: Thu, 28 Aug 2025 18:56:50 -0700 Subject: [PATCH 42/46] finish merge --- parquet/src/parquet_macros.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index 222020797817..9405a9b174e4 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -232,8 +232,8 @@ macro_rules! thrift_union { } } - impl $(<$lt>)? WriteThriftField for $identifier $(<$lt>)? { - fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { + impl $(<$lt>)? WriteThriftField for $identifier $(<$lt>)? { + fn write_thrift_field(&self, writer: &mut ThriftCompactOutputProtocol, field_id: i16, last_field_id: i16) -> Result { writer.write_field_begin(FieldType::Struct, field_id, last_field_id)?; self.write_thrift(writer)?; Ok(field_id) From f81a732402bb6c8896ec204f15bf23a4943dfcf1 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 10 Sep 2025 14:05:41 -0700 Subject: [PATCH 43/46] get a start on some documentation and add some TODOs --- parquet/src/basic.rs | 5 +- parquet/src/parquet_macros.rs | 4 ++ parquet/src/parquet_thrift.rs | 89 +++++++++++++++++++++++++++++++++-- 3 files changed, 93 insertions(+), 5 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 5fd49043731e..5fffb56cdf74 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -774,8 +774,9 @@ impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for Compression { } } -// FIXME -// ugh...why did we add compression level to some variants if we don't use them???? +// TODO(ets): explore replacing this with a thrift_enum!(ThriftCompression) for the serialization +// and then provide `From` impls to convert back and forth. This is necessary due to the addition +// of compression level to some variants. impl WriteThrift for Compression { const ELEMENT_TYPE: ElementType = ElementType::I32; diff --git a/parquet/src/parquet_macros.rs b/parquet/src/parquet_macros.rs index ae1d772a07cb..eb523a6982a0 100644 --- a/parquet/src/parquet_macros.rs +++ b/parquet/src/parquet_macros.rs @@ -20,6 +20,10 @@ // They allow for pasting sections of the Parquet thrift IDL file // into a macro to generate rust structures and implementations. +// TODO(ets): These macros need a good bit of documentation so other developers will be able +// to use them correctly. Also need to write a .md file with complete examples of both how +// to use the macros, and how to implement custom readers and writers when necessary. + #[macro_export] #[allow(clippy::crate_in_macro_def)] /// macro to generate rust enums from a thrift enum definition diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index 593aec4e0f2b..590e5d9e1eb7 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -549,24 +549,36 @@ where ///////////////////////// // thrift compact output +/// Low-level object used to serialize structs to the Thrift [compact output] protocol. +/// +/// This struct serves as a wrapper around a [`Write`] object, to which thrift encoded data +/// will written. The implementation provides functions to write Thrift primitive types, as well +/// as functions used in the encoding of lists and structs. This should rarely be used directly, +/// but is instead intended for use by implementers of [`WriteThrift`] and [`WriteThriftField`]. 
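// A minimal standalone sketch of the byte stream the compact output protocol produces for
// a small struct, built from the same ULEB128 varint and zig-zag primitives this writer
// documents. The struct shape ({ 1: i32 = 3, 2: i64 = 300 }) and the helper names are
// illustrative only; type codes 5 (i32) and 6 (i64) come from the compact protocol spec.
fn zig_zag(v: i64) -> u64 {
    ((v << 1) ^ (v >> 63)) as u64
}

fn push_uleb128(out: &mut Vec<u8>, mut v: u64) {
    while v > 0x7f {
        out.push((v as u8 & 0x7f) | 0x80);
        v >>= 7;
    }
    out.push(v as u8);
}

fn main() {
    let mut out = Vec::new();
    // field 1 (delta 1 from id 0) of type i32, followed by zig_zag(3) = 6
    out.push((1 << 4) | 5);
    push_uleb128(&mut out, zig_zag(3));
    // field 2 (delta 1 from id 1) of type i64, followed by zig_zag(300) = 600 = [0xd8, 0x04]
    out.push((1 << 4) | 6);
    push_uleb128(&mut out, zig_zag(300));
    // a single stop byte ends the struct
    out.push(0x00);
    assert_eq!(out, [0x15u8, 0x06, 0x16, 0xd8, 0x04, 0x00]);
}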
+/// +/// [compact output]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md pub(crate) struct ThriftCompactOutputProtocol { writer: W, } impl ThriftCompactOutputProtocol { + /// Create a new `ThriftCompactOutputProtocol` wrapping the byte sink `writer`. pub(crate) fn new(writer: W) -> Self { Self { writer } } + /// Return a reference to the underlying `Write`. pub(crate) fn inner(&self) -> &W { &self.writer } + /// Write a single byte to the output stream. fn write_byte(&mut self, b: u8) -> Result<()> { self.writer.write_all(&[b])?; Ok(()) } + /// Write the given `u64` as a ULEB128 encoded varint. fn write_vlq(&mut self, val: u64) -> Result<()> { let mut v = val; while v > 0x7f { @@ -576,11 +588,16 @@ impl ThriftCompactOutputProtocol { self.write_byte(v as u8) } + /// Write the given `i64` as a zig-zag encoded varint. fn write_zig_zag(&mut self, val: i64) -> Result<()> { let s = (val < 0) as i64; self.write_vlq((((val ^ -s) << 1) + s) as u64) } + /// Used to mark the start of a Thrift struct field of type `field_type`. `last_field_id` + /// is used to compute a delta to the given `field_id` per the compact protocol [spec]. + /// + /// [spec]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#struct-encoding pub(crate) fn write_field_begin( &mut self, field_type: FieldType, @@ -596,6 +613,7 @@ impl ThriftCompactOutputProtocol { } } + /// Used to indicate the start of a list of `element_type` elements. pub(crate) fn write_list_begin(&mut self, element_type: ElementType, len: usize) -> Result<()> { if len < 15 { self.write_byte((len as u8) << 4 | element_type as u8) @@ -605,22 +623,29 @@ impl ThriftCompactOutputProtocol { } } + /// Used to mark the end of a struct. This must be called after all fields of the struct have + /// been written. pub(crate) fn write_struct_end(&mut self) -> Result<()> { self.write_byte(0) } + /// Serialize a slice of `u8`s. This will encode a length, and then write the bytes without + /// further encoding. pub(crate) fn write_bytes(&mut self, val: &[u8]) -> Result<()> { self.write_vlq(val.len() as u64)?; self.writer.write_all(val)?; Ok(()) } + /// Short-cut method used to encode structs that have no fields (often used in Thrift unions). + /// This simply encodes the field id and then immediately writes the end-of-struct marker. pub(crate) fn write_empty_struct(&mut self, field_id: i16, last_field_id: i16) -> Result { self.write_field_begin(FieldType::Struct, field_id, last_field_id)?; self.write_struct_end()?; Ok(last_field_id) } + /// Write a boolean value. pub(crate) fn write_bool(&mut self, val: bool) -> Result<()> { match val { true => self.write_byte(1), @@ -628,35 +653,47 @@ impl ThriftCompactOutputProtocol { } } + /// Write a zig-zag encoded `i8` value. pub(crate) fn write_i8(&mut self, val: i8) -> Result<()> { self.write_byte(val as u8) } + /// Write a zig-zag encoded `i16` value. pub(crate) fn write_i16(&mut self, val: i16) -> Result<()> { self.write_zig_zag(val as _) } + /// Write a zig-zag encoded `i32` value. pub(crate) fn write_i32(&mut self, val: i32) -> Result<()> { self.write_zig_zag(val as _) } + /// Write a zig-zag encoded `i64` value. pub(crate) fn write_i64(&mut self, val: i64) -> Result<()> { self.write_zig_zag(val as _) } + /// Write a double value. pub(crate) fn write_double(&mut self, val: f64) -> Result<()> { self.writer.write_all(&val.to_le_bytes())?; Ok(()) } } +/// Trait implemented by objects that are to be serialized to a Thrift [compact output] protocol +/// stream. 
Implementations are also provided for primitive Thrift types. +/// +/// [compact output]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md pub(crate) trait WriteThrift { + /// The [`ElementType`] to use when a list of this object is written. const ELEMENT_TYPE: ElementType; - // used to write generated enums and structs + /// Serialize this object to the given `writer`. fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()>; } +/// Implementation for a vector of thrift serializable objects that implement [`WriteThrift`]. +/// This will write the necessary list header and then serialize the elements one-at-a-time. impl WriteThrift for Vec where T: WriteThrift, @@ -744,9 +781,55 @@ impl WriteThrift for String { } } +/// Trait implemented by objects that are fields of Thrift structs. +/// +/// For example, given the Thrift struct definition +/// ``` +/// struct MyStruct { +/// 1: required i32 field1 +/// 2: optional bool field2 +/// 3: optional OtherStruct field3 +/// } +/// ``` +/// +/// which becomes in Rust +/// ```rust +/// struct MyStruct { +/// field1: i32, +/// field2: Option, +/// field3: Option, +/// } +/// ``` +/// the impl of `WriteThrift` for `MyStruct` will use the `WriteThriftField` impls for `i32`, +/// `bool`, and `OtherStruct`. +/// +/// ``` +/// impl WriteThrift for MyStruct { +/// const ELEMENT_TYPE: ElementType = ElementType::Double; +/// +/// fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { +/// let mut last_field_id = 0i16; +/// last_field_id = self.field1.write_thrift_field(writer, 1, last_field_id)?; +/// if self.field2.is_some() { +/// // if field2 is `None` then this assignment won't happen and last_field_id will remain +/// // `1` when writing `field3` +/// last_field_id = self.field2.write_thrift_field(writer, 2, last_field_id)?; +/// } +/// if self.field3.is_some() { +/// // no need to assign last_field_id since this is the final field. +/// self.field3.write_thrift_field(writer, 3, last_field_id)?; +/// } +/// } +/// } +/// ``` +/// pub(crate) trait WriteThriftField { - // used to write struct fields (which may be basic types or generated types). - // write the field header and field value. returns `field_id`. + /// Used to write struct fields (which may be primitive or IDL defined types). This will + /// write the field marker for the given `field_id`, using `last_field_id` to compute the + /// field delta used by the Thrift [compact protocol]. On success this will return `field_id` + /// to be used in chaining. + /// + /// [compact protocol]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#struct-encoding fn write_thrift_field( &self, writer: &mut ThriftCompactOutputProtocol, From 7268dd3343be9ab1070614aaa5d599e3dd3110d6 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 10 Sep 2025 14:39:09 -0700 Subject: [PATCH 44/46] fix docs --- parquet/src/parquet_thrift.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index 590e5d9e1eb7..9b83c0a01b8d 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -784,7 +784,7 @@ impl WriteThrift for String { /// Trait implemented by objects that are fields of Thrift structs. 
/// /// For example, given the Thrift struct definition -/// ``` +/// ```ignore /// struct MyStruct { /// 1: required i32 field1 /// 2: optional bool field2 @@ -793,7 +793,8 @@ impl WriteThrift for String { /// ``` /// /// which becomes in Rust -/// ```rust +/// ```no_run +/// # struct OtherStruct {} /// struct MyStruct { /// field1: i32, /// field2: Option, @@ -803,10 +804,8 @@ impl WriteThrift for String { /// the impl of `WriteThrift` for `MyStruct` will use the `WriteThriftField` impls for `i32`, /// `bool`, and `OtherStruct`. /// -/// ``` +/// ```ignore /// impl WriteThrift for MyStruct { -/// const ELEMENT_TYPE: ElementType = ElementType::Double; -/// /// fn write_thrift(&self, writer: &mut ThriftCompactOutputProtocol) -> Result<()> { /// let mut last_field_id = 0i16; /// last_field_id = self.field1.write_thrift_field(writer, 1, last_field_id)?; @@ -819,6 +818,7 @@ impl WriteThrift for String { /// // no need to assign last_field_id since this is the final field. /// self.field3.write_thrift_field(writer, 3, last_field_id)?; /// } +/// writer.write_struct_end() /// } /// } /// ``` From cfa674012cefad01070b7c9c03b4e4059496105b Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 10 Sep 2025 15:38:11 -0700 Subject: [PATCH 45/46] backport fix for tests without encryption --- parquet/src/file/serialized_reader.rs | 6 ++++++ parquet/tests/arrow_reader/bad_data.rs | 3 +++ 2 files changed, 9 insertions(+) diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 335f0bc3601b..728598045315 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -1875,10 +1875,16 @@ mod tests { 80, 65, 82, 49, ]; let ret = SerializedFileReader::new(Bytes::copy_from_slice(&data)); + #[cfg(feature = "encryption")] assert_eq!( ret.err().unwrap().to_string(), "Parquet error: Could not parse metadata: Parquet error: Received empty union from remote ColumnOrder" ); + #[cfg(not(feature = "encryption"))] + assert_eq!( + ret.err().unwrap().to_string(), + "Parquet error: Received empty union from remote ColumnOrder" + ); } #[test] diff --git a/parquet/tests/arrow_reader/bad_data.rs b/parquet/tests/arrow_reader/bad_data.rs index 58e342ab39d1..ecf449a7ce61 100644 --- a/parquet/tests/arrow_reader/bad_data.rs +++ b/parquet/tests/arrow_reader/bad_data.rs @@ -80,10 +80,13 @@ fn test_invalid_files() { #[test] fn test_parquet_1481() { let err = read_file("PARQUET-1481.parquet").unwrap_err(); + #[cfg(feature = "encryption")] assert_eq!( err.to_string(), "Parquet error: Could not parse metadata: Parquet error: Unexpected Type -7" ); + #[cfg(not(feature = "encryption"))] + assert_eq!(err.to_string(), "Parquet error: Unexpected Type -7"); } #[test] From 82f31a41934d606015b31726a7b09bb61669c190 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 11 Sep 2025 07:47:44 -0700 Subject: [PATCH 46/46] add documentation --- parquet/src/parquet_thrift.rs | 89 +++++++++++++++++++++++++++-------- 1 file changed, 69 insertions(+), 20 deletions(-) diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index e37942f96207..17847d0b71e5 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -24,10 +24,9 @@ use std::{cmp::Ordering, io::Write}; use crate::errors::{ParquetError, Result}; -// Couldn't implement thrift structs with f64 do to lack of Eq -// for f64. This is a hacky workaround for now...there are other -// wrappers out there that should probably be used instead. 
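// --------------------------------------------------------------------------
// Illustrative sketch (standalone, and not necessarily how `OrderedF64` below
// is implemented): one way a float wrapper can provide `Eq`/`Ord` is IEEE 754
// total order via the standard library's `f64::total_cmp`. The `TotalF64`
// name is invented for this example.
use std::cmp::Ordering;

#[derive(Debug, Clone, Copy)]
struct TotalF64(f64);

impl PartialEq for TotalF64 {
    fn eq(&self, other: &Self) -> bool {
        self.0.total_cmp(&other.0) == Ordering::Equal
    }
}
impl Eq for TotalF64 {}

impl PartialOrd for TotalF64 {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for TotalF64 {
    fn cmp(&self, other: &Self) -> Ordering {
        self.0.total_cmp(&other.0)
    }
}

fn main() {
    // Unlike `==` on raw f64 values, total order distinguishes -0.0 from 0.0
    // and makes NaN compare equal to itself.
    assert!(TotalF64(-0.0) < TotalF64(0.0));
    assert_eq!(TotalF64(f64::NAN), TotalF64(f64::NAN));
}
// --------------------------------------------------------------------------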
-// thrift seems to re-export an impl from ordered-float +/// Wrapper for thrift `double` fields. This is used to provide +/// an implementation of `Eq` for floats. This implementation +/// uses IEEE 754 total order. #[derive(Debug, Clone, Copy, PartialEq)] pub struct OrderedF64(f64); @@ -156,25 +155,52 @@ impl TryFrom for ElementType { } } +/// Struct used to describe a [thrift struct] field during decoding. +/// +/// [thrift struct]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#struct-encoding pub(crate) struct FieldIdentifier { + /// The type for the field. pub(crate) field_type: FieldType, + /// The field's `id`. May be computed from delta or directly decoded. pub(crate) id: i16, + /// Stores the value for booleans. + /// + /// Boolean fields store no data, instead the field type is either boolean true, or + /// boolean false. pub(crate) bool_val: Option, } +/// Struct used to describe a [thrift list]. +/// +/// [thrift list]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#list-and-set #[derive(Clone, Debug, Eq, PartialEq)] pub(crate) struct ListIdentifier { + /// The type for each element in the list. pub(crate) element_type: ElementType, + /// Number of elements contained in the list. pub(crate) size: i32, } +/// Low-level object used to deserialize structs encoded with the Thrift [compact] protocol. +/// +/// Implementation of this trait must provide the low-level functions `read_byte`, `read_bytes`, +/// `skip_bytes`, and `read_double`. These primitives are used by the default functions provided +/// here to perform deserialization. +/// +/// [compact]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md pub(crate) trait ThriftCompactInputProtocol<'a> { + /// Read a single byte from the input. fn read_byte(&mut self) -> Result; + /// Read a Thrift encoded [binary] from the input. + /// + /// [binary]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#binary-encoding fn read_bytes(&mut self) -> Result<&'a [u8]>; + /// Skip the next `n` bytes of input. fn skip_bytes(&mut self, n: usize) -> Result<()>; + /// Read a ULEB128 encoded unsigned varint from the input. fn read_vlq(&mut self) -> Result { let mut in_progress = 0; let mut shift = 0; @@ -188,11 +214,13 @@ pub(crate) trait ThriftCompactInputProtocol<'a> { } } + /// Read a zig-zag encoded signed varint from the input. fn read_zig_zag(&mut self) -> Result { let val = self.read_vlq()?; Ok((val >> 1) as i64 ^ -((val & 1) as i64)) } + /// Read the [`ListIdentifier`] for a Thrift encoded list. fn read_list_begin(&mut self) -> Result { let header = self.read_byte()?; let element_type = ElementType::try_from(header & 0x0f)?; @@ -211,6 +239,7 @@ pub(crate) trait ThriftCompactInputProtocol<'a> { }) } + /// Read the [`FieldIdentifier`] for a field in a Thrift encoded struct. fn read_field_begin(&mut self, last_field_id: i16) -> Result { // we can read at least one byte, which is: // - the type @@ -256,12 +285,12 @@ pub(crate) trait ThriftCompactInputProtocol<'a> { } } - // This is a specialized version of read_field_begin, solely for use in parsing - // PageLocation structs in the offset index. This function assumes that the delta - // field will always be less than 0xf, fields will be in order, and no boolean fields - // will be read. This also skips validation of the field type. 
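// --------------------------------------------------------------------------
// Illustrative sketch (standalone, not part of the patch): the single-byte
// header split performed by `read_field_header` below. The compact protocol
// packs a short field-id delta into the high nibble and the field type into
// the low nibble; the example values follow the compact protocol's type
// numbering (5 = i32, 6 = i64).
fn split_field_header(byte: u8) -> (u8, u8) {
    let field_delta = (byte & 0xf0) >> 4;
    let field_type = byte & 0x0f;
    (field_type, field_delta)
}

fn main() {
    // 0x16: delta 1, type 6 (i64) -- a required i64 field that directly
    // follows the previous field.
    assert_eq!(split_field_header(0x16), (6, 1));
    // 0x15: delta 1, type 5 (i32).
    assert_eq!(split_field_header(0x15), (5, 1));
}
// --------------------------------------------------------------------------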
-    //
-    // Returns a tuple of (field_type, field_delta)
+    /// This is a specialized version of [`Self::read_field_begin`], solely for use in parsing
+    /// simple structs. This function assumes that the delta field will always be less than 0xf,
+    /// fields will be in order, and no boolean fields will be read.
+    /// This also skips validation of the field type.
+    ///
+    /// Returns a tuple of `(field_type, field_delta)`.
     fn read_field_header(&mut self) -> Result<(u8, u8)> {
         let field_type = self.read_byte()?;
         let field_delta = (field_type & 0xf0) >> 4;
@@ -269,7 +298,8 @@ pub(crate) trait ThriftCompactInputProtocol<'a> {
         Ok((field_type, field_delta))
     }

-    // not to be used for bool struct fields, just for bool arrays
+    /// Read a boolean list element. This should not be used for struct fields. For the latter,
+    /// use the [`FieldIdentifier::bool_val`] field.
     fn read_bool(&mut self) -> Result<bool> {
         let b = self.read_byte()?;
         // Previous versions of the thrift specification said to use 0 and 1 inside collections,
@@ -283,29 +313,38 @@ pub(crate) trait ThriftCompactInputProtocol<'a> {
         }
     }

+    /// Read a Thrift [binary] as a UTF-8 encoded string.
+    ///
+    /// [binary]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#binary-encoding
     fn read_string(&mut self) -> Result<&'a str> {
         let slice = self.read_bytes()?;
         Ok(std::str::from_utf8(slice)?)
     }

+    /// Read an `i8`.
     fn read_i8(&mut self) -> Result<i8> {
         Ok(self.read_byte()? as _)
     }

+    /// Read an `i16`.
     fn read_i16(&mut self) -> Result<i16> {
         Ok(self.read_zig_zag()? as _)
     }

+    /// Read an `i32`.
     fn read_i32(&mut self) -> Result<i32> {
         Ok(self.read_zig_zag()? as _)
     }

+    /// Read an `i64`.
     fn read_i64(&mut self) -> Result<i64> {
         self.read_zig_zag()
     }

+    /// Read a Thrift `double` as `f64`.
     fn read_double(&mut self) -> Result<f64>;

+    /// Skip a ULEB128 encoded varint.
     fn skip_vlq(&mut self) -> Result<()> {
         loop {
             let byte = self.read_byte()?;
@@ -315,20 +354,24 @@ pub(crate) trait ThriftCompactInputProtocol<'a> {
         }
     }

+    /// Skip a Thrift [binary].
+    ///
+    /// [binary]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#binary-encoding
     fn skip_binary(&mut self) -> Result<()> {
         let len = self.read_vlq()? as usize;
         self.skip_bytes(len)
     }

     /// Skip a field with type `field_type` recursively until the default
-    /// maximum skip depth is reached.
+    /// maximum skip depth (currently 64) is reached.
     fn skip(&mut self, field_type: FieldType) -> Result<()> {
-        // TODO: magic number
-        self.skip_till_depth(field_type, 64)
+        const DEFAULT_SKIP_DEPTH: i8 = 64;
+        self.skip_till_depth(field_type, DEFAULT_SKIP_DEPTH)
     }

     /// Empty structs in unions consist of a single byte of 0 for the field stop record.
-    /// This skips that byte without pushing to the field id stack.
+    /// This skips that byte without incurring the cost of processing the [`FieldIdentifier`].
+    /// Will return an error if the struct is not actually empty.
     fn skip_empty_struct(&mut self) -> Result<()> {
         let b = self.read_byte()?;
         if b != 0 {
@@ -379,19 +422,23 @@ pub(crate) trait ThriftCompactInputProtocol<'a> {
     }
 }

+/// A high-performance Thrift reader that reads from a slice of bytes.
 pub(crate) struct ThriftSliceInputProtocol<'a> {
     buf: &'a [u8],
 }

 impl<'a> ThriftSliceInputProtocol<'a> {
+    /// Create a new `ThriftSliceInputProtocol` using the bytes in `buf`.
     pub fn new(buf: &'a [u8]) -> Self {
         Self { buf }
     }

+    /// Re-initialize this reader with a new slice.
     pub fn reset_buffer(&mut self, buf: &'a [u8]) {
         self.buf = buf;
     }

+    /// Return the current buffer as a slice.
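+    ///
+    /// For illustration only (this assumes the trait's read methods consume
+    /// bytes from the front of the wrapped slice):
+    ///
+    /// ```ignore
+    /// let mut prot = ThriftSliceInputProtocol::new(&[0x02, 0xff]);
+    /// assert_eq!(prot.read_i64().unwrap(), 1); // zig-zag varint 0x02 decodes to 1
+    /// assert_eq!(prot.as_slice(), &[0xff]);    // unread bytes remain
+    /// ```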
    pub fn as_slice(&self) -> &'a [u8] {
        self.buf
    }
@@ -433,8 +480,10 @@ fn eof_error() -> ParquetError {
     eof_err!("Unexpected EOF")
 }

+/// Trait implemented for objects that can be deserialized from a Thrift input stream.
+/// Implementations are provided for Thrift primitive types.
 pub(crate) trait ReadThrift<'a, R: ThriftCompactInputProtocol<'a>> {
-    // used to read generated enums and structs
+    /// Read an object of type `Self` from the input protocol object.
     fn read_thrift(prot: &mut R) -> Result<Self>
     where
         Self: Sized;
@@ -494,6 +543,9 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for &'a [u8] {
     }
 }

+/// Read a Thrift encoded [list] from the input protocol object.
+///
+/// [list]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#list-and-set
 pub(crate) fn read_thrift_vec<'a, T, R>(prot: &mut R) -> Result<Vec<T>>
 where
     R: ThriftCompactInputProtocol<'a>,
@@ -945,10 +997,7 @@ pub(crate) mod tests {

     pub(crate) fn test_roundtrip<T>(val: T)
     where
-        T: for<'a> ReadThrift<'a, ThriftSliceInputProtocol<'a>>
-            + WriteThrift
-            + PartialEq
-            + Debug,
+        T: for<'a> ReadThrift<'a, ThriftSliceInputProtocol<'a>> + WriteThrift + PartialEq + Debug,
     {
         let buf = Vec::<u8>::new();
         let mut writer = ThriftCompactOutputProtocol::new(buf);
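        // Illustrative aside (not part of the patch): the bytes a list read by
        // `read_thrift_vec` above would consume for a 3-element i32 list holding
        // 1, 2, 3, assuming each value fits in a single varint byte:
        //
        //   0x35              list header: size 3 in the high nibble, element type 5 (i32)
        //   0x02 0x04 0x06    zig-zag varints for 1, 2, 3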