Skip to content

Commit a795030

Browse files
[Variant] Use BTreeMap for VariantBuilder.dict and ObjectBuilder.fields to maintain invariants upon entry writes (#7720)
# Which issue does this PR close? - It doesn't directly close the issue, but it's related to #7698 # Rationale for this change This commit changes the `dict` field in `VariantBuilder` + the `fields` field in `ObjectBuilder` to be `BTreeMap`s, and checks for existing field names in a object before appending a new field. These collections are often used in places where having an already sorted structure would be more performant. Inside of `ObjectBuilder::finish()`, we sort the fields by `field_name` and we can use the fact that `VariantBuilder`'s `dict` maintains a sorted mapping to `field_id` by `field_name`. To check whether an existing field name exists in a object, it is simply two lookups: 1) to find the `field_name: &str`'s unique `field_name_id`, and 2) check if the `ObjectBuilder` `fields` already has a key with that `field_name_id`. We make `ObjectBuilder` `fields` a `BTreeMap` sorted by `field_id`. Since `field_id`s correlate to insertion order, we now have some notion of which fields were inserted first. This also improves the time to look up the max field id, as it changes the linear scan over the entire `fields` collection to a logarithmic call using `fields.keys().last()`.
1 parent ce69739 commit a795030

File tree

1 file changed

+94
-20
lines changed

1 file changed

+94
-20
lines changed

parquet-variant/src/builder.rs

Lines changed: 94 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
// under the License.
1717
use crate::decoder::{VariantBasicType, VariantPrimitiveType};
1818
use crate::{ShortString, Variant};
19-
use std::collections::HashMap;
19+
use std::collections::BTreeMap;
2020

2121
const BASIC_TYPE_BITS: u8 = 2;
2222
const UNIX_EPOCH_DATE: chrono::NaiveDate = chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap();
@@ -166,15 +166,15 @@ fn make_room_for_header(buffer: &mut Vec<u8>, start_pos: usize, header_size: usi
166166
///
167167
pub struct VariantBuilder {
168168
buffer: Vec<u8>,
169-
dict: HashMap<String, u32>,
169+
dict: BTreeMap<String, u32>,
170170
dict_keys: Vec<String>,
171171
}
172172

173173
impl VariantBuilder {
174174
pub fn new() -> Self {
175175
Self {
176176
buffer: Vec::new(),
177-
dict: HashMap::new(),
177+
dict: BTreeMap::new(),
178178
dict_keys: Vec::new(),
179179
}
180180
}
@@ -296,7 +296,7 @@ impl VariantBuilder {
296296

297297
/// Add key to dictionary, return its ID
298298
fn add_key(&mut self, key: &str) -> u32 {
299-
use std::collections::hash_map::Entry;
299+
use std::collections::btree_map::Entry;
300300
match self.dict.entry(key.to_string()) {
301301
Entry::Occupied(entry) => *entry.get(),
302302
Entry::Vacant(entry) => {
@@ -482,7 +482,7 @@ impl<'a> ListBuilder<'a> {
482482
pub struct ObjectBuilder<'a> {
483483
parent: &'a mut VariantBuilder,
484484
start_pos: usize,
485-
fields: Vec<(u32, usize)>, // (field_id, offset)
485+
fields: BTreeMap<u32, usize>, // (field_id, offset)
486486
}
487487

488488
impl<'a> ObjectBuilder<'a> {
@@ -491,7 +491,7 @@ impl<'a> ObjectBuilder<'a> {
491491
Self {
492492
parent,
493493
start_pos,
494-
fields: Vec::new(),
494+
fields: BTreeMap::new(),
495495
}
496496
}
497497

@@ -500,25 +500,27 @@ impl<'a> ObjectBuilder<'a> {
500500
let id = self.parent.add_key(key);
501501
let field_start = self.parent.offset() - self.start_pos;
502502
self.parent.append_value(value);
503-
self.fields.push((id, field_start));
503+
let res = self.fields.insert(id, field_start);
504+
debug_assert!(res.is_none());
504505
}
505506

506507
/// Finalize object with sorted fields
507-
pub fn finish(mut self) {
508-
// Sort fields by key name
509-
self.fields.sort_by(|a, b| {
510-
let key_a = &self.parent.dict_keys[a.0 as usize];
511-
let key_b = &self.parent.dict_keys[b.0 as usize];
512-
key_a.cmp(key_b)
513-
});
514-
508+
pub fn finish(self) {
515509
let data_size = self.parent.offset() - self.start_pos;
516510
let num_fields = self.fields.len();
517511
let is_large = num_fields > u8::MAX as usize;
518512
let size_bytes = if is_large { 4 } else { 1 };
519513

520-
let max_id = self.fields.iter().map(|&(id, _)| id).max().unwrap_or(0);
521-
let id_size = int_size(max_id as usize);
514+
let field_ids_by_sorted_field_name = self
515+
.parent
516+
.dict
517+
.iter()
518+
.filter_map(|(_, id)| self.fields.contains_key(id).then_some(*id))
519+
.collect::<Vec<_>>();
520+
521+
let max_id = self.fields.keys().last().copied().unwrap_or(0) as usize;
522+
523+
let id_size = int_size(max_id);
522524
let offset_size = int_size(data_size);
523525

524526
let header_size = 1
@@ -542,17 +544,18 @@ impl<'a> ObjectBuilder<'a> {
542544
}
543545

544546
// Write field IDs (sorted order)
545-
for &(id, _) in &self.fields {
547+
for id in &field_ids_by_sorted_field_name {
546548
write_offset(
547549
&mut self.parent.buffer[pos..pos + id_size as usize],
548-
id as usize,
550+
*id as usize,
549551
id_size,
550552
);
551553
pos += id_size as usize;
552554
}
553555

554556
// Write field offsets
555-
for &(_, offset) in &self.fields {
557+
for id in &field_ids_by_sorted_field_name {
558+
let &offset = self.fields.get(id).unwrap();
556559
write_offset(
557560
&mut self.parent.buffer[pos..pos + offset_size as usize],
558561
offset,
@@ -749,6 +752,77 @@ mod tests {
749752
assert_eq!(field_ids, vec![1, 2, 0]);
750753
}
751754

755+
#[test]
756+
fn test_object_and_metadata_ordering() {
757+
let mut builder = VariantBuilder::new();
758+
759+
let mut obj = builder.new_object();
760+
761+
obj.append_value("zebra", "stripes"); // ID = 0
762+
obj.append_value("apple", "red"); // ID = 1
763+
764+
{
765+
// fields_map is ordered by insertion order (field id)
766+
let fields_map = obj.fields.keys().copied().collect::<Vec<_>>();
767+
assert_eq!(fields_map, vec![0, 1]);
768+
769+
// dict is ordered by field names
770+
// NOTE: when we support nested objects, we'll want to perform a filter by fields_map field ids
771+
let dict_metadata = obj
772+
.parent
773+
.dict
774+
.iter()
775+
.map(|(f, i)| (f.as_str(), *i))
776+
.collect::<Vec<_>>();
777+
778+
assert_eq!(dict_metadata, vec![("apple", 1), ("zebra", 0)]);
779+
780+
// dict_keys is ordered by insertion order (field id)
781+
let dict_keys = obj
782+
.parent
783+
.dict_keys
784+
.iter()
785+
.map(|k| k.as_str())
786+
.collect::<Vec<_>>();
787+
assert_eq!(dict_keys, vec!["zebra", "apple"]);
788+
}
789+
790+
obj.append_value("banana", "yellow"); // ID = 2
791+
792+
{
793+
// fields_map is ordered by insertion order (field id)
794+
let fields_map = obj.fields.keys().copied().collect::<Vec<_>>();
795+
assert_eq!(fields_map, vec![0, 1, 2]);
796+
797+
// dict is ordered by field names
798+
// NOTE: when we support nested objects, we'll want to perform a filter by fields_map field ids
799+
let dict_metadata = obj
800+
.parent
801+
.dict
802+
.iter()
803+
.map(|(f, i)| (f.as_str(), *i))
804+
.collect::<Vec<_>>();
805+
806+
assert_eq!(
807+
dict_metadata,
808+
vec![("apple", 1), ("banana", 2), ("zebra", 0)]
809+
);
810+
811+
// dict_keys is ordered by insertion order (field id)
812+
let dict_keys = obj
813+
.parent
814+
.dict_keys
815+
.iter()
816+
.map(|k| k.as_str())
817+
.collect::<Vec<_>>();
818+
assert_eq!(dict_keys, vec!["zebra", "apple", "banana"]);
819+
}
820+
821+
obj.finish();
822+
823+
builder.finish();
824+
}
825+
752826
#[test]
753827
fn test_append_object() {
754828
let (object_metadata, object_value) = {

0 commit comments

Comments
 (0)