@@ -82,20 +82,23 @@ impl BitSequence {
8282 pub fn new ( bits : u32 , bit_len : BitLen ) -> Self {
8383 Self { bits, bit_len }
8484 }
85+
8586 pub fn bits ( & self ) -> u32 {
8687 self . bits
8788 }
89+
8890 /// The number of bits of `bits` to use.
8991 pub fn bit_len ( & self ) -> BitLen {
9092 self . bit_len
9193 }
94+
9295 /// Split the bits into a prefix of `bit_len` bits and a suffix of `self.bit_len - bit_len`
9396 /// bits.
9497 ///
9598 /// # Failure
9699 ///
97100 /// This function panics if `bit_len > self.bit_len`.
98- pub fn split ( & self , bit_len : BitLen ) -> ( u32 , u32 ) {
101+ pub fn split_bits ( & self , bit_len : BitLen ) -> ( u32 , u32 ) {
99102 let shift = self . bit_len - bit_len;
100103 match shift. into ( ) {
101104 0u8 => ( self . bits , 0 ) , // Special case: cannot >> 32
@@ -106,6 +109,25 @@ impl BitSequence {
106109 ) ,
107110 }
108111 }
112+
113+ /// Split the bits into a prefix of `bit_len` bits and a suffix of `self.bit_len - bit_len`
114+ /// bits.
115+ ///
116+ /// # Failure
117+ ///
118+ /// This function panics if `bit_len > self.bit_len`.
119+ pub fn split ( & self , bit_len : BitLen ) -> ( BitSequence , BitSequence ) {
120+ let ( prefix, suffix) = self . split_bits ( bit_len) ;
121+ (
122+ BitSequence :: new ( prefix, bit_len) ,
123+ BitSequence :: new ( suffix, self . bit_len - bit_len) ,
124+ )
125+ }
126+
127+ /// Add lowest-weight bits to this bit sequence until it reaches
128+ /// a sufficient bit length.
129+ ///
130+ /// Does nothing if the bit sequence already has a sufficient bitlength.
109131 pub fn pad_lowest_to ( & self , total_bit_len : BitLen ) -> Cow < BitSequence > {
110132 assert ! ( total_bit_len. 0 <= 32u8 ) ;
111133 if total_bit_len <= self . bit_len {
@@ -117,21 +139,29 @@ impl BitSequence {
117139 }
118140 Cow :: Owned ( BitSequence :: new ( self . bits << shift, total_bit_len) )
119141 }
142+
143+ /// Prepend a sequence of bits to a sequencce.s
144+ pub fn with_prefix ( & self , prefix : & BitSequence ) -> Self {
145+ assert ! ( ( prefix. bit_len( ) + self . bit_len( ) ) . as_u8( ) <= 32 ) ;
146+ let bits = self . bits | ( prefix. bits ( ) << self . bit_len ) ;
147+ let bit_len = self . bit_len + prefix. bit_len ;
148+ BitSequence :: new ( bits, bit_len)
149+ }
120150}
121151
#[test]
fn test_bit_sequence_split() {
    // A full 32-bit sequence whose 16 highest bits are set.
    let high_half = 0b11111111_11111111_00000000_00000000;
    let wide = BitSequence::new(high_half, BitLen(32));
    assert_eq!(wide.split_bits(BitLen(0)), (0, high_half));
    assert_eq!(wide.split_bits(BitLen(32)), (high_half, 0));
    assert_eq!(wide.split_bits(BitLen(16)), (0b11111111_11111111, 0));

    // A 16-bit sequence whose 8 lowest bits are set.
    let low_byte = 0b00000000_00000000_00000000_11111111;
    let narrow = BitSequence::new(low_byte, BitLen(16));
    assert_eq!(narrow.split_bits(BitLen(0)), (0, low_byte));
    assert_eq!(narrow.split_bits(BitLen(16)), (low_byte, 0));
    assert_eq!(narrow.split_bits(BitLen(8)), (0, 0b11111111));
}
136166
137167/// A Huffman key
@@ -159,6 +189,10 @@ impl Key {
159189 Key ( BitSequence { bits, bit_len } )
160190 }
161191
192+ pub fn from_bit_sequence ( sequence : BitSequence ) -> Self {
193+ Self :: new ( sequence. bits , sequence. bit_len )
194+ }
195+
162196 /// The bits in this Key.
163197 ///
164198 /// # Invariant
@@ -176,6 +210,11 @@ impl Key {
176210 pub fn as_bit_sequence ( & self ) -> & BitSequence {
177211 & self . 0
178212 }
213+
214+ pub fn with_prefix ( & self , prefix : & BitSequence ) -> Self {
215+ let sequence = self . 0 . with_prefix ( prefix) ;
216+ Key :: from_bit_sequence ( sequence)
217+ }
179218}
180219
181220/// A node in the Huffman tree.
@@ -219,43 +258,46 @@ impl<T> PartialEq for Node<T> {
219258}
220259impl < T > Eq for Node < T > { }
221260
222- /// Keys associated to a sequence of values.
261+ /// Codebook associated to a sequence of values.
223262#[ derive( Clone , Debug ) ]
224- pub struct Keys < T > {
225- /// The longest bit length that actually appears in `keys `.
263+ pub struct Codebook < T > {
264+ /// The longest bit length that actually appears in `mappings `.
226265 highest_bit_len : BitLen ,
227266
228267 /// The sequence of keys.
229268 ///
230269 /// Order is meaningful.
231- keys : Vec < ( T , Key ) > ,
270+ mappings : Vec < ( T , Key ) > ,
232271}
233272
234- impl < T > Keys < T > {
273+ impl < T > Codebook < T > {
274+ /// The number of elements in this Codebook.
235275 pub fn len ( & self ) -> usize {
236- self . keys . len ( )
276+ self . mappings . len ( )
237277 }
278+
279+ /// The longest bit length that acctually appears in this Codebook.
238280 pub fn highest_bit_len ( & self ) -> BitLen {
239281 self . highest_bit_len
240282 }
241283}
242284
243- impl < T > IntoIterator for Keys < T > {
285+ impl < T > IntoIterator for Codebook < T > {
244286 type Item = ( T , Key ) ;
245287 type IntoIter = std:: vec:: IntoIter < ( T , Key ) > ;
246288 fn into_iter ( self ) -> Self :: IntoIter {
247- self . keys . into_iter ( )
289+ self . mappings . into_iter ( )
248290 }
249291}
250292
251- impl < T > Keys < T >
293+ impl < T > Codebook < T >
252294where
253295 T : Ord + Clone ,
254296{
255- /// Compute a `Keys ` from a sequence of values.
297+ /// Compute a `Codebook ` from a sequence of values.
256298 ///
257299 /// Optionally, `max_bit_len` may specify a largest acceptable bit length.
258- /// If `Keys ` may not be computed without exceeding this bit length,
300+ /// If the `Codebook ` may not be computed without exceeding this bit length,
259301 /// fail with `Err(problematic_bit_len)`.
260302 ///
261303 /// The current implementation only attempts to produce the best compression
@@ -278,11 +320,11 @@ where
278320 let counter = map. entry ( item) . or_insert ( 0 . into ( ) ) ;
279321 * counter += 1 . into ( ) ;
280322 }
281- // Then compute the `Keys `.
323+ // Then compute the `Codebook `.
282324 Self :: from_instances ( map, max_bit_len)
283325 }
284326
285- /// Compute a `Keys ` from a sequence of values
327+ /// Compute a `Codebook ` from a sequence of values
286328 /// with a number of instances already attached.
287329 ///
288330 /// The current implementation only attempts to produce the best compression
@@ -305,27 +347,27 @@ where
305347
306348 // The bits associated to the next value.
307349 let mut bits = 0 ;
308- let mut keys = Vec :: with_capacity ( bit_lengths. len ( ) ) ;
350+ let mut mappings = Vec :: with_capacity ( bit_lengths. len ( ) ) ;
309351
310352 for i in 0 ..bit_lengths. len ( ) - 1 {
311353 let ( bit_len, symbol, next_bit_len) = (
312354 bit_lengths[ i] . 1 ,
313355 bit_lengths[ i] . 0 . clone ( ) ,
314356 bit_lengths[ i + 1 ] . 1 ,
315357 ) ;
316- keys . push ( ( symbol. clone ( ) , Key :: new ( bits, bit_len) ) ) ;
358+ mappings . push ( ( symbol. clone ( ) , Key :: new ( bits, bit_len) ) ) ;
317359 bits = ( bits + 1 ) << ( next_bit_len - bit_len) ;
318360 if bit_len > highest_bit_len {
319361 highest_bit_len = bit_len;
320362 }
321363 }
322364 // Handle the last element.
323365 let ( ref symbol, bit_len) = bit_lengths[ bit_lengths. len ( ) - 1 ] ;
324- keys . push ( ( symbol. clone ( ) , Key :: new ( bits, bit_len) ) ) ;
366+ mappings . push ( ( symbol. clone ( ) , Key :: new ( bits, bit_len) ) ) ;
325367
326368 return Ok ( Self {
327369 highest_bit_len,
328- keys ,
370+ mappings ,
329371 } ) ;
330372 }
331373
@@ -412,26 +454,106 @@ where
#[test]
fn test_coded_from_sequence() {
    let sample = "appl";
    let codebook = Codebook::from_sequence(sample.chars(), std::u8::MAX).unwrap();
    let entries = &codebook.mappings;

    // 'p' occurs twice in the sample, so only 3 distinct symbols get a code.
    assert_eq!(entries.len(), 3);

    // Symbols come out in this order.
    assert_eq!(entries[0].0, 'p');
    assert_eq!(entries[1].0, 'a');
    assert_eq!(entries[2].0, 'l');

    // Bit length of each code.
    assert_eq!(entries[0].1.bit_len(), 1.into());
    assert_eq!(entries[1].1.bit_len(), 2.into());
    assert_eq!(entries[2].1.bit_len(), 2.into());

    // Bits of each code.
    assert_eq!(entries[0].1.bits(), 0b00);
    assert_eq!(entries[1].1.bits(), 0b10);
    assert_eq!(entries[2].1.bits(), 0b11);

    // With paths limited to 1 bit, the codebook cannot be built.
    assert_eq!(Codebook::from_sequence(sample.chars(), 1).unwrap_err(), 2);
}
480+
481+ impl < T > Codebook < T > {
482+ /// Return the mappings of a Codebook.
483+ pub fn mappings ( self ) -> Vec < ( T , Key ) > {
484+ self . mappings
485+ }
486+
487+ /// Split a Codebook into several Codebooks grouped by a common prefix.
488+ ///
489+ /// For instance, if `prefix_len` is 2, the result will be a vector of size 2^2
490+ /// containing:
491+ ///
492+ /// - at index 0 (= 0b00), all the keys starting with 0b00, minus the prefix 0b00;
493+ /// - at index 1 (= 0b01), all the keys starting with 0b01, minus the prefix 0b01;
494+ /// - at index 2 (= 0b10), all the keys starting with 0b10, minus the prefix 0b10;
495+ /// - at index 3 (= 0b11), all the keys starting with 0b11, minus the prefix 0b11.
496+ ///
497+ /// ```
498+ /// let sample = "appl";
499+ /// let coded = Codebook::from_sequence(sample.chars(), std::u8::MAX).unwrap();
500+ /// // 0b0 => p
501+ /// // 0b10 => a
502+ /// // 0b11 => l
503+ ///
504+ /// let buckets = coded.bucket_by_prefix(1);
505+ /// assert_eq!(buckets.len(), 2);
506+ ///
507+ /// // `buckets[0]` contains keys that start with `0`.
508+ /// assert_eq!(buckets[0].len(), 1);
509+ /// let bucket: Vec<_> = buckets[0].iter().collect();
510+ /// assert_eq!(bucket[0].0, 'p');
511+ /// assert_eq!(bucket[0].1, Key::new(0, BitLen(0))); // Key was 0b0, now empty.
512+ ///
513+ /// // `buckets[1]` contains keys that start with `1`.
514+ /// assert_eq!(buckets[0].len(), 2);
515+ /// let bucket: Vec<_> = buckets[0].iter().sorted_by_key(|(c, )| c);
516+ /// assert_eq!(bucket[0].0, 'a');
517+ /// assert_eq!(bucket[0].1, Key::new(1, BitLen(1))); // Key was 0b11, now 0b1
518+ /// assert_eq!(bucket[1].0, 'l');
519+ /// assert_eq!(bucket[1].1, Key::new(0, BitLen(1))); // Key was 0b10, now 0b0
520+ /// ```
521+ pub fn bucket_by_prefix ( self , prefix_len : BitLen ) -> Vec < Codebook < T > > {
522+ assert ! ( prefix_len < self . highest_bit_len) ;
523+
524+ // Prepare empty buckets.
525+ let mut result = Vec :: with_capacity ( 1usize << prefix_len) ;
526+ result. resize_with ( 1usize << prefix_len, || Codebook {
527+ highest_bit_len : 0 . into ( ) ,
528+ mappings : vec ! [ ] ,
529+ } ) ;
530+
531+ // Dispatch each (value, key) to its bucket.
532+ for ( value, key) in self {
533+ let ( prefix, suffix) = key. as_bit_sequence ( ) . split ( prefix_len) ;
534+ let ref mut bucket = result[ prefix. bits ( ) as usize ] ;
535+ if suffix. bit_len ( ) > bucket. highest_bit_len {
536+ bucket. highest_bit_len = suffix. bit_len ( ) ;
537+ }
538+ bucket
539+ . mappings
540+ . push ( ( value, Key :: from_bit_sequence ( suffix) ) ) ;
541+ }
542+
543+ result
544+ }
545+
546+ pub fn map < F , U > ( self , mut f : F ) -> Codebook < U >
547+ where
548+ F : FnMut ( T ) -> U ,
549+ {
550+ Codebook {
551+ highest_bit_len : self . highest_bit_len ,
552+ mappings : self
553+ . mappings
554+ . into_iter ( )
555+ . map ( |( value, key) | ( f ( value) , key) )
556+ . collect ( ) ,
557+ }
558+ }
437559}
0 commit comments