@@ -476,18 +476,20 @@ pub mod common {
476
476
/// The methods here line up with the `AsBytes` and `FromBytes` traits.
477
477
pub mod bytes {
478
478
479
+ use crate :: AsBytes ;
480
+
479
481
/// A coupled encode/decode pair for sequences of byte slices: `decode` reads back what `encode` or `write` produced.
480
482
pub trait EncodeDecode {
481
483
/// The number of `u64` words required to encode `bytes`.
482
- fn length_in_words < ' a , I > ( bytes : I ) -> usize where I : Iterator < Item = ( u64 , & ' a [ u8 ] ) > ;
484
+ fn length_in_words < ' a , A > ( bytes : & A ) -> usize where A : AsBytes < ' a > ;
483
485
/// The number of `u8` bytes required to encode `bytes`.
484
486
///
485
487
/// This method should always be eight times `Self::length_in_words`, and is provided for convenience and clarity.
486
- fn length_in_bytes < ' a , I > ( bytes : I ) -> usize where I : Iterator < Item = ( u64 , & ' a [ u8 ] ) > { 8 * Self :: length_in_words ( bytes) }
488
+ fn length_in_bytes < ' a , A > ( bytes : & A ) -> usize where A : AsBytes < ' a > { 8 * Self :: length_in_words ( bytes) }
487
489
/// Encodes `bytes` as a sequence of `u64` words written into `store`.
488
- fn encode < ' a , I > ( store : & mut Vec < u64 > , bytes : I ) where I : Iterator < Item = ( u64 , & ' a [ u8 ] ) > ;
490
+ fn encode < ' a , A > ( store : & mut Vec < u64 > , bytes : & A ) where A : AsBytes < ' a > ;
489
491
/// Writes `bytes` in the encoded format to an arbitrary writer, returning an `std::io::Result`.
490
- fn write < ' a , I , W : std:: io:: Write > ( writer : W , bytes : I ) -> std:: io:: Result < ( ) > where I : Iterator < Item = ( u64 , & ' a [ u8 ] ) > ;
492
+ fn write < ' a , A , W : std:: io:: Write > ( writer : W , bytes : & A ) -> std:: io:: Result < ( ) > where A : AsBytes < ' a > ;
491
493
/// Decodes the byte slices held in a sequence of `u64` words; the returned slices borrow from `store`.
492
494
fn decode < ' a > ( store : & ' a [ u64 ] ) -> impl Iterator < Item =& ' a [ u8 ] > ;
493
495
}
@@ -499,18 +501,20 @@ pub mod bytes {
499
501
pub use serialization:: Sequence ;
500
502
mod serialization {
501
503
504
+ use crate :: AsBytes ;
505
+
502
506
/// Encodes and decodes byte sequences, by prepending the length of each sequence and appending all of the sequences.
503
507
pub struct Sequence ;
504
508
impl super :: EncodeDecode for Sequence {
505
- fn length_in_words < ' a , I > ( bytes : I ) -> usize where I : Iterator < Item = ( u64 , & ' a [ u8 ] ) > {
509
+ fn length_in_words < ' a , A > ( bytes : & A ) -> usize where A : AsBytes < ' a > {
506
510
// Each byte slice has one `u64` for the length, and then as many `u64`s as needed to hold all bytes.
507
- bytes. map ( |( _align, bytes) | 1 + ( bytes. len ( ) + 7 ) /8 ) . sum ( )
511
+ bytes. as_bytes ( ) . map ( |( _align, bytes) | 1 + ( bytes. len ( ) + 7 ) /8 ) . sum ( )
508
512
}
509
- fn encode < ' a , I > ( store : & mut Vec < u64 > , bytes : I ) where I : Iterator < Item = ( u64 , & ' a [ u8 ] ) > {
510
- encode ( store, bytes)
513
+ fn encode < ' a , A > ( store : & mut Vec < u64 > , bytes : & A ) where A : AsBytes < ' a > {
514
+ encode ( store, bytes. as_bytes ( ) )
511
515
}
512
- fn write < ' a , I , W : std:: io:: Write > ( writer : W , bytes : I ) -> std:: io:: Result < ( ) > where I : Iterator < Item = ( u64 , & ' a [ u8 ] ) > {
513
- write ( writer, bytes)
516
+ fn write < ' a , A , W : std:: io:: Write > ( writer : W , bytes : & A ) -> std:: io:: Result < ( ) > where A : AsBytes < ' a > {
517
+ write ( writer, bytes. as_bytes ( ) )
514
518
}
515
519
fn decode < ' a > ( store : & ' a [ u64 ] ) -> impl Iterator < Item =& ' a [ u8 ] > {
516
520
decode ( store)
@@ -598,6 +602,166 @@ pub mod bytes {
598
602
}
599
603
}
600
604
605
+ /// A binary encoding of sequences of byte slices.
606
+ ///
607
+ /// The encoding starts with a sequence of n+1 offsets describing where to find the n slices in the bytes that follow.
608
+ /// Treating the offsets as a byte slice too, each offset indicates the location (in bytes) of the end of its slice.
609
+ /// Each byte slice can be found from a pair of adjacent offsets, where the first is rounded up to a multiple of eight.
610
+ pub use serialization_neu:: Indexed ;
611
+ pub mod serialization_neu {
612
+
613
+ use crate :: AsBytes ;
614
+
615
+ /// Encodes and decodes bytes sequences, using an index of offsets.
616
+ pub struct Indexed ;
617
+ impl super :: EncodeDecode for Indexed {
618
+ fn length_in_words < ' a , A > ( bytes : & A ) -> usize where A : AsBytes < ' a > {
619
+ 1 + bytes. as_bytes ( ) . map ( |( _align, bytes) | 1 + ( bytes. len ( ) + 7 ) /8 ) . sum :: < usize > ( )
620
+ }
621
+ fn encode < ' a , A > ( store : & mut Vec < u64 > , bytes : & A ) where A : AsBytes < ' a > {
622
+ encode ( store, bytes)
623
+ }
624
+ fn write < ' a , A , W : std:: io:: Write > ( writer : W , bytes : & A ) -> std:: io:: Result < ( ) > where A : AsBytes < ' a > {
625
+ write ( writer, bytes)
626
+ }
627
+ fn decode < ' a > ( store : & ' a [ u64 ] ) -> impl Iterator < Item =& ' a [ u8 ] > {
628
+ decode ( store)
629
+ }
630
+ }
631
+
632
+ /// Encodes `item` into `u64` aligned words.
633
+ ///
634
+ /// The sequence of byte slices are appended, with padding to have each slice start `u64` aligned.
635
+ /// The sequence is then pre-pended with as many byte offsets as there are slices in `item`, plus one.
636
+ /// The byte offsets indicate where each slice ends, and by rounding up to `u64` alignemnt where the next slice begins.
637
+ /// The first offset indicates where the list of offsets itself ends, and where the first slice begins.
638
+ ///
639
+ /// We will need to visit `as_bytes` three times to extract this information, so the method should be efficient and inlined.
640
+ /// The first read writes the first offset, the second writes each other offset, and the third writes the bytes themselves.
641
+ ///
642
+ /// The offsets are zero-based, rather than based on `store.len()`.
643
+ /// If you call the method with a non-empty `store` be careful decoding.
644
+ pub fn encode < ' a , A > ( store : & mut Vec < u64 > , iter : & A )
645
+ where A : AsBytes < ' a > ,
646
+ {
647
+ // Read 1: Number of offsets we will record, equal to the number of slices plus one.
648
+ // TODO: right-size `store` before first call to `push`.
649
+ let offsets = 1 + iter. as_bytes ( ) . count ( ) ;
650
+ let offsets_end: u64 = TryInto :: < u64 > :: try_into ( ( offsets) * std:: mem:: size_of :: < u64 > ( ) ) . unwrap ( ) ;
651
+ store. push ( offsets_end) ;
652
+ // Read 2: Establish each of the offsets based on lengths of byte slices.
653
+ let mut position_bytes = offsets_end;
654
+ for ( align, bytes) in iter. as_bytes ( ) {
655
+ assert ! ( align <= 8 ) ;
656
+ // Write length in bytes, but round up to words before updating `position_bytes`.
657
+ let to_push: u64 = position_bytes + TryInto :: < u64 > :: try_into ( bytes. len ( ) ) . unwrap ( ) ;
658
+ store. push ( to_push) ;
659
+ let round_len: u64 = ( ( bytes. len ( ) + 7 ) & !7 ) . try_into ( ) . unwrap ( ) ;
660
+ position_bytes += round_len;
661
+ }
662
+ // Read 3: Append each byte slice, with padding to align starts to `u64`.
663
+ for ( _align, bytes) in iter. as_bytes ( ) {
664
+ let whole_words = 8 * ( bytes. len ( ) / 8 ) ;
665
+ // We want to extend `store` by `bytes`, but `bytes` may not be `u64` aligned.
666
+ // In the latter case, init `store` and cast and copy onto it as a byte slice.
667
+ if let Ok ( words) = bytemuck:: try_cast_slice ( & bytes[ .. whole_words] ) {
668
+ store. extend_from_slice ( words) ;
669
+ }
670
+ else {
671
+ let store_len = store. len ( ) ;
672
+ store. resize ( store_len + whole_words/8 , 0 ) ;
673
+ let slice = bytemuck:: try_cast_slice_mut ( & mut store[ store_len..] ) . expect ( "&[u64] should convert to &[u8]" ) ;
674
+ slice. copy_from_slice ( & bytes[ .. whole_words] ) ;
675
+ }
676
+ let remaining_bytes = & bytes[ whole_words..] ;
677
+ if !remaining_bytes. is_empty ( ) {
678
+ let mut remainder = 0u64 ;
679
+ let transmute: & mut [ u8 ] = bytemuck:: try_cast_slice_mut ( std:: slice:: from_mut ( & mut remainder) ) . expect ( "&[u64] should convert to &[u8]" ) ;
680
+ for ( i, byte) in remaining_bytes. iter ( ) . enumerate ( ) {
681
+ transmute[ i] = * byte;
682
+ }
683
+ store. push ( remainder) ;
684
+ }
685
+ }
686
+ }
687
+
688
+ pub fn write < ' a , A , W > ( mut writer : W , iter : & A ) -> std:: io:: Result < ( ) >
689
+ where
690
+ A : AsBytes < ' a > ,
691
+ W : std:: io:: Write ,
692
+ {
693
+ // Read 1: Number of offsets we will record, equal to the number of slices plus one.
694
+ let offsets = 1 + iter. as_bytes ( ) . count ( ) ;
695
+ let offsets_end: u64 = TryInto :: < u64 > :: try_into ( ( offsets) * std:: mem:: size_of :: < u64 > ( ) ) . unwrap ( ) ;
696
+ writer. write_all ( bytemuck:: cast_slice ( std:: slice:: from_ref ( & offsets_end) ) ) ?;
697
+ // Read 2: Establish each of the offsets based on lengths of byte slices.
698
+ let mut position_bytes = offsets_end;
699
+ for ( align, bytes) in iter. as_bytes ( ) {
700
+ assert ! ( align <= 8 ) ;
701
+ // Write length in bytes, but round up to words before updating `position_bytes`.
702
+ let to_push: u64 = position_bytes + TryInto :: < u64 > :: try_into ( bytes. len ( ) ) . unwrap ( ) ;
703
+ writer. write_all ( bytemuck:: cast_slice ( std:: slice:: from_ref ( & to_push) ) ) ?;
704
+ let round_len: u64 = ( ( bytes. len ( ) + 7 ) & !7 ) . try_into ( ) . unwrap ( ) ;
705
+ position_bytes += round_len;
706
+ }
707
+ // Read 3: Append each byte slice, with padding to align starts to `u64`.
708
+ for ( _align, bytes) in iter. as_bytes ( ) {
709
+ writer. write_all ( bytes) ?;
710
+ let padding = ( ( bytes. len ( ) + 7 ) & !7 ) - bytes. len ( ) ;
711
+ if padding > 0 {
712
+ writer. write_all ( & [ 0u8 ; 8 ] [ ..padding] ) ?;
713
+ }
714
+ }
715
+
716
+ Ok ( ( ) )
717
+ }
718
+
719
+ /// Decodes an encoded sequence of byte slices. Each result will be `u64` aligned.
720
+ pub fn decode ( store : & [ u64 ] ) -> impl Iterator < Item =& [ u8 ] > {
721
+ assert ! ( store[ 0 ] % 8 == 0 ) ;
722
+ let slices = ( store[ 0 ] / 8 ) - 1 ;
723
+ ( 0 .. slices) . map ( |i| decode_index ( store, i) )
724
+ }
725
+
726
+ /// Decodes a specific byte slice by index. It will be `u64` aligned.
727
+ #[ inline( always) ]
728
+ pub fn decode_index ( store : & [ u64 ] , index : u64 ) -> & [ u8 ] {
729
+ debug_assert ! ( index + 1 < store[ 0 ] /8 ) ;
730
+ let index: usize = index. try_into ( ) . unwrap ( ) ;
731
+ let lower: usize = ( ( store[ index] + 7 ) & !7 ) . try_into ( ) . unwrap ( ) ;
732
+ let upper: usize = ( store[ index + 1 ] ) . try_into ( ) . unwrap ( ) ;
733
+ let bytes: & [ u8 ] = bytemuck:: try_cast_slice ( store) . expect ( "&[u64] should convert to &[u8]" ) ;
734
+ & bytes[ lower .. upper]
735
+ }
736
+
737
#[cfg(test)]
mod test {

    use crate::{Columnar, Container};
    use crate::common::Push;
    use crate::AsBytes;

    use super::{encode, decode};

    /// Encodes `item` into a fresh store and checks that decoding yields the same byte slices, in order.
    fn assert_roundtrip<'a, AB: AsBytes<'a>>(item: &AB) {
        let mut store = Vec::new();
        encode(&mut store, item);
        // Alignment is not part of the decoded output, so compare only the byte slices.
        let expected = item.as_bytes().map(|(_align, bytes)| bytes);
        assert!(expected.eq(decode(&store)));
    }

    /// Round-trips a column that alternates `Ok` integers with `Err` strings.
    #[test]
    fn round_trip() {
        let mut column: <Result<u64, String> as Columnar>::Container = Default::default();
        for i in 0..10000u64 {
            let ok: Result<u64, String> = Ok(i);
            let err: Result<u64, String> = Err(format!("{:?}", i));
            column.push(&ok);
            column.push(&err);
        }

        assert_roundtrip(&column.borrow());
    }
}
764
+ }
601
765
602
766
#[ cfg( test) ]
603
767
mod test {
@@ -635,7 +799,6 @@ pub mod bytes {
635
799
}
636
800
}
637
801
}
638
-
639
802
}
640
803
641
804
/// Types that prefer to be represented by `Vec<T>`.
0 commit comments