Skip to content

Commit d91bd07

Browse files
Merge pull request #30 from frankmcsherry/encode_neu
Improve byte slice encoding
2 parents 5471b54 + 6f0899b commit d91bd07

File tree

3 files changed

+181
-18
lines changed

3 files changed

+181
-18
lines changed

benches/bench.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ fn _bench_copy<T: Columnar+Eq>(bencher: &mut Bencher, record: T) where T::Contai
5050
for _ in 0 .. 1024 {
5151
arena.push(&record);
5252
}
53-
use columnar::{AsBytes, Container};
54-
bencher.bytes = Sequence::length_in_bytes(arena.borrow().as_bytes()) as u64;
53+
use columnar::Container;
54+
bencher.bytes = Sequence::length_in_bytes(&arena.borrow()) as u64;
5555
arena.clear();
5656

5757
bencher.iter(|| {

benches/serde.rs

+5-5
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use bencher::{benchmark_group, benchmark_main, Bencher};
2-
use columnar::{Columnar, Container, Clear, AsBytes, FromBytes};
2+
use columnar::{Columnar, Container, Clear, FromBytes};
33
use columnar::bytes::{EncodeDecode, Sequence};
44
use serde::{Serialize, Deserialize};
55

@@ -19,7 +19,7 @@ fn goser_push(b: &mut Bencher) {
1919
container.push(&log);
2020
}
2121
let mut words = vec![];
22-
Sequence::encode(&mut words, container.borrow().as_bytes());
22+
Sequence::encode(&mut words, &container.borrow());
2323
b.bytes = 8 * words.len() as u64;
2424
b.iter(|| {
2525
container.clear();
@@ -50,11 +50,11 @@ fn goser_encode(b: &mut Bencher) {
5050
container.push(&log);
5151
}
5252
let mut words = vec![];
53-
Sequence::encode(&mut words, container.borrow().as_bytes());
53+
Sequence::encode(&mut words, &container.borrow());
5454
b.bytes = 8 * words.len() as u64;
5555
b.iter(|| {
5656
words.clear();
57-
Sequence::encode(&mut words, container.borrow().as_bytes());
57+
Sequence::encode(&mut words, &container.borrow());
5858
bencher::black_box(&words);
5959
});
6060
}
@@ -67,7 +67,7 @@ fn goser_decode(b: &mut Bencher) {
6767
for _ in 0..1024 {
6868
container.push(&log);
6969
}
70-
Sequence::encode(&mut words, container.borrow().as_bytes());
70+
Sequence::encode(&mut words, &container.borrow());
7171
b.bytes = 8 * words.len() as u64;
7272
b.iter(|| {
7373
let mut slices = Sequence::decode(&mut words);

src/lib.rs

+174-11
Original file line numberDiff line numberDiff line change
@@ -476,18 +476,20 @@ pub mod common {
476476
/// The methods here line up with the `AsBytes` and `FromBytes` traits.
477477
pub mod bytes {
478478

479+
use crate::AsBytes;
480+
479481
/// A coupled encode/decode pair for byte sequences.
480482
pub trait EncodeDecode {
481483
/// Encoded length in number of `u64` words required.
482-
fn length_in_words<'a, I>(bytes: I) -> usize where I : Iterator<Item=(u64, &'a [u8])>;
484+
fn length_in_words<'a, A>(bytes: &A) -> usize where A : AsBytes<'a>;
483485
/// Encoded length in number of `u8` bytes required.
484486
///
485487
/// This method should always be eight times `Self::length_in_words`, and is provided for convenience and clarity.
486-
fn length_in_bytes<'a, I>(bytes: I) -> usize where I : Iterator<Item=(u64, &'a [u8])> { 8 * Self::length_in_words(bytes) }
488+
fn length_in_bytes<'a, A>(bytes: &A) -> usize where A : AsBytes<'a> { 8 * Self::length_in_words(bytes) }
487489
/// Encodes `bytes` into a sequence of `u64`.
488-
fn encode<'a, I>(store: &mut Vec<u64>, bytes: I) where I : Iterator<Item=(u64, &'a [u8])>;
490+
fn encode<'a, A>(store: &mut Vec<u64>, bytes: &A) where A : AsBytes<'a>;
489491
/// Writes `bytes` in the encoded format to an arbitrary writer.
490-
fn write<'a, I, W: std::io::Write>(writer: W, bytes: I) -> std::io::Result<()> where I : Iterator<Item=(u64, &'a [u8])>;
492+
fn write<'a, A, W: std::io::Write>(writer: W, bytes: &A) -> std::io::Result<()> where A : AsBytes<'a>;
491493
/// Decodes bytes from a sequence of `u64`.
492494
fn decode<'a>(store: &'a [u64]) -> impl Iterator<Item=&'a [u8]>;
493495
}
@@ -499,18 +501,20 @@ pub mod bytes {
499501
pub use serialization::Sequence;
500502
mod serialization {
501503

504+
use crate::AsBytes;
505+
502506
/// Encodes and decodes byte sequences, by prepending the length and appending all the sequences.
503507
pub struct Sequence;
504508
impl super::EncodeDecode for Sequence {
505-
fn length_in_words<'a, I>(bytes: I) -> usize where I : Iterator<Item=(u64, &'a [u8])> {
509+
fn length_in_words<'a, A>(bytes: &A) -> usize where A : AsBytes<'a> {
506510
// Each byte slice has one `u64` for the length, and then as many `u64`s as needed to hold all bytes.
507-
bytes.map(|(_align, bytes)| 1 + (bytes.len() + 7)/8).sum()
511+
bytes.as_bytes().map(|(_align, bytes)| 1 + (bytes.len() + 7)/8).sum()
508512
}
509-
fn encode<'a, I>(store: &mut Vec<u64>, bytes: I) where I : Iterator<Item=(u64, &'a [u8])> {
510-
encode(store, bytes)
513+
fn encode<'a, A>(store: &mut Vec<u64>, bytes: &A) where A : AsBytes<'a> {
514+
encode(store, bytes.as_bytes())
511515
}
512-
fn write<'a, I, W: std::io::Write>(writer: W, bytes: I) -> std::io::Result<()> where I : Iterator<Item=(u64, &'a [u8])> {
513-
write(writer, bytes)
516+
fn write<'a, A, W: std::io::Write>(writer: W, bytes: &A) -> std::io::Result<()> where A : AsBytes<'a> {
517+
write(writer, bytes.as_bytes())
514518
}
515519
fn decode<'a>(store: &'a [u64]) -> impl Iterator<Item=&'a [u8]> {
516520
decode(store)
@@ -598,6 +602,166 @@ pub mod bytes {
598602
}
599603
}
600604

605+
/// A binary encoding of sequences of byte slices.
606+
///
607+
/// The encoding starts with a sequence of n+1 offsets describing where to find the n slices in the bytes that follow.
608+
/// Treating the offsets as a byte slice too, each offset indicates the location (in bytes) of the end of its slice.
609+
/// Each byte slice can be found from a pair of adjacent offsets, where the first is rounded up to a multiple of eight.
610+
pub use serialization_neu::Indexed;
611+
pub mod serialization_neu {
612+
613+
use crate::AsBytes;
614+
615+
/// Encodes and decodes byte sequences, using an index of offsets.
616+
pub struct Indexed;
617+
impl super::EncodeDecode for Indexed {
618+
fn length_in_words<'a, A>(bytes: &A) -> usize where A : AsBytes<'a> {
619+
1 + bytes.as_bytes().map(|(_align, bytes)| 1 + (bytes.len() + 7)/8).sum::<usize>()
620+
}
621+
fn encode<'a, A>(store: &mut Vec<u64>, bytes: &A) where A : AsBytes<'a> {
622+
encode(store, bytes)
623+
}
624+
fn write<'a, A, W: std::io::Write>(writer: W, bytes: &A) -> std::io::Result<()> where A : AsBytes<'a> {
625+
write(writer, bytes)
626+
}
627+
fn decode<'a>(store: &'a [u64]) -> impl Iterator<Item=&'a [u8]> {
628+
decode(store)
629+
}
630+
}
631+
632+
/// Encodes `iter` into `u64` aligned words.
633+
///
634+
/// The byte slices are appended in sequence, with padding so that each slice starts `u64` aligned.
635+
/// The sequence is then prepended with as many byte offsets as there are slices in `iter`, plus one.
636+
/// The byte offsets indicate where each slice ends and, by rounding up to `u64` alignment, where the next slice begins.
637+
/// The first offset indicates where the list of offsets itself ends, and where the first slice begins.
638+
///
639+
/// We will need to visit `as_bytes` three times to extract this information, so the method should be efficient and inlined.
640+
/// The first read writes the first offset, the second writes each other offset, and the third writes the bytes themselves.
641+
///
642+
/// The offsets are zero-based, rather than based on `store.len()`.
643+
/// If you call the method with a non-empty `store` be careful decoding.
644+
pub fn encode<'a, A>(store: &mut Vec<u64>, iter: &A)
645+
where A : AsBytes<'a>,
646+
{
647+
// Read 1: Number of offsets we will record, equal to the number of slices plus one.
648+
// TODO: right-size `store` before first call to `push`.
649+
let offsets = 1 + iter.as_bytes().count();
650+
let offsets_end: u64 = TryInto::<u64>::try_into((offsets) * std::mem::size_of::<u64>()).unwrap();
651+
store.push(offsets_end);
652+
// Read 2: Establish each of the offsets based on lengths of byte slices.
653+
let mut position_bytes = offsets_end;
654+
for (align, bytes) in iter.as_bytes() {
655+
assert!(align <= 8);
656+
// Write the slice's end offset in bytes, but round its length up to whole words before advancing `position_bytes`.
657+
let to_push: u64 = position_bytes + TryInto::<u64>::try_into(bytes.len()).unwrap();
658+
store.push(to_push);
659+
let round_len: u64 = ((bytes.len() + 7) & !7).try_into().unwrap();
660+
position_bytes += round_len;
661+
}
662+
// Read 3: Append each byte slice, with padding to align starts to `u64`.
663+
for (_align, bytes) in iter.as_bytes() {
664+
let whole_words = 8 * (bytes.len() / 8);
665+
// We want to extend `store` by `bytes`, but `bytes` may not be `u64` aligned.
666+
// In the latter case, init `store` and cast and copy onto it as a byte slice.
667+
if let Ok(words) = bytemuck::try_cast_slice(&bytes[.. whole_words]) {
668+
store.extend_from_slice(words);
669+
}
670+
else {
671+
let store_len = store.len();
672+
store.resize(store_len + whole_words/8, 0);
673+
let slice = bytemuck::try_cast_slice_mut(&mut store[store_len..]).expect("&[u64] should convert to &[u8]");
674+
slice.copy_from_slice(&bytes[.. whole_words]);
675+
}
676+
let remaining_bytes = &bytes[whole_words..];
677+
if !remaining_bytes.is_empty() {
678+
let mut remainder = 0u64;
679+
let transmute: &mut [u8] = bytemuck::try_cast_slice_mut(std::slice::from_mut(&mut remainder)).expect("&[u64] should convert to &[u8]");
680+
for (i, byte) in remaining_bytes.iter().enumerate() {
681+
transmute[i] = *byte;
682+
}
683+
store.push(remainder);
684+
}
685+
}
686+
}
687+
688+
pub fn write<'a, A, W>(mut writer: W, iter: &A) -> std::io::Result<()>
689+
where
690+
A: AsBytes<'a>,
691+
W: std::io::Write,
692+
{
693+
// Read 1: Number of offsets we will record, equal to the number of slices plus one.
694+
let offsets = 1 + iter.as_bytes().count();
695+
let offsets_end: u64 = TryInto::<u64>::try_into((offsets) * std::mem::size_of::<u64>()).unwrap();
696+
writer.write_all(bytemuck::cast_slice(std::slice::from_ref(&offsets_end)))?;
697+
// Read 2: Establish each of the offsets based on lengths of byte slices.
698+
let mut position_bytes = offsets_end;
699+
for (align, bytes) in iter.as_bytes() {
700+
assert!(align <= 8);
701+
// Write the slice's end offset in bytes, but round its length up to whole words before advancing `position_bytes`.
702+
let to_push: u64 = position_bytes + TryInto::<u64>::try_into(bytes.len()).unwrap();
703+
writer.write_all(bytemuck::cast_slice(std::slice::from_ref(&to_push)))?;
704+
let round_len: u64 = ((bytes.len() + 7) & !7).try_into().unwrap();
705+
position_bytes += round_len;
706+
}
707+
// Read 3: Append each byte slice, with padding to align starts to `u64`.
708+
for (_align, bytes) in iter.as_bytes() {
709+
writer.write_all(bytes)?;
710+
let padding = ((bytes.len() + 7) & !7) - bytes.len();
711+
if padding > 0 {
712+
writer.write_all(&[0u8;8][..padding])?;
713+
}
714+
}
715+
716+
Ok(())
717+
}
718+
719+
/// Decodes an encoded sequence of byte slices. Each result will be `u64` aligned.
720+
pub fn decode(store: &[u64]) -> impl Iterator<Item=&[u8]> {
721+
assert!(store[0] % 8 == 0);
722+
let slices = (store[0] / 8) - 1;
723+
(0 .. slices).map(|i| decode_index(store, i))
724+
}
725+
726+
/// Decodes a specific byte slice by index. It will be `u64` aligned.
727+
#[inline(always)]
728+
pub fn decode_index(store: &[u64], index: u64) -> &[u8] {
729+
debug_assert!(index + 1 < store[0]/8);
730+
let index: usize = index.try_into().unwrap();
731+
let lower: usize = ((store[index] + 7) & !7).try_into().unwrap();
732+
let upper: usize = (store[index + 1]).try_into().unwrap();
733+
let bytes: &[u8] = bytemuck::try_cast_slice(store).expect("&[u64] should convert to &[u8]");
734+
&bytes[lower .. upper]
735+
}
736+
737+
#[cfg(test)]
738+
mod test {
739+
740+
use crate::{Columnar, Container};
741+
use crate::common::Push;
742+
use crate::AsBytes;
743+
744+
use super::{encode, decode};
745+
746+
fn assert_roundtrip<'a, AB: AsBytes<'a>>(item: &AB) {
747+
let mut store = Vec::new();
748+
encode(&mut store, item);
749+
assert!(item.as_bytes().map(|x| x.1).eq(decode(&store)));
750+
}
751+
752+
#[test]
753+
fn round_trip() {
754+
755+
let mut column: <Result<u64, String> as Columnar>::Container = Default::default();
756+
for i in 0..10000u64 {
757+
column.push(&Ok::<u64, String>(i));
758+
column.push(&Err::<u64, String>(format!("{:?}", i)));
759+
}
760+
761+
assert_roundtrip(&column.borrow());
762+
}
763+
}
764+
}
601765

602766
#[cfg(test)]
603767
mod test {
@@ -635,7 +799,6 @@ pub mod bytes {
635799
}
636800
}
637801
}
638-
639802
}
640803

641804
/// Types that prefer to be represented by `Vec<T>`.

0 commit comments

Comments
 (0)