Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve ISAAC performance (take 2) #45

Merged
merged 8 commits into from
Nov 11, 2017
77 changes: 77 additions & 0 deletions rand_core/src/impls.rs
Original file line number Diff line number Diff line change
@@ -21,6 +21,7 @@

use core::intrinsics::transmute;
use core::slice;
use core::cmp::min;
use Rng;

/// Implement `next_u64` via `next_u32`, little-endian order.
@@ -93,6 +94,82 @@ macro_rules! impl_uint_from_fill {
});
}

// Copy as many bytes as possible from the word slice `$src` into the byte
// slice `$dest`, storing each consumed word in little-endian byte order.
//
// `$N` is the size in bytes of one element of `$src`. The macro evaluates to
// `(words_consumed, bytes_filled)`; a partially copied trailing word counts
// as consumed.
macro_rules! fill_via_chunks {
    ($src:expr, $dest:expr, $N:expr) => ({
        // The copy is bounded by whichever buffer is shorter, in bytes.
        let byte_len = min($src.len() * $N, $dest.len());
        // Words touched by the copy, rounding a partial word up.
        let word_len = (byte_len + $N - 1) / $N;

        // Put every consumed word into little-endian order in place. This is
        // a no-op on little-endian targets and mutates `$src` on big-endian.
        for word in $src[..word_len].iter_mut() {
            *word = word.to_le();
        }

        // SAFETY: the pointer comes from a live slice and the byte view spans
        // `len * $N` bytes, which is exactly the slice's memory provided `$N`
        // equals the element size (as with the `u32`/4 and `u64`/8 callers).
        // `u8` has alignment 1 and any bit pattern is a valid `u8`.
        let src_bytes = unsafe {
            slice::from_raw_parts($src.as_ptr() as *const u8, $src.len() * $N)
        };

        $dest[..byte_len].copy_from_slice(&src_bytes[..byte_len]);

        (word_len, byte_len)
    });
}

/// Implement `fill_bytes` by reading chunks from the output buffer of a block
/// based RNG.
///
/// The return values are `(consumed_u32, filled_u8)`.
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure these names are sufficiently clear without explanation.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will try to write some better documentation.

///
/// `filled_u8` counts the bytes written to the front of `dest`; it can be
/// smaller than `dest.len()` when `src` does not hold enough words.
/// `consumed_u32` counts the words taken from the front of `src`, which is
/// `filled_u8 / 4` rounded up.
///
/// On big-endian systems this mutates the output buffer `src`: the words in
/// `src[0..consumed_u32]` are converted to little-endian before being copied.
///
/// # Example
/// (from `IsaacRng`)
///
/// ```rust,ignore
/// fn fill_bytes(&mut self, dest: &mut [u8]) {
///     let mut read_len = 0;
///     while read_len < dest.len() {
///         if self.index >= self.rsl.len() {
///             self.isaac();
///         }
///
///         let (consumed_u32, filled_u8) =
///             impls::fill_via_u32_chunks(&mut self.rsl[self.index..],
///                                        &mut dest[read_len..]);
///
///         self.index += consumed_u32;
///         read_len += filled_u8;
///     }
/// }
/// ```
pub fn fill_via_u32_chunks(src: &mut [u32], dest: &mut [u8]) -> (usize, usize) {
    // Each `u32` word contributes 4 bytes.
    let (consumed_u32, filled_u8) = fill_via_chunks!(src, dest, 4);
    (consumed_u32, filled_u8)
}

/// Implement `fill_bytes` by copying chunks out of the `u64` output buffer of
/// a block based RNG.
///
/// Returns the pair `(consumed_u64, filled_u8)`:
///
/// - `filled_u8` counts the bytes written to the front of `dest`; it can be
///   smaller than `dest.len()` when `src` does not hold enough words.
/// - `consumed_u64` counts the words taken from the front of `src`, which is
///   `filled_u8 / 8` rounded up.
///
/// On big-endian systems this mutates the output buffer `src`: the words in
/// `src[0..consumed_u64]` are converted to little-endian before being copied.
///
/// See `fill_via_u32_chunks` for an example.
pub fn fill_via_u64_chunks(src: &mut [u64], dest: &mut [u8]) -> (usize, usize) {
    // Each `u64` word contributes 8 bytes.
    let (consumed_u64, filled_u8) = fill_via_chunks!(src, dest, 8);
    (consumed_u64, filled_u8)
}

/// Implement `next_u32` via `fill_bytes`, little-endian order.
pub fn next_u32_via_fill<R: Rng+?Sized>(rng: &mut R) -> u32 {
impl_uint_from_fill!(rng, u32, 4)
136 changes: 59 additions & 77 deletions src/prng/chacha.rs
Original file line number Diff line number Diff line change
@@ -10,13 +10,10 @@

//! The ChaCha random number generator.

use core::num::Wrapping as w;
use core::fmt;
use rand_core::impls;
use {Rng, CryptoRng, SeedFromRng, SeedableRng, Error};

#[allow(bad_style)]
type w32 = w<u32>;

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure that removing this is actually a win... I mean, now you have .wrapping_add in a few places and can't just think "I know this algorithm uses wrapping arithmetic everywhere".

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the commit message for ChaCha I added this note:

This also replaces core::num::Wrapping with a few wrapping_add's.
There were about 30 conversions to and from Wrapping, while there are only
9 wrapping operations.

Because fill_via_u32_chunks expects a [u32], converting away was just
easier.

I agree that being able to think "I know this algorithm uses wrapping arithmetic everywhere" is an advantage. Not all operations are available on wrapping types though, like rotate_*. You can maybe consider this to be a bug in the standard library.

While working with ISAAC, XorShift* and PCG, it happened too many times that I had to ask myself whether I was working with the wrapped or the normal type, and whether an operation was available.

const KEY_WORDS : usize = 8; // 8 words for the 256-bit key
const STATE_WORDS : usize = 16;
const CHACHA_ROUNDS: u32 = 20; // Cryptographically secure from 8 upwards as of this writing
@@ -32,9 +29,9 @@ const CHACHA_ROUNDS: u32 = 20; // Cryptographically secure from 8 upwards as of
/// Salsa20*](http://cr.yp.to/chacha.html)
#[derive(Clone)]
pub struct ChaChaRng {
buffer: [w32; STATE_WORDS], // Internal buffer of output
state: [w32; STATE_WORDS], // Initial state
index: usize, // Index into state
buffer: [u32; STATE_WORDS], // Internal buffer of output
state: [u32; STATE_WORDS], // Initial state
index: usize, // Index into state
}

// Custom Debug implementation that does not expose the internal state
@@ -46,10 +43,10 @@ impl fmt::Debug for ChaChaRng {

macro_rules! quarter_round{
($a: expr, $b: expr, $c: expr, $d: expr) => {{
$a = $a + $b; $d = $d ^ $a; $d = w($d.0.rotate_left(16));
$c = $c + $d; $b = $b ^ $c; $b = w($b.0.rotate_left(12));
$a = $a + $b; $d = $d ^ $a; $d = w($d.0.rotate_left( 8));
$c = $c + $d; $b = $b ^ $c; $b = w($b.0.rotate_left( 7));
$a = $a.wrapping_add($b); $d ^= $a; $d = $d.rotate_left(16);
$c = $c.wrapping_add($d); $b ^= $c; $b = $b.rotate_left(12);
$a = $a.wrapping_add($b); $d ^= $a; $d = $d.rotate_left( 8);
$c = $c.wrapping_add($d); $b ^= $c; $b = $b.rotate_left( 7);
}}
}

@@ -69,15 +66,15 @@ macro_rules! double_round{
}

#[inline]
fn core(output: &mut [w32; STATE_WORDS], input: &[w32; STATE_WORDS]) {
*output = *input;
fn core(new: &mut [u32; STATE_WORDS], input: &[u32; STATE_WORDS]) {
*new = *input;

for _ in 0..CHACHA_ROUNDS / 2 {
double_round!(output);
double_round!(new);
}

for i in 0..STATE_WORDS {
output[i] = output[i] + input[i];
new[i] = new[i].wrapping_add(input[i]);
}
}

@@ -104,8 +101,8 @@ impl ChaChaRng {
/// - 2419978656
pub fn new_unseeded() -> ChaChaRng {
let mut rng = ChaChaRng {
buffer: [w(0); STATE_WORDS],
state: [w(0); STATE_WORDS],
buffer: [0; STATE_WORDS],
state: [0; STATE_WORDS],
index: STATE_WORDS
};
rng.init(&[0; KEY_WORDS]);
@@ -133,10 +130,10 @@ impl ChaChaRng {
/// println!("{:?}", ra.next_u32());
/// ```
pub fn set_counter(&mut self, counter_low: u64, counter_high: u64) {
self.state[12] = w((counter_low >> 0) as u32);
self.state[13] = w((counter_low >> 32) as u32);
self.state[14] = w((counter_high >> 0) as u32);
self.state[15] = w((counter_high >> 32) as u32);
self.state[12] = (counter_low >> 0) as u32;
self.state[13] = (counter_low >> 32) as u32;
self.state[14] = (counter_high >> 0) as u32;
self.state[15] = (counter_high >> 32) as u32;
self.index = STATE_WORDS; // force recomputation
}

@@ -159,19 +156,19 @@ impl ChaChaRng {
/// [1]: Daniel J. Bernstein. [*Extending the Salsa20
/// nonce.*](http://cr.yp.to/papers.html#xsalsa)
fn init(&mut self, key: &[u32; KEY_WORDS]) {
self.state[0] = w(0x61707865);
self.state[1] = w(0x3320646E);
self.state[2] = w(0x79622D32);
self.state[3] = w(0x6B206574);
self.state[0] = 0x61707865;
self.state[1] = 0x3320646E;
self.state[2] = 0x79622D32;
self.state[3] = 0x6B206574;

for i in 0..KEY_WORDS {
self.state[4+i] = w(key[i]);
self.state[4+i] = key[i];
}

self.state[12] = w(0);
self.state[13] = w(0);
self.state[14] = w(0);
self.state[15] = w(0);
self.state[12] = 0;
self.state[13] = 0;
self.state[14] = 0;
self.state[15] = 0;

self.index = STATE_WORDS;
}
@@ -181,69 +178,54 @@ impl ChaChaRng {
core(&mut self.buffer, &self.state);
self.index = 0;
// update 128-bit counter
self.state[12] = self.state[12] + w(1);
if self.state[12] != w(0) { return };
self.state[13] = self.state[13] + w(1);
if self.state[13] != w(0) { return };
self.state[14] = self.state[14] + w(1);
if self.state[14] != w(0) { return };
self.state[15] = self.state[15] + w(1);
self.state[12] = self.state[12].wrapping_add(1);
if self.state[12] != 0 { return };
self.state[13] = self.state[13].wrapping_add(1);
if self.state[13] != 0 { return };
self.state[14] = self.state[14].wrapping_add(1);
if self.state[14] != 0 { return };
self.state[15] = self.state[15].wrapping_add(1);
}
}

impl Rng for ChaChaRng {
#[inline]
fn next_u32(&mut self) -> u32 {
if self.index == STATE_WORDS {
// Using a local variable for `index`, and checking the size avoids a
// bounds check later on.
let mut index = self.index as usize;
if index >= STATE_WORDS {
self.update();
index = 0;
}

let value = self.buffer[self.index % STATE_WORDS];
let value = self.buffer[index];
self.index += 1;
value.0
value
}

fn next_u64(&mut self) -> u64 {
::rand_core::impls::next_u64_via_u32(self)
impls::next_u64_via_u32(self)
}

#[cfg(feature = "i128_support")]
fn next_u128(&mut self) -> u128 {
::rand_core::impls::next_u128_via_u64(self)
impls::next_u128_via_u64(self)
}

// Custom implementation allowing larger reads from buffer is about 8%
// faster than default implementation in my tests

fn fill_bytes(&mut self, dest: &mut [u8]) {
use core::cmp::min;
use core::intrinsics::{transmute, copy_nonoverlapping};

let mut left = dest;
while left.len() >= 4 {
if self.index == STATE_WORDS {
let mut read_len = 0;
while read_len < dest.len() {
if self.index >= self.buffer.len() {
self.update();
}

let words = min(left.len() / 4, STATE_WORDS - self.index);
let (l, r) = {left}.split_at_mut(4 * words);
left = r;

// convert to LE:
for ref mut x in self.buffer[self.index..self.index+words].iter_mut() {
**x = w((*x).0.to_le());
}

unsafe{ copy_nonoverlapping(
&self.buffer[self.index].0 as *const u32 as *const u8,
l.as_mut_ptr(),
4 * words) };
self.index += words;
}
let n = left.len();
if n > 0 {
let chunk: [u8; 4] = unsafe {
transmute(self.next_u32().to_le())
};
left.copy_from_slice(&chunk[..n]);

let (consumed_u32, filled_u8) =
impls::fill_via_u32_chunks(&mut self.buffer[self.index..],
&mut dest[read_len..]);

self.index += consumed_u32;
read_len += filled_u8;
}
}

@@ -271,16 +253,16 @@ impl<'a> SeedableRng<&'a [u32]> for ChaChaRng {
/// words are used, the remaining are set to zero.
fn from_seed(seed: &'a [u32]) -> ChaChaRng {
let mut rng = ChaChaRng {
buffer: [w(0); STATE_WORDS],
state: [w(0); STATE_WORDS],
buffer: [0; STATE_WORDS],
state: [0; STATE_WORDS],
index: STATE_WORDS
};
rng.init(&[0u32; KEY_WORDS]);
// set key in place
{
let key = &mut rng.state[4 .. 4+KEY_WORDS];
for (k, s) in key.iter_mut().zip(seed.iter()) {
*k = w(*s);
*k = *s;
}
}
rng
Loading