dhardy · dhardy · Nov 11, 2017 · Nov 10, 2017 · Nov 11, 2017 · Nov 11, 2017
diff --git a/rand_core/src/impls.rs b/rand_core/src/impls.rs
@@ -21,6 +21,7 @@
 
 use core::intrinsics::transmute;
 use core::slice;
+use core::cmp::min;
 use Rng;
 
 /// Implement `next_u64` via `next_u32`, little-endian order.
@@ -93,6 +94,82 @@ macro_rules! impl_uint_from_fill {
     });
 }
 
+macro_rules! fill_via_chunks {
+    ($src:expr, $dest:expr, $N:expr) => ({
+        let chunk_size_u8 = min($src.len() * $N, $dest.len());
+        let chunk_size = (chunk_size_u8 + $N - 1) / $N;
+        // `chunk_size_u8`: bytes to copy; `chunk_size`: the same in `$N`-byte words, rounded up.
+        // Convert the words about to be copied to little-endian, in place:
+        for ref mut x in $src[0..chunk_size].iter_mut() {
+            **x = (*x).to_le();
+        }
+        // SAFETY: requires `$N` == byte size of one `$src` element (4 for u32, 8 for u64 below).
+        let bytes = unsafe { slice::from_raw_parts($src.as_ptr() as *const u8,
+                                                   $src.len() * $N) };
+
+        let dest_chunk = &mut $dest[0..chunk_size_u8];
+        dest_chunk.copy_from_slice(&bytes[0..chunk_size_u8]);
+        // Result: (words consumed from `$src`, bytes written to `$dest`).
+        (chunk_size, chunk_size_u8)
+    });
+}
+
+/// Implement `fill_bytes` by reading chunks from the output buffer of a
+/// block-based RNG.
+///
+/// The return values are `(consumed_u32, filled_u8)`.
+///
+/// `filled_u8` is the number of bytes written to `dest`, which may be less than
+/// the length of `dest` when `src` holds too few words to fill it.
+/// `consumed_u32` is the number of words read from `src`: `filled_u8 / 4`
+/// rounded up, so a partially copied last word counts as fully consumed.
+///
+/// Note that on big-endian systems values in the output buffer `src` are
+/// mutated: `src[0..consumed_u32]` is converted to little-endian before
+/// copying.
+///
+/// # Example
+/// (from `IsaacRng`)
+///
+/// ```rust,ignore
+/// fn fill_bytes(&mut self, dest: &mut [u8]) {
+///     let mut read_len = 0;
+///     while read_len < dest.len() {
+///         if self.index >= self.rsl.len() {
+///             self.isaac();
+///         }
+///
+///         let (consumed_u32, filled_u8) =
+///             impls::fill_via_u32_chunks(&mut self.rsl[self.index..],
+///                                        &mut dest[read_len..]);
+///
+///         self.index += consumed_u32;
+///         read_len += filled_u8;
+///     }
+/// }
+/// ```
+pub fn fill_via_u32_chunks(src: &mut [u32], dest: &mut [u8]) -> (usize, usize) {
+    fill_via_chunks!(src, dest, 4)
+}
+
+/// Implement `fill_bytes` from the `u64` output buffer of a block-based RNG.
+///
+/// The return values are `(consumed_u64, filled_u8)`.
+///
+/// `filled_u8` is the number of bytes written to `dest`, which may be less than
+/// the length of `dest` when `src` holds too few words to fill it.
+/// `consumed_u64` is the number of words read from `src`: `filled_u8 / 8`
+/// rounded up, so a partially copied last word counts as fully consumed.
+///
+/// Note that on big-endian systems values in the output buffer `src` are
+/// mutated: `src[0..consumed_u64]` is converted to little-endian before
+/// copying.
+///
+/// See `fill_via_u32_chunks` for an example.
+pub fn fill_via_u64_chunks(src: &mut [u64], dest: &mut [u8]) -> (usize, usize) {
+    fill_via_chunks!(src, dest, 8)
+}
+
 /// Implement `next_u32` via `fill_bytes`, little-endian order.
 pub fn next_u32_via_fill<R: Rng+?Sized>(rng: &mut R) -> u32 {
     impl_uint_from_fill!(rng, u32, 4)

diff --git a/src/prng/chacha.rs b/src/prng/chacha.rs
@@ -10,13 +10,10 @@
 
 //! The ChaCha random number generator.
 
-use core::num::Wrapping as w;
 use core::fmt;
+use rand_core::impls;
 use {Rng, CryptoRng, SeedFromRng, SeedableRng, Error};
 
-#[allow(bad_style)]
-type w32 = w<u32>;
-
 const KEY_WORDS    : usize =  8; // 8 words for the 256-bit key
 const STATE_WORDS  : usize = 16;
 const CHACHA_ROUNDS: u32 = 20; // Cryptographically secure from 8 upwards as of this writing
@@ -32,9 +29,9 @@ const CHACHA_ROUNDS: u32 = 20; // Cryptographically secure from 8 upwards as of
 /// Salsa20*](http://cr.yp.to/chacha.html)
 #[derive(Clone)]
 pub struct ChaChaRng {
-    buffer:  [w32; STATE_WORDS], // Internal buffer of output
-    state:   [w32; STATE_WORDS], // Initial state
-    index:   usize,                 // Index into state
+    buffer:  [u32; STATE_WORDS], // Internal buffer of output
+    state:   [u32; STATE_WORDS], // Input block: constants, key, counter
+    index:   usize,              // Next unread word in `buffer`
 }
 
 // Custom Debug implementation that does not expose the internal state
@@ -46,10 +43,10 @@ impl fmt::Debug for ChaChaRng {
 
 macro_rules! quarter_round{
     ($a: expr, $b: expr, $c: expr, $d: expr) => {{
-        $a = $a + $b; $d = $d ^ $a; $d = w($d.0.rotate_left(16));
-        $c = $c + $d; $b = $b ^ $c; $b = w($b.0.rotate_left(12));
-        $a = $a + $b; $d = $d ^ $a; $d = w($d.0.rotate_left( 8));
-        $c = $c + $d; $b = $b ^ $c; $b = w($b.0.rotate_left( 7));
+        $a = $a.wrapping_add($b); $d ^= $a; $d = $d.rotate_left(16);
+        $c = $c.wrapping_add($d); $b ^= $c; $b = $b.rotate_left(12);
+        $a = $a.wrapping_add($b); $d ^= $a; $d = $d.rotate_left( 8);
+        $c = $c.wrapping_add($d); $b ^= $c; $b = $b.rotate_left( 7);
     }}
 }
 
@@ -69,15 +66,15 @@ macro_rules! double_round{
 }
 
 #[inline]
-fn core(output: &mut [w32; STATE_WORDS], input: &[w32; STATE_WORDS]) {
-    *output = *input;
+fn core(new: &mut [u32; STATE_WORDS], input: &[u32; STATE_WORDS]) {
+    *new = *input;
 
     for _ in 0..CHACHA_ROUNDS / 2 {
-        double_round!(output);
+        double_round!(new);
     }
 
     for i in 0..STATE_WORDS {
-        output[i] = output[i] + input[i];
+        new[i] = new[i].wrapping_add(input[i]);
     }
 }
 
@@ -104,8 +101,8 @@ impl ChaChaRng {
     /// - 2419978656
     pub fn new_unseeded() -> ChaChaRng {
         let mut rng = ChaChaRng {
-            buffer:  [w(0); STATE_WORDS],
-            state:   [w(0); STATE_WORDS],
+            buffer:  [0; STATE_WORDS],
+            state:   [0; STATE_WORDS],
             index:   STATE_WORDS
         };
         rng.init(&[0; KEY_WORDS]);
@@ -133,10 +130,10 @@ impl ChaChaRng {
     /// println!("{:?}", ra.next_u32());
     /// ```
     pub fn set_counter(&mut self, counter_low: u64, counter_high: u64) {
-        self.state[12] = w((counter_low >>  0) as u32);
-        self.state[13] = w((counter_low >> 32) as u32);
-        self.state[14] = w((counter_high >>  0) as u32);
-        self.state[15] = w((counter_high >> 32) as u32);
+        self.state[12] = (counter_low >>  0) as u32;
+        self.state[13] = (counter_low >> 32) as u32;
+        self.state[14] = (counter_high >>  0) as u32;
+        self.state[15] = (counter_high >> 32) as u32;
         self.index = STATE_WORDS; // force recomputation
     }
 
@@ -159,19 +156,19 @@ impl ChaChaRng {
     /// [1]: Daniel J. Bernstein. [*Extending the Salsa20
     /// nonce.*](http://cr.yp.to/papers.html#xsalsa)
     fn init(&mut self, key: &[u32; KEY_WORDS]) {
-        self.state[0] = w(0x61707865);
-        self.state[1] = w(0x3320646E);
-        self.state[2] = w(0x79622D32);
-        self.state[3] = w(0x6B206574);
+        self.state[0] = 0x61707865; // "expa" (ASCII, bytes in little-endian order)
+        self.state[1] = 0x3320646E; // "nd 3"
+        self.state[2] = 0x79622D32; // "2-by"
+        self.state[3] = 0x6B206574; // "te k"
 
         for i in 0..KEY_WORDS {
-            self.state[4+i] = w(key[i]);
+            self.state[4+i] = key[i]; // words 4..=11 hold the key
         }
 
-        self.state[12] = w(0);
-        self.state[13] = w(0);
-        self.state[14] = w(0);
-        self.state[15] = w(0);
+        self.state[12] = 0; // words 12..=15: 128-bit counter, starts at zero
+        self.state[13] = 0;
+        self.state[14] = 0;
+        self.state[15] = 0;
 
         self.index = STATE_WORDS;
     }
@@ -181,69 +178,54 @@ impl ChaChaRng {
         core(&mut self.buffer, &self.state);
         self.index = 0;
         // update 128-bit counter
-        self.state[12] = self.state[12] + w(1);
-        if self.state[12] != w(0) { return };
-        self.state[13] = self.state[13] + w(1);
-        if self.state[13] != w(0) { return };
-        self.state[14] = self.state[14] + w(1);
-        if self.state[14] != w(0) { return };
-        self.state[15] = self.state[15] + w(1);
+        self.state[12] = self.state[12].wrapping_add(1);
+        if self.state[12] != 0 { return };
+        self.state[13] = self.state[13].wrapping_add(1);
+        if self.state[13] != 0 { return };
+        self.state[14] = self.state[14].wrapping_add(1);
+        if self.state[14] != 0 { return };
+        self.state[15] = self.state[15].wrapping_add(1);
     }
 }
 
 impl Rng for ChaChaRng {
     #[inline]
     fn next_u32(&mut self) -> u32 {
-        if self.index == STATE_WORDS {
+        // Using a local variable for `index`, and checking it with `>=`, avoids a
+        // bounds check on `buffer`. NOTE(review): `as usize` is a no-op cast here.
+        let mut index = self.index as usize;
+        if index >= STATE_WORDS {
             self.update();
+            index = 0;
         }
 
-        let value = self.buffer[self.index % STATE_WORDS];
+        let value = self.buffer[index];
         self.index += 1;
-        value.0
+        value
     }
-    
+
     fn next_u64(&mut self) -> u64 {
-        ::rand_core::impls::next_u64_via_u32(self)
+        impls::next_u64_via_u32(self)
     }
+
     #[cfg(feature = "i128_support")]
     fn next_u128(&mut self) -> u128 {
-        ::rand_core::impls::next_u128_via_u64(self)
+        impls::next_u128_via_u64(self)
     }
-
-    // Custom implementation allowing larger reads from buffer is about 8%
-    // faster than default implementation in my tests
+
     fn fill_bytes(&mut self, dest: &mut [u8]) {
-        use core::cmp::min;
-        use core::intrinsics::{transmute, copy_nonoverlapping};
-
-        let mut left = dest;
-        while left.len() >= 4 {
-            if self.index == STATE_WORDS {
+        let mut read_len = 0;
+        while read_len < dest.len() {
+            if self.index >= self.buffer.len() {
                 self.update();
             }
-
-            let words = min(left.len() / 4, STATE_WORDS - self.index);
-            let (l, r) = {left}.split_at_mut(4 * words);
-            left = r;
-
-            // convert to LE:
-            for ref mut x in self.buffer[self.index..self.index+words].iter_mut() {
-                **x = w((*x).0.to_le());
-            }
-
-            unsafe{ copy_nonoverlapping(
-                &self.buffer[self.index].0 as *const u32 as *const u8,
-                l.as_mut_ptr(),
-                4 * words) };
-            self.index += words;
-        }
-        let n = left.len();
-        if n > 0 {
-            let chunk: [u8; 4] = unsafe {
-                transmute(self.next_u32().to_le())
-            };
-            left.copy_from_slice(&chunk[..n]);
+            // Copy buffered words into the unfilled tail of `dest`, as many as fit.
+            let (consumed_u32, filled_u8) =
+                impls::fill_via_u32_chunks(&mut self.buffer[self.index..],
+                                           &mut dest[read_len..]);
+            // Advance both cursors by what the helper reports it used.
+            self.index += consumed_u32;
+            read_len += filled_u8;
         }
     }
 
@@ -271,16 +253,16 @@ impl<'a> SeedableRng<&'a [u32]> for ChaChaRng {
     /// words are used, the remaining are set to zero.
     fn from_seed(seed: &'a [u32]) -> ChaChaRng {
         let mut rng = ChaChaRng {
-            buffer:  [w(0); STATE_WORDS],
-            state:   [w(0); STATE_WORDS],
+            buffer:  [0; STATE_WORDS],
+            state:   [0; STATE_WORDS],
             index:   STATE_WORDS
         };
         rng.init(&[0u32; KEY_WORDS]);
         // set key in place
         {
             let key = &mut rng.state[4 .. 4+KEY_WORDS];
             for (k, s) in key.iter_mut().zip(seed.iter()) {
-                *k = w(*s);
+                *k = *s;
             }
         }
         rng