Skip to content

Commit fe822c0

Browse files
authored
Merge pull request #45 from pitdicker/isaac_optim
Improve ISAAC performance (take 2)
2 parents 701679c + 69d940f commit fe822c0

File tree

4 files changed

+262
-150
lines changed

4 files changed

+262
-150
lines changed

rand_core/src/impls.rs

+77
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
2222
use core::intrinsics::transmute;
2323
use core::slice;
24+
use core::cmp::min;
2425
use Rng;
2526

2627
/// Implement `next_u64` via `next_u32`, little-endian order.
@@ -93,6 +94,82 @@ macro_rules! impl_uint_from_fill {
9394
});
9495
}
9596

97+
/// Copy bytes from a buffer of integer words into a byte slice, in
/// little-endian order.
///
/// - `$src`: mutable slice of integer words (the RNG output buffer).
/// - `$dest`: mutable byte slice to fill.
/// - `$N`: size in bytes of one word of `$src`.
///
/// Evaluates to `(consumed_words, filled_bytes)`, where `filled_bytes` is
/// `min($src.len() * $N, $dest.len())` and `consumed_words` is
/// `filled_bytes / $N` rounded up.
///
/// The first `consumed_words` words of `$src` are converted to little-endian
/// in place before copying.
macro_rules! fill_via_chunks {
    ($src:expr, $dest:expr, $N:expr) => ({
        // Bytes to copy: limited by whichever of the two buffers is smaller.
        let chunk_size_u8 = min($src.len() * $N, $dest.len());
        // Words touched by the copy; a partly copied word is rounded up.
        let chunk_size = (chunk_size_u8 + $N - 1) / $N;

        // Convert to little-endian (`to_le` is a no-op on little-endian
        // targets), so the byte view below has the same content everywhere.
        for x in $src[..chunk_size].iter_mut() {
            *x = x.to_le();
        }

        // SAFETY: sound only if `$N` equals the size in bytes of the element
        // type of `$src` (the callers in this file pass `u32`/4 and `u64`/8).
        // The byte view then covers exactly the memory owned by `$src`; `u8`
        // has alignment 1 and every bit pattern is a valid `u8`.
        let bytes = unsafe {
            slice::from_raw_parts($src.as_ptr() as *const u8, $src.len() * $N)
        };

        $dest[..chunk_size_u8].copy_from_slice(&bytes[..chunk_size_u8]);

        (chunk_size, chunk_size_u8)
    });
}
116+
117+
/// Implement `fill_bytes` by reading chunks from the output buffer of a block
118+
/// based RNG.
119+
///
120+
/// The return values are `(consumed_u32, filled_u8)`.
121+
///
122+
/// `filled_u8` is the number of filled bytes in `dest`, which may be less than
123+
/// the length of `dest`.
124+
/// `consumed_u32` is the number of words consumed from `src`, which is the same
125+
/// as `filled_u8 / 4` rounded up.
126+
///
127+
/// Note that on big-endian systems values in the output buffer `src` are
128+
/// mutated. `src[0..consumed_u32]` get converted to little-endian before
129+
/// copying.
130+
///
131+
/// # Example
132+
/// (from `IsaacRng`)
133+
///
134+
/// ```rust,ignore
135+
/// fn fill_bytes(&mut self, dest: &mut [u8]) {
136+
/// let mut read_len = 0;
137+
/// while read_len < dest.len() {
138+
/// if self.index >= self.rsl.len() {
139+
/// self.isaac();
140+
/// }
141+
///
142+
/// let (consumed_u32, filled_u8) =
143+
/// impls::fill_via_u32_chunks(&mut self.rsl[self.index..],
144+
/// &mut dest[read_len..]);
145+
///
146+
/// self.index += consumed_u32;
147+
/// read_len += filled_u8;
148+
/// }
149+
/// }
150+
/// ```
151+
pub fn fill_via_u32_chunks(src: &mut [u32], dest: &mut [u8]) -> (usize, usize) {
152+
fill_via_chunks!(src, dest, 4)
153+
}
154+
155+
/// Implement `fill_bytes` by reading chunks from the output buffer of a block
156+
/// based RNG.
157+
///
158+
/// The return values are `(consumed_u64, filled_u8)`.
159+
/// `filled_u8` is the number of filled bytes in `dest`, which may be less than
160+
/// the length of `dest`.
161+
/// `consumed_u64` is the number of words consumed from `src`, which is the same
162+
/// as `filled_u8 / 8` rounded up.
163+
///
164+
/// Note that on big-endian systems values in the output buffer `src` are
165+
/// mutated. `src[0..consumed_u64]` get converted to little-endian before
166+
/// copying.
167+
///
168+
/// See `fill_via_u32_chunks` for an example.
169+
pub fn fill_via_u64_chunks(src: &mut [u64], dest: &mut [u8]) -> (usize, usize) {
170+
fill_via_chunks!(src, dest, 8)
171+
}
172+
96173
/// Implement `next_u32` via `fill_bytes`, little-endian order.
97174
pub fn next_u32_via_fill<R: Rng+?Sized>(rng: &mut R) -> u32 {
98175
impl_uint_from_fill!(rng, u32, 4)

src/prng/chacha.rs

+59-77
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,10 @@
1010

1111
//! The ChaCha random number generator.
1212
13-
use core::num::Wrapping as w;
1413
use core::fmt;
14+
use rand_core::impls;
1515
use {Rng, CryptoRng, SeedFromRng, SeedableRng, Error};
1616

17-
#[allow(bad_style)]
18-
type w32 = w<u32>;
19-
2017
const KEY_WORDS : usize = 8; // 8 words for the 256-bit key
2118
const STATE_WORDS : usize = 16;
2219
const CHACHA_ROUNDS: u32 = 20; // Cryptographically secure from 8 upwards as of this writing
@@ -32,9 +29,9 @@ const CHACHA_ROUNDS: u32 = 20; // Cryptographically secure from 8 upwards as of
3229
/// Salsa20*](http://cr.yp.to/chacha.html)
3330
#[derive(Clone)]
3431
pub struct ChaChaRng {
35-
buffer: [w32; STATE_WORDS], // Internal buffer of output
36-
state: [w32; STATE_WORDS], // Initial state
37-
index: usize, // Index into state
32+
buffer: [u32; STATE_WORDS], // Internal buffer of output
33+
state: [u32; STATE_WORDS], // Initial state
34+
index: usize, // Index into buffer
3835
}
3936

4037
// Custom Debug implementation that does not expose the internal state
@@ -46,10 +43,10 @@ impl fmt::Debug for ChaChaRng {
4643

4744
macro_rules! quarter_round{
4845
($a: expr, $b: expr, $c: expr, $d: expr) => {{
49-
$a = $a + $b; $d = $d ^ $a; $d = w($d.0.rotate_left(16));
50-
$c = $c + $d; $b = $b ^ $c; $b = w($b.0.rotate_left(12));
51-
$a = $a + $b; $d = $d ^ $a; $d = w($d.0.rotate_left( 8));
52-
$c = $c + $d; $b = $b ^ $c; $b = w($b.0.rotate_left( 7));
46+
$a = $a.wrapping_add($b); $d ^= $a; $d = $d.rotate_left(16);
47+
$c = $c.wrapping_add($d); $b ^= $c; $b = $b.rotate_left(12);
48+
$a = $a.wrapping_add($b); $d ^= $a; $d = $d.rotate_left( 8);
49+
$c = $c.wrapping_add($d); $b ^= $c; $b = $b.rotate_left( 7);
5350
}}
5451
}
5552

@@ -69,15 +66,15 @@ macro_rules! double_round{
6966
}
7067

7168
#[inline]
72-
fn core(output: &mut [w32; STATE_WORDS], input: &[w32; STATE_WORDS]) {
73-
*output = *input;
69+
fn core(new: &mut [u32; STATE_WORDS], input: &[u32; STATE_WORDS]) {
70+
*new = *input;
7471

7572
for _ in 0..CHACHA_ROUNDS / 2 {
76-
double_round!(output);
73+
double_round!(new);
7774
}
7875

7976
for i in 0..STATE_WORDS {
80-
output[i] = output[i] + input[i];
77+
new[i] = new[i].wrapping_add(input[i]);
8178
}
8279
}
8380

@@ -104,8 +101,8 @@ impl ChaChaRng {
104101
/// - 2419978656
105102
pub fn new_unseeded() -> ChaChaRng {
106103
let mut rng = ChaChaRng {
107-
buffer: [w(0); STATE_WORDS],
108-
state: [w(0); STATE_WORDS],
104+
buffer: [0; STATE_WORDS],
105+
state: [0; STATE_WORDS],
109106
index: STATE_WORDS
110107
};
111108
rng.init(&[0; KEY_WORDS]);
@@ -133,10 +130,10 @@ impl ChaChaRng {
133130
/// println!("{:?}", ra.next_u32());
134131
/// ```
135132
pub fn set_counter(&mut self, counter_low: u64, counter_high: u64) {
136-
self.state[12] = w((counter_low >> 0) as u32);
137-
self.state[13] = w((counter_low >> 32) as u32);
138-
self.state[14] = w((counter_high >> 0) as u32);
139-
self.state[15] = w((counter_high >> 32) as u32);
133+
self.state[12] = (counter_low >> 0) as u32;
134+
self.state[13] = (counter_low >> 32) as u32;
135+
self.state[14] = (counter_high >> 0) as u32;
136+
self.state[15] = (counter_high >> 32) as u32;
140137
self.index = STATE_WORDS; // force recomputation
141138
}
142139

@@ -159,19 +156,19 @@ impl ChaChaRng {
159156
/// [1]: Daniel J. Bernstein. [*Extending the Salsa20
160157
/// nonce.*](http://cr.yp.to/papers.html#xsalsa)
161158
fn init(&mut self, key: &[u32; KEY_WORDS]) {
162-
self.state[0] = w(0x61707865);
163-
self.state[1] = w(0x3320646E);
164-
self.state[2] = w(0x79622D32);
165-
self.state[3] = w(0x6B206574);
159+
self.state[0] = 0x61707865;
160+
self.state[1] = 0x3320646E;
161+
self.state[2] = 0x79622D32;
162+
self.state[3] = 0x6B206574;
166163

167164
for i in 0..KEY_WORDS {
168-
self.state[4+i] = w(key[i]);
165+
self.state[4+i] = key[i];
169166
}
170167

171-
self.state[12] = w(0);
172-
self.state[13] = w(0);
173-
self.state[14] = w(0);
174-
self.state[15] = w(0);
168+
self.state[12] = 0;
169+
self.state[13] = 0;
170+
self.state[14] = 0;
171+
self.state[15] = 0;
175172

176173
self.index = STATE_WORDS;
177174
}
@@ -181,69 +178,54 @@ impl ChaChaRng {
181178
core(&mut self.buffer, &self.state);
182179
self.index = 0;
183180
// update 128-bit counter
184-
self.state[12] = self.state[12] + w(1);
185-
if self.state[12] != w(0) { return };
186-
self.state[13] = self.state[13] + w(1);
187-
if self.state[13] != w(0) { return };
188-
self.state[14] = self.state[14] + w(1);
189-
if self.state[14] != w(0) { return };
190-
self.state[15] = self.state[15] + w(1);
181+
self.state[12] = self.state[12].wrapping_add(1);
182+
if self.state[12] != 0 { return };
183+
self.state[13] = self.state[13].wrapping_add(1);
184+
if self.state[13] != 0 { return };
185+
self.state[14] = self.state[14].wrapping_add(1);
186+
if self.state[14] != 0 { return };
187+
self.state[15] = self.state[15].wrapping_add(1);
191188
}
192189
}
193190

194191
impl Rng for ChaChaRng {
195192
#[inline]
196193
fn next_u32(&mut self) -> u32 {
197-
if self.index == STATE_WORDS {
194+
// Using a local variable for `index` and checking it against the buffer
195+
// size avoids a bounds check later on.
196+
let mut index = self.index as usize;
197+
if index >= STATE_WORDS {
198198
self.update();
199+
index = 0;
199200
}
200201

201-
let value = self.buffer[self.index % STATE_WORDS];
202+
let value = self.buffer[index];
202203
self.index += 1;
203-
value.0
204+
value
204205
}
205-
206+
206207
fn next_u64(&mut self) -> u64 {
207-
::rand_core::impls::next_u64_via_u32(self)
208+
impls::next_u64_via_u32(self)
208209
}
210+
209211
#[cfg(feature = "i128_support")]
210212
fn next_u128(&mut self) -> u128 {
211-
::rand_core::impls::next_u128_via_u64(self)
213+
impls::next_u128_via_u64(self)
212214
}
213-
214-
// Custom implementation allowing larger reads from buffer is about 8%
215-
// faster than default implementation in my tests
215+
216216
fn fill_bytes(&mut self, dest: &mut [u8]) {
217-
use core::cmp::min;
218-
use core::intrinsics::{transmute, copy_nonoverlapping};
219-
220-
let mut left = dest;
221-
while left.len() >= 4 {
222-
if self.index == STATE_WORDS {
217+
let mut read_len = 0;
218+
while read_len < dest.len() {
219+
if self.index >= self.buffer.len() {
223220
self.update();
224221
}
225-
226-
let words = min(left.len() / 4, STATE_WORDS - self.index);
227-
let (l, r) = {left}.split_at_mut(4 * words);
228-
left = r;
229-
230-
// convert to LE:
231-
for ref mut x in self.buffer[self.index..self.index+words].iter_mut() {
232-
**x = w((*x).0.to_le());
233-
}
234-
235-
unsafe{ copy_nonoverlapping(
236-
&self.buffer[self.index].0 as *const u32 as *const u8,
237-
l.as_mut_ptr(),
238-
4 * words) };
239-
self.index += words;
240-
}
241-
let n = left.len();
242-
if n > 0 {
243-
let chunk: [u8; 4] = unsafe {
244-
transmute(self.next_u32().to_le())
245-
};
246-
left.copy_from_slice(&chunk[..n]);
222+
223+
let (consumed_u32, filled_u8) =
224+
impls::fill_via_u32_chunks(&mut self.buffer[self.index..],
225+
&mut dest[read_len..]);
226+
227+
self.index += consumed_u32;
228+
read_len += filled_u8;
247229
}
248230
}
249231

@@ -271,16 +253,16 @@ impl<'a> SeedableRng<&'a [u32]> for ChaChaRng {
271253
/// words are used, the remaining are set to zero.
272254
fn from_seed(seed: &'a [u32]) -> ChaChaRng {
273255
let mut rng = ChaChaRng {
274-
buffer: [w(0); STATE_WORDS],
275-
state: [w(0); STATE_WORDS],
256+
buffer: [0; STATE_WORDS],
257+
state: [0; STATE_WORDS],
276258
index: STATE_WORDS
277259
};
278260
rng.init(&[0u32; KEY_WORDS]);
279261
// set key in place
280262
{
281263
let key = &mut rng.state[4 .. 4+KEY_WORDS];
282264
for (k, s) in key.iter_mut().zip(seed.iter()) {
283-
*k = w(*s);
265+
*k = *s;
284266
}
285267
}
286268
rng

0 commit comments

Comments
 (0)