rust-random
diff --git a/benches/generators.rs b/benches/generators.rs
+28 -4
diff --git a/rand-core/src/impls.rs b/rand-core/src/impls.rs
+173 -2
diff --git a/rand-core/src/lib.rs b/rand-core/src/lib.rs
+59 -13
@@ -12,6 +12,8 @@ use test::{black_box, Bencher};
 use rand::{RngCore, Rng, SeedableRng, NewRng, StdRng, OsRng, JitterRng, EntropyRng};
 use rand::{XorShiftRng, Hc128Rng, IsaacRng, Isaac64Rng, ChaChaRng};
 use rand::reseeding::ReseedingRng;
+use rand::prng::hc128::Hc128Core;
+use rand::thread_rng;
 
 macro_rules! gen_bytes {
     ($fnn:ident, $gen:expr) => {
@@ -150,10 +152,13 @@ chacha_rounds!(gen_bytes_chacha12, gen_u32_chacha12, gen_u64_chacha12, 12);
 chacha_rounds!(gen_bytes_chacha20, gen_u32_chacha20, gen_u64_chacha20, 20);
 
 
+// Threshold handed to `ReseedingRng::new` by the reseeding benchmarks below:
+// something high enough to get deterministic measurements.
+const RESEEDING_THRESHOLD: u64 = 1024*1024*1024;
+
 #[bench]
 fn reseeding_hc128_bytes(b: &mut Bencher) {
-    let mut rng = ReseedingRng::new(Hc128Rng::new(),
-                                    128*1024*1024,
+    let mut rng = ReseedingRng::new(Hc128Core::new(),
+                                    RESEEDING_THRESHOLD,
                                     EntropyRng::new());
     let mut buf = [0u8; BYTES_LEN];
     b.iter(|| {
@@ -169,8 +174,8 @@ macro_rules! reseeding_uint {
     ($fnn:ident, $ty:ty) => {
         #[bench]
         fn $fnn(b: &mut Bencher) {
-            let mut rng = ReseedingRng::new(Hc128Rng::new(),
-                                            128*1024*1024,
+            let mut rng = ReseedingRng::new(Hc128Core::new(),
+                                            RESEEDING_THRESHOLD,
                                             EntropyRng::new());
             b.iter(|| {
                 for _ in 0..RAND_BENCH_N {
@@ -184,3 +189,22 @@ macro_rules! reseeding_uint {
 
 reseeding_uint!(reseeding_hc128_u32, u32);
 reseeding_uint!(reseeding_hc128_u64, u64);
+
+
+// Defines a benchmark function `$fnn` that draws `RAND_BENCH_N` values of
+// type `$ty` per iteration from the generator returned by `thread_rng()`.
+macro_rules! threadrng_uint {
+    ($fnn:ident, $ty:ty) => {
+        #[bench]
+        fn $fnn(b: &mut Bencher) {
+            let mut rng = thread_rng();
+            b.iter(|| {
+                for _ in 0..RAND_BENCH_N {
+                    // `black_box` keeps the generated value from being
+                    // optimised away.
+                    black_box(rng.gen::<$ty>());
+                }
+            });
+            // Bytes produced per iteration, used for throughput reporting.
+            b.bytes = size_of::<$ty>() as u64 * RAND_BENCH_N;
+        }
+    }
+}
+
+threadrng_uint!(thread_rng_u32, u32);
+threadrng_uint!(thread_rng_u64, u64);
@@ -19,12 +19,13 @@
 //! to/from byte sequences, and since its purpose is reproducibility,
 //! non-reproducible sources (e.g. `OsRng`) need not bother with it.
 
+use core::convert::AsRef;
 use core::intrinsics::transmute;
 use core::ptr::copy_nonoverlapping;
-use core::slice;
+use core::{fmt, slice};
 use core::cmp::min;
 use core::mem::size_of;
-use RngCore;
+use {RngCore, BlockRngCore, CryptoRng, SeedableRng, Error};
 
 /// Implement `next_u64` via `next_u32`, little-endian order.
 pub fn next_u64_via_u32<R: RngCore + ?Sized>(rng: &mut R) -> u64 {
@@ -164,4 +165,174 @@ pub fn next_u64_via_fill<R: RngCore + ?Sized>(rng: &mut R) -> u64 {
     impl_uint_from_fill!(rng, u64, 8)
 }
 
+/// Wrapper around PRNGs that implement [`BlockRngCore`] to keep a results
+/// buffer and offer the methods from [`RngCore`].
+///
+/// `BlockRng` has heavily optimized implementations of the [`RngCore`] methods
+/// reading values from the results buffer, as well as
+/// calling `BlockRngCore::generate` directly on the output array when
+/// `fill_bytes` / `try_fill_bytes` is called on a large array. These methods
+/// also handle the bookkeeping of when to generate a new batch of values.
+/// No generated values are ever thrown away.
+///
+/// Currently `BlockRng` only implements `RngCore` for buffers which are slices
+/// of `u32` elements; this may be extended to other types in the future.
+///
+/// For easy initialization `BlockRng` also implements [`SeedableRng`].
+///
+/// [`BlockRngCore`]: ../BlockRngCore.t.html
+/// [`RngCore`]: ../RngCore.t.html
+/// [`SeedableRng`]: ../SeedableRng.t.html
+#[derive(Clone)]
+pub struct BlockRng<R: BlockRngCore + ?Sized> {
+    /// Buffer holding the most recently generated block of values.
+    pub results: R::Results,
+    /// Position in `results` of the next value to hand out. A value at or
+    /// past the end of the buffer means a new block is generated before the
+    /// next read.
+    pub index: usize,
+    /// The wrapped block generator used to refill `results`.
+    pub core: R,
+}
+
+// Hand-written `Debug`: the buffered output in `results` is never printed,
+// only its length is reported.
+impl<R: BlockRngCore + fmt::Debug> fmt::Debug for BlockRng<R> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let mut out = f.debug_struct("BlockRng");
+        out.field("core", &self.core);
+        out.field("result_len", &self.results.as_ref().len());
+        out.field("index", &self.index);
+        out.finish()
+    }
+}
+
+// `RngCore` for block generators that produce `u32` words. Values are served
+// from the `results` buffer, which is refilled through
+// `BlockRngCore::generate` whenever `index` has reached its end.
+impl<R: BlockRngCore<Item=u32>> RngCore for BlockRng<R>
+where <R as BlockRngCore>::Results: AsRef<[u32]>
+{
+    /// Return the next buffered `u32`, generating a new block first when the
+    /// buffer is exhausted.
+    #[inline(always)]
+    fn next_u32(&mut self) -> u32 {
+        if self.index >= self.results.as_ref().len() {
+            self.core.generate(&mut self.results);
+            self.index = 0;
+        }
+
+        let value = self.results.as_ref()[self.index];
+        self.index += 1;
+        value
+    }
+
+    /// Return a `u64` assembled from two consecutive `u32` words, the first
+    /// word being the low half.
+    ///
+    /// When exactly one word is left in the buffer it becomes the low half
+    /// and the first word of the freshly generated block becomes the high
+    /// half.
+    #[inline(always)]
+    fn next_u64(&mut self) -> u64 {
+        let read_u64 = |results: &[u32], index: usize| {
+            if cfg!(any(target_arch = "x86", target_arch = "x86_64")) {
+                // Slicing bounds-checks both words before the raw read.
+                let pair = results[index..index + 2].as_ptr() as *const u64;
+                // SAFETY: `pair` points at two in-bounds, initialised `u32`s,
+                // i.e. 8 readable bytes. A `u32` slice is only guaranteed to
+                // be 4-byte aligned, so the read has to be an explicitly
+                // unaligned one; dereferencing the `*const u64` directly
+                // would be undefined behaviour. On these little-endian
+                // targets the first word ends up in the low half.
+                unsafe { ::core::ptr::read_unaligned(pair) }
+            } else {
+                let x = results[index] as u64;
+                let y = results[index + 1] as u64;
+                (y << 32) | x
+            }
+        };
+
+        let len = self.results.as_ref().len();
+
+        let index = self.index;
+        if index < len-1 {
+            // At least two words left: read both from the current block.
+            self.index += 2;
+            read_u64(self.results.as_ref(), index)
+        } else if index >= len {
+            // Buffer exhausted: refill and read the first two words.
+            self.core.generate(&mut self.results);
+            self.index = 2;
+            read_u64(self.results.as_ref(), 0)
+        } else {
+            // Exactly one word left: combine it with the first word of the
+            // next block so that it is not wasted.
+            let x = self.results.as_ref()[len-1] as u64;
+            self.core.generate(&mut self.results);
+            self.index = 1;
+            let y = self.results.as_ref()[0] as u64;
+            (y << 32) | x
+        }
+    }
+
+    // As an optimization we try to write directly into the output buffer.
+    // This is only enabled for little-endian platforms where unaligned writes
+    // are known to be safe and fast.
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    fn fill_bytes(&mut self, dest: &mut [u8]) {
+        let mut filled = 0;
+
+        // Continue filling from the current set of results
+        if self.index < self.results.as_ref().len() {
+            let (consumed_u32, filled_u8) =
+                fill_via_u32_chunks(&self.results.as_ref()[self.index..],
+                                    dest);
+
+            self.index += consumed_u32;
+            filled += filled_u8;
+
+            // The request was served entirely from the buffer. Stop here so
+            // that the still-unread words stay available; falling through
+            // would mark the whole buffer as consumed and make the output
+            // stream differ from the portable implementation.
+            if filled == dest.len() {
+                return;
+            }
+        }
+
+        let len_remainder =
+            (dest.len() - filled) % (self.results.as_ref().len() * 4);
+        let end_direct = dest.len() - len_remainder;
+
+        // Generate whole blocks straight into `dest`.
+        while filled < end_direct {
+            // NOTE(review): this assumes `R::Results` is laid out as exactly
+            // `results.as_ref().len()` consecutive `u32`s, and it creates a
+            // reference at an address that is only byte-aligned. Confirm
+            // both are acceptable for every `BlockRngCore` used with this
+            // wrapper.
+            let dest_u32: &mut R::Results = unsafe {
+                &mut *(dest[filled..].as_mut_ptr() as *mut R::Results)
+            };
+            self.core.generate(dest_u32);
+            filled += self.results.as_ref().len() * 4;
+        }
+        // Nothing generated above went through `self.results`, so the buffer
+        // holds no unread values at this point.
+        self.index = self.results.as_ref().len();
+
+        // Serve the tail from a freshly generated block and remember how
+        // many of its words were used.
+        if len_remainder > 0 {
+            self.core.generate(&mut self.results);
+            let (consumed_u32, _) =
+                fill_via_u32_chunks(self.results.as_ref(),
+                                    &mut dest[filled..]);
+
+            self.index = consumed_u32;
+        }
+    }
+
+    // Portable implementation: copy from the results buffer, refilling it
+    // each time it runs out, until `dest` is full.
+    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+    fn fill_bytes(&mut self, dest: &mut [u8]) {
+        let mut read_len = 0;
+        while read_len < dest.len() {
+            if self.index >= self.results.as_ref().len() {
+                self.core.generate(&mut self.results);
+                self.index = 0;
+            }
+            let (consumed_u32, filled_u8) =
+                fill_via_u32_chunks(&self.results.as_ref()[self.index..],
+                                    &mut dest[read_len..]);
+
+            self.index += consumed_u32;
+            read_len += filled_u8;
+        }
+    }
+
+    /// Same as `fill_bytes`; always returns `Ok(())`.
+    fn try_fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), Error> {
+        self.fill_bytes(dest);
+        Ok(())
+    }
+}
+
+// Seeding is delegated to the wrapped core. The results buffer starts out
+// default-initialised and marked as fully consumed.
+impl<R: BlockRngCore + SeedableRng> SeedableRng for BlockRng<R> {
+    type Seed = R::Seed;
+
+    /// Wrap a core created from `seed`.
+    fn from_seed(seed: Self::Seed) -> Self {
+        let buffer = R::Results::default();
+        let core = R::from_seed(seed);
+        // Index at the end of the buffer: generate on first use.
+        let exhausted = buffer.as_ref().len();
+        Self { core: core, index: exhausted, results: buffer }
+    }
+
+    /// Wrap a core seeded from `rng`, passing on any error from the core's
+    /// own `from_rng`.
+    fn from_rng<RNG: RngCore>(rng: &mut RNG) -> Result<Self, Error> {
+        let buffer = R::Results::default();
+        R::from_rng(rng).map(|core| {
+            // Index at the end of the buffer: generate on first use.
+            let exhausted = buffer.as_ref().len();
+            Self { core: core, index: exhausted, results: buffer }
+        })
+    }
+}
+
+// `BlockRng<R>` carries the `CryptoRng` marker whenever the wrapped core `R`
+// does.
+impl<R: BlockRngCore + CryptoRng> CryptoRng for BlockRng<R> {}
+
 // TODO: implement tests for the above
@@ -162,8 +162,58 @@ pub trait RngCore {
     fn try_fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), Error>;
 }
 
-/// A marker trait for an `Rng` which may be considered for use in
-/// cryptography.
+/// A trait for RNGs which do not generate random numbers individually, but in
+/// blocks (typically `[u32; N]`). This technique is commonly used by
+/// cryptographic RNGs to improve performance.
+///
+/// Usage of this trait is optional, but provides two advantages:
+/// implementations only need to concern themselves with generation of the
+/// block, not the various `RngCore` methods (especially `fill_bytes`, where the
+/// optimal implementations are not trivial), and this allows `ReseedingRng` to
+/// perform periodic reseeding with very low overhead.
+///
+/// # Example
+///
+/// ```ignore
+/// use rand_core::{BlockRngCore, SeedableRng};
+/// use rand_core::impls::BlockRng;
+///
+/// struct MyRngCore;
+///
+/// impl BlockRngCore for MyRngCore {
+///     type Item = u32;
+///     type Results = [u32; 16];
+///
+///     fn generate(&mut self, results: &mut Self::Results) {
+///         unimplemented!()
+///     }
+/// }
+///
+/// impl SeedableRng for MyRngCore {
+///     type Seed = unimplemented!();
+///     fn from_seed(seed: Self::Seed) -> Self {
+///         unimplemented!()
+///     }
+/// }
+///
+/// // optionally, also implement CryptoRng for MyRngCore
+///
+/// // Final RNG.
+/// type MyRng = BlockRng<MyRngCore>;
+/// ```
+pub trait BlockRngCore {
+    /// Results element type, e.g. `u32`.
+    type Item;
+    
+    /// Results type. This is the 'block' an RNG implementing `BlockRngCore`
+    /// generates, which will usually be an array like `[u32; 16]`.
+    type Results: AsRef<[Self::Item]> + Default;
+
+    /// Generate a new block of results, overwriting the contents of
+    /// `results`.
+    fn generate(&mut self, results: &mut Self::Results);
+}
+
+/// A marker trait used to indicate that an `RngCore` or `BlockRngCore`
+/// implementation is supposed to be cryptographically secure.
 /// 
 /// *Cryptographically secure generators*, also known as *CSPRNGs*, should
 /// satisfy an additional property over other generators: given the first
@@ -182,7 +232,7 @@ pub trait RngCore {
 /// 
 /// Note also that use of a `CryptoRng` does not protect against other
 /// weaknesses such as seeding from a weak entropy source or leaking state.
-pub trait CryptoRng: RngCore {}
+pub trait CryptoRng {}
 
 /// A random number generator that can be explicitly seeded.
 ///
@@ -263,45 +313,41 @@ pub trait SeedableRng: Sized {
 
 
 impl<'a, R: RngCore + ?Sized> RngCore for &'a mut R {
-    #[inline]
+    #[inline(always)]
     fn next_u32(&mut self) -> u32 {
         (**self).next_u32()
     }
 
-    #[inline]
+    #[inline(always)]
     fn next_u64(&mut self) -> u64 {
         (**self).next_u64()
     }
 
-    #[inline]
     fn fill_bytes(&mut self, dest: &mut [u8]) {
         (**self).fill_bytes(dest)
     }
-    
-    #[inline]
+
     fn try_fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), Error> {
         (**self).try_fill_bytes(dest)
     }
 }
 
 #[cfg(any(feature="std", feature="alloc"))]
 impl<R: RngCore + ?Sized> RngCore for Box<R> {
-    #[inline]
+    #[inline(always)]
     fn next_u32(&mut self) -> u32 {
         (**self).next_u32()
     }
 
-    #[inline]
+    #[inline(always)]
     fn next_u64(&mut self) -> u64 {
         (**self).next_u64()
     }
 
-    #[inline]
     fn fill_bytes(&mut self, dest: &mut [u8]) {
         (**self).fill_bytes(dest)
     }
-    
-    #[inline]
+
     fn try_fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), Error> {
         (**self).try_fill_bytes(dest)
     }