@@ -110,8 +110,10 @@ use core::time::Duration;
110
110
use core:: ops:: { Range , RangeInclusive } ;
111
111
112
112
use crate :: distributions:: float:: IntoFloat ;
113
- use crate :: distributions:: utils:: { BoolAsSIMD , IntAsSIMD , FloatAsSIMD , FloatSIMDUtils , WideningMultiply } ;
113
+ use crate :: distributions:: utils:: { BoolAsSIMD , FloatAsSIMD , FloatSIMDUtils , IntAsSIMD , WideningMultiply } ;
114
114
use crate :: distributions:: Distribution ;
115
+ #[ cfg( feature = "simd_support" ) ]
116
+ use crate :: distributions:: Standard ;
115
117
use crate :: { Rng , RngCore } ;
116
118
117
119
#[ cfg( not( feature = "std" ) ) ]
@@ -571,21 +573,30 @@ uniform_int_impl! { u128, u128, u128 }
571
573
572
574
#[ cfg( feature = "simd_support" ) ]
573
575
macro_rules! uniform_simd_int_impl {
574
- ( $ty: ident, $unsigned: ident, $u_scalar : ident ) => {
576
+ ( $ty: ident, $unsigned: ident) => {
575
577
// The "pick the largest zone that can fit in an `u32`" optimization
576
578
// is less useful here. Multiple lanes complicate things, we don't
577
579
// know the PRNG's minimal output size, and casting to a larger vector
578
580
// is generally a bad idea for SIMD performance. The user can still
579
581
// implement it manually.
580
-
581
- // TODO: look into `Uniform::<u32x4>::new(0u32, 100)` functionality
582
- // perhaps `impl SampleUniform for $u_scalar`?
583
- impl SampleUniform for $ty {
584
- type Sampler = UniformInt <$ty>;
582
+ impl <const LANES : usize > SampleUniform for Simd <$ty, LANES >
583
+ where
584
+ LaneCount <LANES >: SupportedLaneCount ,
585
+ Simd <$unsigned, LANES >:
586
+ WideningMultiply <Output = ( Simd <$unsigned, LANES >, Simd <$unsigned, LANES >) >,
587
+ Standard : Distribution <Simd <$unsigned, LANES >>,
588
+ {
589
+ type Sampler = UniformInt <Simd <$ty, LANES >>;
585
590
}
586
591
587
- impl UniformSampler for UniformInt <$ty> {
588
- type X = $ty;
592
+ impl <const LANES : usize > UniformSampler for UniformInt <Simd <$ty, LANES >>
593
+ where
594
+ LaneCount <LANES >: SupportedLaneCount ,
595
+ Simd <$unsigned, LANES >:
596
+ WideningMultiply <Output = ( Simd <$unsigned, LANES >, Simd <$unsigned, LANES >) >,
597
+ Standard : Distribution <Simd <$unsigned, LANES >>,
598
+ {
599
+ type X = Simd <$ty, LANES >;
589
600
590
601
#[ inline] // if the range is constant, this helps LLVM to do the
591
602
// calculations at compile-time.
@@ -609,13 +620,13 @@ macro_rules! uniform_simd_int_impl {
609
620
let high = * high_b. borrow( ) ;
610
621
assert!( low. lanes_le( high) . all( ) ,
611
622
"Uniform::new_inclusive called with `low > high`" ) ;
612
- let unsigned_max = Simd :: splat( :: core:: $u_scalar :: MAX ) ;
623
+ let unsigned_max = Simd :: splat( :: core:: $unsigned :: MAX ) ;
613
624
614
- // NOTE: these may need to be replaced with explicitly
615
- // wrapping operations if `packed_simd` changes
616
- let range: $unsigned = ( ( high - low) + Simd :: splat( 1 ) ) . cast( ) ;
625
+ // NOTE: all `Simd` operations are inherently wrapping,
626
+ // see https://doc.rust-lang.org/std/simd/struct.Simd.html
627
+ let range: Simd < $unsigned, LANES > = ( ( high - low) + Simd :: splat( 1 ) ) . cast( ) ;
617
628
// `% 0` will panic at runtime.
618
- let not_full_range = range. lanes_gt( $unsigned :: splat( 0 ) ) ;
629
+ let not_full_range = range. lanes_gt( Simd :: splat( 0 ) ) ;
619
630
// replacing 0 with `unsigned_max` allows a faster `select`
620
631
// with bitwise OR
621
632
let modulo = not_full_range. select( range, unsigned_max) ;
@@ -634,8 +645,8 @@ macro_rules! uniform_simd_int_impl {
634
645
}
635
646
636
647
fn sample<R : Rng + ?Sized >( & self , rng: & mut R ) -> Self :: X {
637
- let range: $unsigned = self . range. cast( ) ;
638
- let zone: $unsigned = self . z. cast( ) ;
648
+ let range: Simd < $unsigned, LANES > = self . range. cast( ) ;
649
+ let zone: Simd < $unsigned, LANES > = self . z. cast( ) ;
639
650
640
651
// This might seem very slow, generating a whole new
641
652
// SIMD vector for every sample rejection. For most uses
@@ -646,19 +657,19 @@ macro_rules! uniform_simd_int_impl {
646
657
// rejection. The replacement method does however add a little
647
658
// overhead. Benchmarking or calculating probabilities might
648
659
// reveal contexts where this replacement method is slower.
649
- let mut v: $unsigned = rng. gen ( ) ;
660
+ let mut v: Simd < $unsigned, LANES > = rng. gen ( ) ;
650
661
loop {
651
662
let ( hi, lo) = v. wmul( range) ;
652
663
let mask = lo. lanes_le( zone) ;
653
664
if mask. all( ) {
654
- let hi: $ty = hi. cast( ) ;
665
+ let hi: Simd < $ty, LANES > = hi. cast( ) ;
655
666
// wrapping addition
656
667
let result = self . low + hi;
657
668
// `select` here compiles to a blend operation
658
669
// When `range.eq(0).none()` the compare and blend
659
670
// operations are avoided.
660
- let v: $ty = v. cast( ) ;
661
- return range. lanes_gt( $unsigned :: splat( 0 ) ) . select( result, v) ;
671
+ let v: Simd < $ty, LANES > = v. cast( ) ;
672
+ return range. lanes_gt( Simd :: splat( 0 ) ) . select( result, v) ;
662
673
}
663
674
// Replace only the failing lanes
664
675
v = mask. select( v, rng. gen ( ) ) ;
@@ -668,50 +679,16 @@ macro_rules! uniform_simd_int_impl {
668
679
} ;
669
680
670
681
// bulk implementation
671
- ( $( ( $unsigned: ident, $signed: ident) , ) + $u_scalar : ident ) => {
682
+ ( $( ( $unsigned: ident, $signed: ident) ) ,+ ) => {
672
683
$(
673
- uniform_simd_int_impl!( $unsigned, $unsigned, $u_scalar ) ;
674
- uniform_simd_int_impl!( $signed, $unsigned, $u_scalar ) ;
684
+ uniform_simd_int_impl!( $unsigned, $unsigned) ;
685
+ uniform_simd_int_impl!( $signed, $unsigned) ;
675
686
) +
676
687
} ;
677
688
}
678
689
679
690
#[ cfg( feature = "simd_support" ) ]
680
- uniform_simd_int_impl ! {
681
- ( u64x2, i64x2) ,
682
- ( u64x4, i64x4) ,
683
- ( u64x8, i64x8) ,
684
- u64
685
- }
686
-
687
- #[ cfg( feature = "simd_support" ) ]
688
- uniform_simd_int_impl ! {
689
- ( u32x2, i32x2) ,
690
- ( u32x4, i32x4) ,
691
- ( u32x8, i32x8) ,
692
- ( u32x16, i32x16) ,
693
- u32
694
- }
695
-
696
- #[ cfg( feature = "simd_support" ) ]
697
- uniform_simd_int_impl ! {
698
- ( u16x2, i16x2) ,
699
- ( u16x4, i16x4) ,
700
- ( u16x8, i16x8) ,
701
- ( u16x16, i16x16) ,
702
- ( u16x32, i16x32) ,
703
- u16
704
- }
705
-
706
- #[ cfg( feature = "simd_support" ) ]
707
- uniform_simd_int_impl ! {
708
- ( u8x4, i8x4) ,
709
- ( u8x8, i8x8) ,
710
- ( u8x16, i8x16) ,
711
- ( u8x32, i8x32) ,
712
- ( u8x64, i8x64) ,
713
- u8
714
- }
691
+ uniform_simd_int_impl ! { ( u8 , i8 ) , ( u16 , i16 ) , ( u32 , i32 ) , ( u64 , i64 ) }
715
692
716
693
impl SampleUniform for char {
717
694
type Sampler = UniformChar ;
@@ -1183,7 +1160,7 @@ mod tests {
1183
1160
_ => panic ! ( "`UniformDurationMode` was not serialized/deserialized correctly" )
1184
1161
}
1185
1162
}
1186
-
1163
+
1187
1164
#[ test]
1188
1165
#[ cfg( feature = "serde1" ) ]
1189
1166
fn test_uniform_serialization ( ) {
0 commit comments