@@ -540,12 +540,24 @@ impl<T> SliceExt for [T] {
         let mut i: usize = 0;
         let ln = self.len();
 
+        // For very small types, all the individual reads in the normal
+        // path perform poorly. We can do better, given efficient unaligned
+        // load/store, by loading a larger chunk and reversing a register.
+
+        // Ideally LLVM would do this for us, as it knows better than we do
+        // whether unaligned reads are efficient (since that changes between
+        // different ARM versions, for example) and what the best chunk size
+        // would be. Unfortunately, as of LLVM 4.0 (2017-05) it only unrolls
+        // the loop, so we need to do this ourselves. (Hypothesis: reverse
+        // is troublesome because the sides can be aligned differently --
+        // will be, when the length is odd -- so there's no way of emitting
+        // pre- and postludes to use fully-aligned SIMD in the middle.)
+
         let fast_unaligned =
             cfg!(any(target_arch = "x86", target_arch = "x86_64"));
 
         if fast_unaligned && mem::size_of::<T>() == 1 {
-            // Single-byte read & write are comparatively slow. Instead,
-            // work in usize chunks and get bswap to do the hard work.
+            // Use the llvm.bswap intrinsic to reverse u8s in a usize
             let chunk = mem::size_of::<usize>();
             while i + chunk - 1 < ln / 2 {
                 unsafe {
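
As a standalone illustration of the technique the added comment describes (load a `usize`-sized chunk from each end, reverse it in a register with `swap_bytes`, which LLVM lowers to `llvm.bswap`, and store it at the mirrored position), here is a rough sketch for the byte case. The function name, the raw-pointer arithmetic, and the scalar tail loop are illustrative choices, not the PR's exact code, which operates on the generic `[T]` shown in the hunk:

```rust
use std::mem;
use std::ptr;

// Sketch only: reverse a byte slice by processing usize-sized chunks from
// both ends and byte-swapping each chunk in a register.
fn reverse_bytes_chunked(s: &mut [u8]) {
    let ln = s.len();
    let chunk = mem::size_of::<usize>();
    let mut i = 0;

    // Same loop condition as the diff: stop once the next chunk would
    // reach past the midpoint of the slice.
    while i + chunk - 1 < ln / 2 {
        unsafe {
            // A chunk at the front and the mirrored chunk at the back.
            let pa = s.as_mut_ptr().add(i) as *mut usize;
            let pb = s.as_mut_ptr().add(ln - i - chunk) as *mut usize;
            // Unaligned loads: the two ends generally have different alignment.
            let va = ptr::read_unaligned(pa);
            let vb = ptr::read_unaligned(pb);
            // Reverse each chunk's bytes in a register and store it at the
            // mirrored position.
            ptr::write_unaligned(pa, vb.swap_bytes());
            ptr::write_unaligned(pb, va.swap_bytes());
        }
        i += chunk;
    }

    // Whatever is left around the middle is swapped element by element.
    while i < ln / 2 {
        s.swap(i, ln - 1 - i);
        i += 1;
    }
}

fn main() {
    let mut v: Vec<u8> = (0..23).collect();
    reverse_bytes_chunked(&mut v);
    let mut expected: Vec<u8> = (0..23).collect();
    expected.reverse();
    assert_eq!(v, expected);
}
```

The loop bound `i + chunk - 1 < ln / 2` keeps the front chunk entirely inside the first half of the slice, so the mirrored chunk never overlaps it and the unaligned stores cannot step on each other.
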
@@ -561,8 +573,7 @@ impl<T> SliceExt for [T] {
         }
 
         if fast_unaligned && mem::size_of::<T>() == 2 {
-            // Not quite as good as the above, but still helpful.
-            // Same general idea, read bigger and do the swap in a register.
+            // Use rotate-by-16 to reverse u16s in a u32
             let chunk = mem::size_of::<u32>() / 2;
             while i + chunk - 1 < ln / 2 {
                 unsafe {
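
A similarly hedged sketch for the second case: for 2-byte elements, two neighbours are loaded as one `u32` and the pair is reversed with a 16-bit rotate. Again, the function name and the tail loop are only there to make the example self-contained and runnable:

```rust
use std::mem;
use std::ptr;

// Sketch only: reverse a slice of u16 two elements at a time by loading
// a u32 and exchanging its halves with a 16-bit rotate.
fn reverse_u16_chunked(s: &mut [u16]) {
    let ln = s.len();
    // Two u16 elements per u32 load, matching `size_of::<u32>() / 2` in the diff.
    let chunk = mem::size_of::<u32>() / mem::size_of::<u16>();
    let mut i = 0;

    while i + chunk - 1 < ln / 2 {
        unsafe {
            let pa = s.as_mut_ptr().add(i) as *mut u32;
            let pb = s.as_mut_ptr().add(ln - i - chunk) as *mut u32;
            let va = ptr::read_unaligned(pa);
            let vb = ptr::read_unaligned(pb);
            // Rotating by 16 bits swaps the two u16 lanes of the u32,
            // i.e. reverses the pair of elements.
            ptr::write_unaligned(pa, vb.rotate_left(16));
            ptr::write_unaligned(pb, va.rotate_left(16));
        }
        i += chunk;
    }

    // Leftover elements around the middle.
    while i < ln / 2 {
        s.swap(i, ln - 1 - i);
        i += 1;
    }
}

fn main() {
    let mut v: Vec<u16> = (0..9).collect();
    reverse_u16_chunked(&mut v);
    assert_eq!(v, [8, 7, 6, 5, 4, 3, 2, 1, 0]);
}
```

Either rotation direction works here: rotating a `u32` by 16 bits exchanges its two 16-bit halves regardless of endianness, which is why the register-level swap matches the in-memory element order on both little- and big-endian targets.
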