@@ -374,7 +374,9 @@ define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) #0 {
374
374
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
375
375
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
376
376
; VBITS_GE_512-NEXT: fcvtzu z0.s, p0/m, z0.s
377
- ; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x1]
377
+ ; VBITS_GE_512-NEXT: ptrue p0.h, vl16
378
+ ; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
379
+ ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
378
380
; VBITS_GE_512-NEXT: ret
379
381
%op1 = load <16 x float >, ptr %a
380
382
%res = fptoui <16 x float > %op1 to <16 x i16 >
@@ -388,7 +390,9 @@ define void @fcvtzu_v32f32_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
388
390
; CHECK-NEXT: ptrue p0.s, vl32
389
391
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
390
392
; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
391
- ; CHECK-NEXT: st1h { z0.s }, p0, [x1]
393
+ ; CHECK-NEXT: ptrue p0.h, vl32
394
+ ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
395
+ ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
392
396
; CHECK-NEXT: ret
393
397
%op1 = load <32 x float >, ptr %a
394
398
%res = fptoui <32 x float > %op1 to <32 x i16 >
@@ -402,7 +406,9 @@ define void @fcvtzu_v64f32_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
402
406
; CHECK-NEXT: ptrue p0.s, vl64
403
407
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
404
408
; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
405
- ; CHECK-NEXT: st1h { z0.s }, p0, [x1]
409
+ ; CHECK-NEXT: ptrue p0.h, vl64
410
+ ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
411
+ ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
406
412
; CHECK-NEXT: ret
407
413
%op1 = load <64 x float >, ptr %a
408
414
%res = fptoui <64 x float > %op1 to <64 x i16 >
@@ -684,7 +690,10 @@ define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
684
690
; CHECK-NEXT: ptrue p0.d, vl16
685
691
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
686
692
; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
687
- ; CHECK-NEXT: st1h { z0.d }, p0, [x1]
693
+ ; CHECK-NEXT: ptrue p0.h, vl16
694
+ ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
695
+ ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
696
+ ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
688
697
; CHECK-NEXT: ret
689
698
%op1 = load <16 x double >, ptr %a
690
699
%res = fptoui <16 x double > %op1 to <16 x i16 >
@@ -698,7 +707,10 @@ define void @fcvtzu_v32f64_v32i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
698
707
; CHECK-NEXT: ptrue p0.d, vl32
699
708
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
700
709
; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
701
- ; CHECK-NEXT: st1h { z0.d }, p0, [x1]
710
+ ; CHECK-NEXT: ptrue p0.h, vl32
711
+ ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
712
+ ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
713
+ ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
702
714
; CHECK-NEXT: ret
703
715
%op1 = load <32 x double >, ptr %a
704
716
%res = fptoui <32 x double > %op1 to <32 x i16 >
@@ -769,7 +781,9 @@ define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) #0 {
769
781
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
770
782
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
771
783
; VBITS_GE_512-NEXT: fcvtzu z0.d, p0/m, z0.d
772
- ; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [x1]
784
+ ; VBITS_GE_512-NEXT: ptrue p0.s, vl8
785
+ ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
786
+ ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
773
787
; VBITS_GE_512-NEXT: ret
774
788
%op1 = load <8 x double >, ptr %a
775
789
%res = fptoui <8 x double > %op1 to <8 x i32 >
@@ -783,7 +797,9 @@ define void @fcvtzu_v16f64_v16i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
783
797
; CHECK-NEXT: ptrue p0.d, vl16
784
798
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
785
799
; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
786
- ; CHECK-NEXT: st1w { z0.d }, p0, [x1]
800
+ ; CHECK-NEXT: ptrue p0.s, vl16
801
+ ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
802
+ ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
787
803
; CHECK-NEXT: ret
788
804
%op1 = load <16 x double >, ptr %a
789
805
%res = fptoui <16 x double > %op1 to <16 x i32 >
@@ -797,7 +813,9 @@ define void @fcvtzu_v32f64_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
797
813
; CHECK-NEXT: ptrue p0.d, vl32
798
814
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
799
815
; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
800
- ; CHECK-NEXT: st1w { z0.d }, p0, [x1]
816
+ ; CHECK-NEXT: ptrue p0.s, vl32
817
+ ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
818
+ ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
801
819
; CHECK-NEXT: ret
802
820
%op1 = load <32 x double >, ptr %a
803
821
%res = fptoui <32 x double > %op1 to <32 x i32 >
@@ -1267,7 +1285,9 @@ define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) #0 {
1267
1285
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
1268
1286
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
1269
1287
; VBITS_GE_512-NEXT: fcvtzs z0.s, p0/m, z0.s
1270
- ; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x1]
1288
+ ; VBITS_GE_512-NEXT: ptrue p0.h, vl16
1289
+ ; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
1290
+ ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
1271
1291
; VBITS_GE_512-NEXT: ret
1272
1292
%op1 = load <16 x float >, ptr %a
1273
1293
%res = fptosi <16 x float > %op1 to <16 x i16 >
@@ -1281,7 +1301,9 @@ define void @fcvtzs_v32f32_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
1281
1301
; CHECK-NEXT: ptrue p0.s, vl32
1282
1302
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1283
1303
; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
1284
- ; CHECK-NEXT: st1h { z0.s }, p0, [x1]
1304
+ ; CHECK-NEXT: ptrue p0.h, vl32
1305
+ ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
1306
+ ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
1285
1307
; CHECK-NEXT: ret
1286
1308
%op1 = load <32 x float >, ptr %a
1287
1309
%res = fptosi <32 x float > %op1 to <32 x i16 >
@@ -1295,7 +1317,9 @@ define void @fcvtzs_v64f32_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
1295
1317
; CHECK-NEXT: ptrue p0.s, vl64
1296
1318
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1297
1319
; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
1298
- ; CHECK-NEXT: st1h { z0.s }, p0, [x1]
1320
+ ; CHECK-NEXT: ptrue p0.h, vl64
1321
+ ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
1322
+ ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
1299
1323
; CHECK-NEXT: ret
1300
1324
%op1 = load <64 x float >, ptr %a
1301
1325
%res = fptosi <64 x float > %op1 to <64 x i16 >
@@ -1577,7 +1601,10 @@ define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
1577
1601
; CHECK-NEXT: ptrue p0.d, vl16
1578
1602
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1579
1603
; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
1580
- ; CHECK-NEXT: st1h { z0.d }, p0, [x1]
1604
+ ; CHECK-NEXT: ptrue p0.h, vl16
1605
+ ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
1606
+ ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
1607
+ ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
1581
1608
; CHECK-NEXT: ret
1582
1609
%op1 = load <16 x double >, ptr %a
1583
1610
%res = fptosi <16 x double > %op1 to <16 x i16 >
@@ -1591,7 +1618,10 @@ define void @fcvtzs_v32f64_v32i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
1591
1618
; CHECK-NEXT: ptrue p0.d, vl32
1592
1619
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1593
1620
; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
1594
- ; CHECK-NEXT: st1h { z0.d }, p0, [x1]
1621
+ ; CHECK-NEXT: ptrue p0.h, vl32
1622
+ ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
1623
+ ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
1624
+ ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
1595
1625
; CHECK-NEXT: ret
1596
1626
%op1 = load <32 x double >, ptr %a
1597
1627
%res = fptosi <32 x double > %op1 to <32 x i16 >
@@ -1662,7 +1692,9 @@ define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) #0 {
1662
1692
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1663
1693
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
1664
1694
; VBITS_GE_512-NEXT: fcvtzs z0.d, p0/m, z0.d
1665
- ; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [x1]
1695
+ ; VBITS_GE_512-NEXT: ptrue p0.s, vl8
1696
+ ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
1697
+ ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
1666
1698
; VBITS_GE_512-NEXT: ret
1667
1699
%op1 = load <8 x double >, ptr %a
1668
1700
%res = fptosi <8 x double > %op1 to <8 x i32 >
@@ -1676,7 +1708,9 @@ define void @fcvtzs_v16f64_v16i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
1676
1708
; CHECK-NEXT: ptrue p0.d, vl16
1677
1709
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1678
1710
; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
1679
- ; CHECK-NEXT: st1w { z0.d }, p0, [x1]
1711
+ ; CHECK-NEXT: ptrue p0.s, vl16
1712
+ ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
1713
+ ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
1680
1714
; CHECK-NEXT: ret
1681
1715
%op1 = load <16 x double >, ptr %a
1682
1716
%res = fptosi <16 x double > %op1 to <16 x i32 >
@@ -1690,7 +1724,9 @@ define void @fcvtzs_v32f64_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
1690
1724
; CHECK-NEXT: ptrue p0.d, vl32
1691
1725
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1692
1726
; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
1693
- ; CHECK-NEXT: st1w { z0.d }, p0, [x1]
1727
+ ; CHECK-NEXT: ptrue p0.s, vl32
1728
+ ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
1729
+ ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
1694
1730
; CHECK-NEXT: ret
1695
1731
%op1 = load <32 x double >, ptr %a
1696
1732
%res = fptosi <32 x double > %op1 to <32 x i32 >
0 commit comments