add qualcomm 8cx gen 3 benchmark + minor tuning (#48)

lemire · web-flow · commit bd4a55abdef7 · 2024-06-25T10:25:36.000-04:00
* simplify arm code

* some tuning

* adding qualcomm results
diff --git a/README.md b/README.md
@@ -162,6 +162,22 @@ faster than the standard library.
 | Russian-Lipsum  |  3.3       | 0.95                       | 3.5 x           |
 
 
+On a Qualcomm 8cx Gen 3 (Windows Dev Kit 2023), we get roughly the same relative performance
+boost as on the Neoverse V1.
+
+| data set      | SimdUnicode speed (GB/s) | .NET speed (GB/s) |  speed up |
+|:----------------|:-----------|:--------------------------|:-------------------|
+| Twitter.json    |  15        | 10                        | 1.5 x           |
+| Arabic-Lipsum   |  4.0       | 2.3                       | 1.7 x           |
+| Chinese-Lipsum  |  4.0       | 2.9                       | 1.4 x           |
+| Emoji-Lipsum    |  4.0       | 0.9                       | 4.4 x           |
+| Hebrew-Lipsum   |  4.0       | 2.3                       | 1.7 x           |
+| Hindi-Lipsum    |  4.0       | 1.9                       | 2.1 x           |
+| Japanese-Lipsum |  4.0       | 2.7                       | 1.5 x           |
+| Korean-Lipsum   |  4.0       | 1.5                       | 2.7 x           |
+| Latin-Lipsum    |  50        | 20                       | 2.5 x           |
+| Russian-Lipsum  |  4.0       | 1.2                       | 3.3 x           |
+
 One difficulty with ARM processors is that they have varied SIMD/NEON performance. For example, Neoverse N1 processors, not to be confused with the Neoverse V1 design used by AWS Graviton 3, have weak SIMD performance. Of course, one can pick and choose which approach is best and it is not necessary to apply SimdUnicode in all cases. We expect good performance on recent ARM-based Qualcomm processors.
 
 ## Building the library
diff --git a/src/UTF8.cs b/src/UTF8.cs
@@ -1388,20 +1388,28 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
                             prevIncomplete = Vector128<byte>.Zero;
                             // Often, we have a lot of ASCII characters in a row.
                             int localasciirun = 16;
-                            if (processedLength + localasciirun + 64 <= inputLength)
+                            if (processedLength + localasciirun + 16 <= inputLength)
                             {
-                                for (; processedLength + localasciirun + 64 <= inputLength; localasciirun += 64)
+                                Vector128<byte> block = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun);
+                                if (AdvSimd.Arm64.MaxAcross(Vector128.AsUInt32(AdvSimd.And(block, v80))).ToScalar() == 0)
                                 {
-                                    Vector128<byte> block1 = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun);
-                                    Vector128<byte> block2 = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun + 16);
-                                    Vector128<byte> block3 = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun + 32);
-                                    Vector128<byte> block4 = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun + 48);
-                                    Vector128<byte> or = AdvSimd.Or(AdvSimd.Or(block1, block2), AdvSimd.Or(block3, block4));
-                                    if (AdvSimd.Arm64.MaxAcross(or).ToScalar() > 127)
+                                    localasciirun += 16;
+                                    for (; processedLength + localasciirun + 64 <= inputLength; localasciirun += 64)
                                     {
-                                        break;
+                                        Vector128<byte> block1 = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun);
+                                        Vector128<byte> block2 = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun + 16);
+                                        Vector128<byte> block3 = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun + 32);
+                                        Vector128<byte> block4 = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun + 48);
+                                        Vector128<byte> or = AdvSimd.Or(AdvSimd.Or(block1, block2), AdvSimd.Or(block3, block4));
+
+                                        if (AdvSimd.Arm64.MaxAcross(Vector128.AsUInt32(AdvSimd.And(or, v80))).ToScalar() != 0)
+                                        {
+                                            break;
+                                        }
                                     }
+
                                 }
+
                                 processedLength += localasciirun - 16;
                             }
                         }

Original file line number	Diff line number	Diff line change
`@@ -1388,20 +1388,28 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust`
`1388`	`1388`	`prevIncomplete = Vector128<byte>.Zero;`
`1389`	`1389`	`// Often, we have a lot of ASCII characters in a row.`
`1390`	`1390`	`int localasciirun = 16;`
`1391`		`- if (processedLength + localasciirun + 64 <= inputLength)`
	`1391`	`+ if (processedLength + localasciirun + 16 <= inputLength)`
`1392`	`1392`	`{`
`1393`		`- for (; processedLength + localasciirun + 64 <= inputLength; localasciirun += 64)`
	`1393`	`+ Vector128<byte> block = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun);`
	`1394`	`+ if (AdvSimd.Arm64.MaxAcross(Vector128.AsUInt32(AdvSimd.And(block, v80))).ToScalar() == 0)`
`1394`	`1395`	`{`
`1395`		`- Vector128<byte> block1 = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun);`
`1396`		`- Vector128<byte> block2 = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun + 16);`
`1397`		`- Vector128<byte> block3 = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun + 32);`
`1398`		`- Vector128<byte> block4 = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun + 48);`
`1399`		`- Vector128<byte> or = AdvSimd.Or(AdvSimd.Or(block1, block2), AdvSimd.Or(block3, block4));`
`1400`		`- if (AdvSimd.Arm64.MaxAcross(or).ToScalar() > 127)`
	`1396`	`+ localasciirun += 16;`
	`1397`	`+ for (; processedLength + localasciirun + 64 <= inputLength; localasciirun += 64)`
`1401`	`1398`	`{`
`1402`		`- break;`
	`1399`	`+ Vector128<byte> block1 = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun);`
	`1400`	`+ Vector128<byte> block2 = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun + 16);`
	`1401`	`+ Vector128<byte> block3 = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun + 32);`
	`1402`	`+ Vector128<byte> block4 = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun + 48);`
	`1403`	`+ Vector128<byte> or = AdvSimd.Or(AdvSimd.Or(block1, block2), AdvSimd.Or(block3, block4));`
	`1404`	`+`
	`1405`	`+ if (AdvSimd.Arm64.MaxAcross(Vector128.AsUInt32(AdvSimd.And(or, v80))).ToScalar() != 0)`
	`1406`	`+ {`
	`1407`	`+ break;`
	`1408`	`+ }`
`1403`	`1409`	`}`
	`1410`	`+`
`1404`	`1411`	`}`
	`1412`	`+`
`1405`	`1413`	`processedLength += localasciirun - 16;`
`1406`	`1414`	`}`
`1407`	`1415`	`}`