Skip to content

Commit 5821f10

Browse files
committed
dec/amd64: implement add10VW/sub10VW
name            go time/op   asm time/op  delta
Sub10VW/1       10.1ns ± 2%   5.6ns ± 1%  -45.03%
Sub10VW/2       11.7ns ± 1%   6.1ns ± 1%  -48.01%
Sub10VW/3       13.2ns ± 2%   8.1ns ± 0%  -39.03%
Sub10VW/4       14.6ns ± 0%   8.4ns ± 0%  -42.77%
Sub10VW/5       14.9ns ± 1%   8.8ns ± 0%  -40.66%
Sub10VW/10      15.1ns ± 0%  10.6ns ± 3%  -30.12%
Sub10VW/100      116ns ± 1%    45ns ± 6%  -61.62%
Sub10VW/1000    1.22µs ± 1%  0.54µs ±14%  -55.85%
Sub10VW/10000   11.9µs ± 0%   5.4µs ± 1%  -54.85%
Sub10VW/100000   122µs ± 0%    62µs ± 0%  -49.45%

The Go implementation can check whether the carry is zero and switch to copy() for free, so there is no need for separate standard and "large" variants of add10VW.

In the assembler version I chose to keep a single implementation of the function and switch to a memcpy whenever the carry is 0 (checked every 4 Words). Considering that the carry is almost always 0, this logic is the likely cause of the smaller speedup between 5 and 15 Words.

Also, past 1000 Words, the performance gains seem to slowly drop. The most likely cause is the simplistic memcpy implementation compared with runtime·memmove.
1 parent edbdd7c commit 5821f10

File tree

11 files changed

+494
-343
lines changed

11 files changed

+494
-343
lines changed

arith_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ import (
1414
var isRaceBuilder bool
1515

1616
func init() {
17-
flag.BoolVar(&isRaceBuilder, "rb", true, "race builder")
17+
flag.BoolVar(&isRaceBuilder, "rb", false, "race builder")
1818
}
1919

2020
type funVV func(z, x, y []Word) (c Word)

dec.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ func (x dec) digit(i uint) uint {
6464
return 0
6565
}
6666
// 0 <= j < len(x)
67-
return (uint(x[j]) / pow10(i)) % 10
67+
return uint(x[j]/pow10(i)) % 10
6868
}
6969

7070
func (z dec) make(n int) dec {
@@ -166,7 +166,7 @@ func (x dec) sticky(i uint) uint {
166166
return 1
167167
}
168168
}
169-
if uint(x[j])%pow10(i) != 0 {
169+
if x[j]%pow10(i) != 0 {
170170
return 1
171171
}
172172
return 0

dec_arith.go

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ var pow10s = [...]uint64{
2525
10000000000000000, 100000000000000000, 1000000000000000000, 10000000000000000000,
2626
}
2727

28-
func pow10(n uint) uint { return uint(pow10s[n]) }
28+
func pow10(n uint) Word { return Word(pow10s[n]) }
2929

3030
var maxDigits = [...]uint{
3131
1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5,
@@ -54,7 +54,7 @@ func decDigits64(x uint64) (n uint) {
5454

5555
func decDigits32(x uint) (n uint) {
5656
n = maxDigits[bits.Len(x)]
57-
if x < pow10(n-1) {
57+
if x < uint(pow10(n-1)) {
5858
n--
5959
}
6060
return n
@@ -148,12 +148,9 @@ func div10WW_g(u1, u0, v Word) (q, r Word) {
148148

149149
func add10WWW_g(x, y, cIn Word) (s, c Word) {
150150
r, cc := bits.Add(uint(x), uint(y), uint(cIn))
151-
// if cc != 0 || r > _DB-1 {
152-
// cc = 1
153-
// r -= _DB
154-
// }
155-
// c1 := uint(int(r-_DB) >> 63)
156151
var c1 uint
152+
// this simple if statement is compiled without jumps
153+
// at least on amd64.
157154
if r >= _DB {
158155
c1 = 1
159156
}
@@ -187,7 +184,10 @@ func sub10VV_g(z, x, y []Word) (c Word) {
187184
}
188185

189186
// add10VW adds y to x. The resulting carry c is either 0 or 1.
190-
func add10VW_g(z, x dec, y Word) (c Word) {
187+
func add10VW_g(z, x []Word, y Word) (c Word) {
188+
if len(z) == 0 {
189+
return y
190+
}
191191
z[0], c = add10WWW_g(x[0], y, 0)
192192
// propagate carry
193193
for i := 1; i < len(z) && i < len(x); i++ {
@@ -219,15 +219,15 @@ func sub10VW_g(z, x []Word, y Word) (c Word) {
219219
}
220220

221221
// shl10VU sets z to x*(10**s), s < _WD
222-
func shl10VU_g(z, x dec, s uint) (r Word) {
222+
func shl10VU_g(z, x []Word, s uint) (r Word) {
223223
if s == 0 {
224224
copy(z, x)
225225
return
226226
}
227227
if len(z) == 0 || len(x) == 0 {
228228
return
229229
}
230-
d, m := Word(pow10(_DW-s)), Word(pow10(s))
230+
d, m := pow10(_DW-s), pow10(s)
231231
var h, l Word
232232
r, l = divWW(0, x[len(x)-1], d)
233233
for i := len(z) - 1; i > 0; i-- {
@@ -241,7 +241,7 @@ func shl10VU_g(z, x dec, s uint) (r Word) {
241241
}
242242

243243
// shr10VU sets z to x/(10**s)
244-
func shr10VU_g(z, x dec, s uint) (r Word) {
244+
func shr10VU_g(z, x []Word, s uint) (r Word) {
245245
if s == 0 {
246246
copy(z, x)
247247
return
@@ -251,15 +251,15 @@ func shr10VU_g(z, x dec, s uint) (r Word) {
251251
}
252252

253253
var h, l Word
254-
d, m := Word(pow10(s)), Word(pow10(_DW-s))
254+
d, m := pow10(s), pow10(_DW-s)
255255
h, r = divWW(0, x[0], Word(d))
256256
for i := 1; i < len(z) && i < len(x); i++ {
257257
t := h
258258
h, l = divWW(0, x[i], d)
259259
z[i-1] = t + l*m
260260
}
261261
z[len(z)-1] = h
262-
return r
262+
return r * m
263263
}
264264

265265
func mulAdd10VWW_g(z, x []Word, y, r Word) (c Word) {

0 commit comments

Comments
 (0)