Skip to content

Commit 3189546

Browse files
authored
Merge pull request #8 from FrnchFrgg/master
Speed up findLongestMatch by avoiding a lot of allocations
2 parents 4ca4252 + f08c8db commit 3189546

File tree

5 files changed

+196
-12
lines changed

5 files changed

+196
-12
lines changed

difflib/bytes/bytes.go

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -294,26 +294,43 @@ func (m *SequenceMatcher) findLongestMatch(alo, ahi, blo, bhi int) Match {
294294
// find longest junk-free match
295295
// during an iteration of the loop, j2len[j] = length of longest
296296
// junk-free match ending with a[i-1] and b[j]
297-
j2len := map[int]int{}
297+
N := bhi - blo
298+
j2len := make([]int, N)
299+
newj2len := make([]int, N)
300+
var indices []int
298301
for i := alo; i != ahi; i++ {
299302
// look at all instances of a[i] in b; note that because
300303
// b2j has no junk keys, the loop is skipped if a[i] is junk
301-
newj2len := map[int]int{}
302-
for _, j := range m.b2j.get(m.a[i]) {
304+
newindices := m.b2j.get(m.a[i])
305+
for _, j := range newindices {
303306
// a[i] matches b[j]
304307
if j < blo {
305308
continue
306309
}
307310
if j >= bhi {
308311
break
309312
}
310-
k := j2len[j-1] + 1
311-
newj2len[j] = k
313+
k := 1
314+
if j > blo {
315+
k = j2len[j-1-blo] + 1
316+
}
317+
newj2len[j-blo] = k
312318
if k > bestsize {
313319
besti, bestj, bestsize = i-k+1, j-k+1, k
314320
}
315321
}
316-
j2len = newj2len
322+
// j2len = newj2len, clear and reuse j2len as newj2len
323+
for _, j := range indices {
324+
if j < blo {
325+
continue
326+
}
327+
if j >= bhi {
328+
break
329+
}
330+
j2len[j-blo] = 0
331+
}
332+
indices = newindices
333+
j2len, newj2len = newj2len, j2len
317334
}
318335

319336
// Extend the best by non-junk elements on each end. In particular,

difflib/bytes/bytes_test.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,11 @@ import (
55
"fmt"
66
"math"
77
"reflect"
8+
"runtime"
89
"strings"
910
"testing"
1011
"sort"
12+
"../tester"
1113
)
1214

1315
func assertAlmostEqual(t *testing.T, a, b float64, places int) {
@@ -472,6 +474,51 @@ func BenchmarkSplitLines10000(b *testing.B) {
472474
benchmarkSplitLines(b, 10000)
473475
}
474476

477+
func prepareFilesToDiff(count, seed int) (As, Bs [][][]byte) {
478+
defer runtime.GC()
479+
aux := func () { // to ensure temp variables go out of scope
480+
stringsA, stringsB := tester.PrepareStringsToDiff(count, seed)
481+
As = make([][][]byte, len(stringsA))
482+
Bs = make([][][]byte, len(stringsB))
483+
for i := range As {
484+
As[i] = stringsToBytes(stringsA[i]...)
485+
Bs[i] = stringsToBytes(stringsB[i]...)
486+
}
487+
}
488+
aux()
489+
return
490+
}
491+
492+
func BenchmarkDiffer(b *testing.B) {
493+
A, B := prepareFilesToDiff(b.N, 0)
494+
fmt.Printf("\nDiff length:")
495+
b.ResetTimer()
496+
differ := NewDiffer()
497+
for i := range A {
498+
var x [][]byte
499+
for n := 0; n < 5; n++ {
500+
x, _ = differ.Compare(A[i], B[i])
501+
}
502+
fmt.Printf(" %v", len(x))
503+
}
504+
fmt.Printf("\n")
505+
}
506+
507+
func BenchmarkMatcher(b *testing.B) {
508+
As, Bs := prepareFilesToDiff(b.N, 0)
509+
fmt.Printf("\nOpcodes count:")
510+
b.ResetTimer()
511+
for i := range As {
512+
var x []OpCode
513+
for n := 0; n < 5; n++ {
514+
sm := NewMatcher(As[i], Bs[i])
515+
x = sm.GetOpCodes()
516+
}
517+
fmt.Printf(" %v", len(x))
518+
}
519+
fmt.Printf("\n")
520+
}
521+
475522
func TestDifferCompare(t *testing.T) {
476523
diff := NewDiffer()
477524
// Test

difflib/difflib.go

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -234,26 +234,43 @@ func (m *SequenceMatcher) findLongestMatch(alo, ahi, blo, bhi int) Match {
234234
// find longest junk-free match
235235
// during an iteration of the loop, j2len[j] = length of longest
236236
// junk-free match ending with a[i-1] and b[j]
237-
j2len := map[int]int{}
237+
N := bhi - blo
238+
j2len := make([]int, N)
239+
newj2len := make([]int, N)
240+
var indices []int
238241
for i := alo; i != ahi; i++ {
239242
// look at all instances of a[i] in b; note that because
240243
// b2j has no junk keys, the loop is skipped if a[i] is junk
241-
newj2len := map[int]int{}
242-
for _, j := range m.b2j[m.a[i]] {
244+
newindices := m.b2j[m.a[i]]
245+
for _, j := range newindices {
243246
// a[i] matches b[j]
244247
if j < blo {
245248
continue
246249
}
247250
if j >= bhi {
248251
break
249252
}
250-
k := j2len[j-1] + 1
251-
newj2len[j] = k
253+
k := 1
254+
if j > blo {
255+
k = j2len[j-1-blo] + 1
256+
}
257+
newj2len[j-blo] = k
252258
if k > bestsize {
253259
besti, bestj, bestsize = i-k+1, j-k+1, k
254260
}
255261
}
256-
j2len = newj2len
262+
// j2len = newj2len, clear and reuse j2len as newj2len
263+
for _, j := range indices {
264+
if j < blo {
265+
continue
266+
}
267+
if j >= bhi {
268+
break
269+
}
270+
j2len[j-blo] = 0
271+
}
272+
indices = newindices
273+
j2len, newj2len = newj2len, j2len
257274
}
258275

259276
// Extend the best by non-junk elements on each end. In particular,

difflib/difflib_test.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@ import (
55
"fmt"
66
"math"
77
"reflect"
8+
"runtime"
89
"strings"
910
"testing"
11+
"./tester"
1012
)
1113

1214
func assertAlmostEqual(t *testing.T, a, b float64, places int) {
@@ -432,6 +434,41 @@ func BenchmarkSplitLines10000(b *testing.B) {
432434
benchmarkSplitLines(b, 10000)
433435
}
434436

437+
func prepareFilesToDiff(count, seed int) (As, Bs [][]string) {
438+
defer runtime.GC()
439+
return tester.PrepareStringsToDiff(count, seed)
440+
}
441+
442+
func BenchmarkDiffer(b *testing.B) {
443+
A, B := prepareFilesToDiff(b.N, 0)
444+
fmt.Printf("\nDiff length:")
445+
b.ResetTimer()
446+
differ := NewDiffer()
447+
for i := range A {
448+
var x []string
449+
for n := 0; n < 5; n++ {
450+
x, _ = differ.Compare(A[i], B[i])
451+
}
452+
fmt.Printf(" %v", len(x))
453+
}
454+
fmt.Printf("\n")
455+
}
456+
457+
func BenchmarkMatcher(b *testing.B) {
458+
A, B := prepareFilesToDiff(b.N, 0)
459+
fmt.Printf("\nOpcodes count:")
460+
b.ResetTimer()
461+
for i := range A {
462+
var x []OpCode
463+
for n := 0; n < 5; n++ {
464+
sm := NewMatcher(A[i], B[i])
465+
x = sm.GetOpCodes()
466+
}
467+
fmt.Printf(" %v", len(x))
468+
}
469+
fmt.Printf("\n")
470+
}
471+
435472
func TestDifferCompare(t *testing.T) {
436473
diff := NewDiffer()
437474
// Test

difflib/tester/tester.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
package tester
2+
3+
import (
4+
"math/rand"
5+
"time"
6+
)
7+
8+
// prepareStrings generates two pseudo-random documents, A and B, of 4000
// lines each, that share a fair amount of content and are meant as input for
// diff benchmarks.
//
// The documents are derived from seed, so equal seeds give equal documents.
// The special value -1 seeds from the current time instead, which makes the
// result non-reproducible.
//
// Construction:
//  1. a pool of 4000 random lines (0-99 random bytes each) is generated;
//  2. every line of A and B is drawn from the pool, with a 40% chance of
//     being restricted to the first 50 pool entries so that some lines
//     repeat often;
//  3. random runs of consecutive lines are copied from A to B, then from B
//     to A, so the documents have common subsequences.
//
// A private generator is used, so the process-wide math/rand state is not
// reseeded by calling this function.
func prepareStrings(seed int64) (A, B []string) {
	const (
		docLines    = 4000 // size of the pool and of each document
		maxLineLen  = 100  // exclusive upper bound on a line's byte length
		commonLines = 50   // leading pool entries that are over-represented
		commonPct   = 40   // percent chance of drawing from the common entries
	)

	if seed == -1 {
		seed = time.Now().UnixNano()
	}
	rng := rand.New(rand.NewSource(seed))

	// Generate the pool of random lines.
	lines := make([]string, docLines)
	for i := range lines {
		p := make([]byte, rng.Intn(maxLineLen))
		// (*rand.Rand).Read is documented to always return a nil error.
		rng.Read(p)
		lines[i] = string(p)
	}

	// pick draws one line from the pool, favouring the common entries.
	pick := func() string {
		if rng.Intn(100) < commonPct {
			return lines[rng.Intn(commonLines)]
		}
		return lines[rng.Intn(len(lines))]
	}

	// Build both documents; A[i] is drawn before B[i].
	A = make([]string, docLines)
	B = make([]string, len(A))
	for i := range A {
		A[i] = pick()
		B[i] = pick()
	}

	// maxcopy bounds the length of a copied run. It must be at least 3:
	// with maxcopy == 1 the run-length draw below would call Intn(0) and
	// panic, and with maxcopy == 2 every run would have length 0, so the
	// copy loops could never make progress.
	maxcopy := rng.Intn(len(A)-1) + 1
	if maxcopy < 3 {
		maxcopy = 3
	}

	// copyRuns repeatedly picks a run length and a start position in each
	// document, then calls assign for every aligned index pair of the run,
	// until a randomly chosen number of lines has been transferred. Runs are
	// cut short at the end of either document.
	copyRuns := func(assign func(a, b int)) {
		for copied, tocopy := 0, rng.Intn(2*len(A)/3); copied < tocopy; {
			l := rng.Intn(rng.Intn(maxcopy-1) + 1)
			a, b := rng.Intn(len(A)), rng.Intn(len(B))
			for n := 0; a < len(A) && b < len(B) && n < l; a, b, n = a+1, b+1, n+1 {
				assign(a, b)
				copied++
			}
		}
	}

	// Do some copies from A to B ...
	copyRuns(func(a, b int) { B[b] = A[a] })
	// ... and some from B to A.
	copyRuns(func(a, b int) { A[a] = B[b] })

	return A, B
}
58+
59+
func PrepareStringsToDiff(count, seed int) (As, Bs [][]string) {
60+
As = make([][]string, count)
61+
Bs = make([][]string, count)
62+
for i := range As {
63+
As[i], Bs[i] = prepareStrings(int64(i+seed))
64+
}
65+
return
66+
}

0 commit comments

Comments
 (0)