Skip to content

Commit 9a6f4eb

Browse files
committed
feat: 对slice编码时支持根据数据频率确定编码权重
1 parent e5aca34 commit 9a6f4eb

File tree

4 files changed

+127
-0
lines changed

4 files changed

+127
-0
lines changed

go.mod

+2
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ require (
66
github.com/compression-algorithm-research-lab/go-bit-buffer v0.0.0-20231024145723-e6a2ab4a8de4 // indirect
77
github.com/davecgh/go-spew v1.1.1 // indirect
88
github.com/golang-infrastructure/go-gtypes v0.0.1 // indirect
9+
github.com/golang-infrastructure/go-reflect-utils v0.0.0-20221130143747-965ef2eb09c3 // indirect
10+
github.com/golang-infrastructure/go-tuple v0.0.0-20221215155811-4ed54fe7d579 // indirect
911
github.com/pmezard/go-difflib v1.0.0 // indirect
1012
github.com/stretchr/objx v0.5.1 // indirect
1113
github.com/stretchr/testify v1.8.4 // indirect

go.sum

+4
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
99
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
1010
github.com/golang-infrastructure/go-gtypes v0.0.1 h1:hnM1OYSwLPLGkZ4C6ecAxgmAUaPTjnhnUtRNmJj4p6c=
1111
github.com/golang-infrastructure/go-gtypes v0.0.1/go.mod h1:vFMCxFzxdMInvTtgLZRlWI1rS+mui88sMbL5I+zu1hg=
12+
github.com/golang-infrastructure/go-reflect-utils v0.0.0-20221130143747-965ef2eb09c3 h1:jJ7AdpNdLQudsx1hiXY9iwmauHARV4/UB52KnBh9Se0=
13+
github.com/golang-infrastructure/go-reflect-utils v0.0.0-20221130143747-965ef2eb09c3/go.mod h1:zqXYxqOBa1mL2ilBK6PuH/Wb/Iego7en6XhiKWdZQHI=
14+
github.com/golang-infrastructure/go-tuple v0.0.0-20221215155811-4ed54fe7d579 h1:pQV2/ichhyLoR3aJSNXByuxtdPM2y229Rq5x9DGl5OU=
15+
github.com/golang-infrastructure/go-tuple v0.0.0-20221215155811-4ed54fe7d579/go.mod h1:cn8fHK0Sjxh7nSrnNpRa9wi1wIsmBLsjOip4LTjQz+Q=
1216
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
1317
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
1418
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=

slice_with_frequency.go

+81
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
package unary_coding
2+
3+
import (
4+
bit_buffer "github.com/compression-algorithm-research-lab/go-bit-buffer"
5+
"github.com/golang-infrastructure/go-gtypes"
6+
"github.com/golang-infrastructure/go-tuple"
7+
"sort"
8+
)
9+
10+
// EncodeSliceWithFrequency 基于频率对切片进行一元数组编码
11+
func EncodeSliceWithFrequency[T gtypes.Unsigned](slice []T) ([]byte, []T) {
12+
13+
// 对无符号整数切片中的数字统计词频
14+
countMap := make(map[T]int, 0)
15+
for _, v := range slice {
16+
countMap[v] += 1
17+
}
18+
// 根据词频倒序排序
19+
countSlice := make([]*tuple.Tuple2[T, int], 0)
20+
for v, c := range countMap {
21+
countSlice = append(countSlice, tuple.New2(v, c))
22+
}
23+
sort.Slice(countSlice, func(i, j int) bool {
24+
// 逆序
25+
return countSlice[i].V2 > countSlice[j].V2
26+
})
27+
weightSlice := make([]T, 0)
28+
weightMap := make(map[T]int, 0)
29+
for i, t := range countSlice {
30+
weightSlice = append(weightSlice, t.V1)
31+
weightMap[t.V1] = i + 1
32+
}
33+
34+
// 好了,现在,终于可以开始一元编码进行压缩了
35+
buffer := bit_buffer.New()
36+
for _, v := range slice {
37+
weight := weightMap[v]
38+
for weight > 0 {
39+
buffer.WriteBit(1)
40+
weight--
41+
}
42+
buffer.WriteBit(0)
43+
}
44+
return buffer.Bytes(), weightSlice
45+
}
46+
47+
// DecodeSliceWithFrequency 基于频率对切片进行一元数组编码
48+
func DecodeSliceWithFrequency[T gtypes.Unsigned](bytes []byte, weightSlice []T) []T {
49+
50+
// 生成权重到无符号整数的映射
51+
weightMap := make(map[int]T, 0)
52+
for weight, v := range weightSlice {
53+
weightMap[weight+1] = v
54+
}
55+
56+
// 然后开始解压
57+
slice := make([]T, 0)
58+
buffer := bit_buffer.New().SetBytes(bytes)
59+
for !buffer.IsTail() {
60+
61+
// 先读取一位看看
62+
bit := buffer.ReadBit()
63+
if bit == 0 {
64+
// 说明出现了两个连续的0,那意味着已经读取结束了
65+
break
66+
}
67+
68+
// 然后开始读取连续的1,直到结束
69+
c := 1
70+
for {
71+
bit := buffer.ReadBit()
72+
if bit == 1 {
73+
c++
74+
} else {
75+
break
76+
}
77+
}
78+
slice = append(slice, weightMap[c])
79+
}
80+
return slice
81+
}

slice_with_frequency_test.go

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
package unary_coding
2+
3+
import (
4+
"github.com/stretchr/testify/assert"
5+
"testing"
6+
)
7+
8+
func TestEncodeSliceWithFrequency(t *testing.T) {
9+
rawSlice := []uint64{
10+
10086, 10086, 1024, 10086, 10086, 1024, 10086, 1024, 10086, 10086, 10086, 1024, 10086, 10086,
11+
}
12+
bytes, frequency := EncodeSliceWithFrequency(rawSlice)
13+
assert.Equal(t, 4, len(bytes))
14+
binaryString := ToBinaryString(bytes)
15+
assert.Equal(t, "10101101010110101101010101101010", binaryString)
16+
assert.Equal(t, []uint64{
17+
uint64(10086),
18+
uint64(1024),
19+
}, frequency)
20+
21+
}
22+
23+
func TestDecodeSliceWithFrequency(t *testing.T) {
24+
25+
rawSlice := []uint64{
26+
10086, 10086, 1024, 10086, 10086, 1024, 10086, 1024, 10086, 10086, 10086, 1024, 10086, 10086,
27+
}
28+
bytes, frequency := EncodeSliceWithFrequency(rawSlice)
29+
assert.Equal(t, 4, len(bytes))
30+
binaryString := ToBinaryString(bytes)
31+
assert.Equal(t, "10101101010110101101010101101010", binaryString)
32+
assert.Equal(t, []uint64{
33+
uint64(10086),
34+
uint64(1024),
35+
}, frequency)
36+
37+
unzipSlice := DecodeSliceWithFrequency[uint64](bytes, frequency)
38+
assert.Equal(t, rawSlice, unzipSlice)
39+
40+
}

0 commit comments

Comments
 (0)