diff --git a/statistics/cmsketch.go b/statistics/cmsketch.go index 00935d0c45638..b915da6929a15 100644 --- a/statistics/cmsketch.go +++ b/statistics/cmsketch.go @@ -65,11 +65,10 @@ func NewCMSketch(d, w int32) *CMSketch { // topNHelper wraps some variables used when building cmsketch with top n. type topNHelper struct { sampleSize uint64 - counter map[hack.MutableString]uint64 - sorted []uint64 + sorted []dataCnt onlyOnceItems uint64 sumTopN uint64 - lastVal uint64 + actualNumTop uint32 } func newTopNHelper(sample [][]byte, numTop uint32) *topNHelper { @@ -77,20 +76,16 @@ func newTopNHelper(sample [][]byte, numTop uint32) *topNHelper { for i := range sample { counter[hack.String(sample[i])]++ } - sorted, onlyOnceItems := make([]uint64, 0, len(counter)), uint64(0) - for _, cnt := range counter { - sorted = append(sorted, cnt) + sorted, onlyOnceItems := make([]dataCnt, 0, len(counter)), uint64(0) + for key, cnt := range counter { + sorted = append(sorted, dataCnt{hack.Slice(string(key)), cnt}) if cnt == 1 { onlyOnceItems++ } } - sort.Slice(sorted, func(i, j int) bool { - return sorted[i] > sorted[j] - }) + sort.SliceStable(sorted, func(i, j int) bool { return sorted[i].cnt > sorted[j].cnt }) var ( - // last is the last element in top N index should occurres atleast `last` times. - last uint64 sumTopN uint64 sampleNDV = uint32(len(sorted)) ) @@ -99,18 +94,18 @@ func newTopNHelper(sample [][]byte, numTop uint32) *topNHelper { // frequency of the n-th element are added to the TopN statistics. We chose // 2/3 as an empirical value because the average cardinality estimation // error is relatively small compared with 1/2. - for i := uint32(0); i < sampleNDV && i < numTop*2; i++ { - if i >= numTop && sorted[i]*3 < sorted[numTop-1]*2 && last != sorted[i] { + var actualNumTop uint32 + for ; actualNumTop < sampleNDV && actualNumTop < numTop*2; actualNumTop++ { + if actualNumTop >= numTop && sorted[actualNumTop].cnt*3 < sorted[numTop-1].cnt*2 { break } - if sorted[i] == 1 { + if sorted[actualNumTop].cnt == 1 { break } - last = sorted[i] - sumTopN += sorted[i] + sumTopN += sorted[actualNumTop].cnt } - return &topNHelper{uint64(len(sample)), counter, sorted, onlyOnceItems, sumTopN, last} + return &topNHelper{uint64(len(sample)), sorted, onlyOnceItems, sumTopN, actualNumTop} } // NewCMSketchWithTopN returns a new CM sketch with TopN elements, the estimate NDV and the scale ratio. @@ -130,22 +125,23 @@ func buildCMSWithTopN(helper *topNHelper, d, w int32, scaleRatio uint64, default enableTopN := helper.sampleSize/topNThreshold <= helper.sumTopN if enableTopN { c.topN = make(map[uint64][]*TopNMeta) + for i := uint32(0); i < helper.actualNumTop; i++ { + data, cnt := helper.sorted[i].data, helper.sorted[i].cnt + h1, h2 := murmur3.Sum128(data) + c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, cnt * scaleRatio}) + } + helper.sorted = helper.sorted[helper.actualNumTop:] } c.defaultValue = defaultVal - for counterKey, cnt := range helper.counter { - data := hack.Slice(string(counterKey)) + for i := range helper.sorted { + data, cnt := helper.sorted[i].data, helper.sorted[i].cnt // If the value only occurred once in the sample, we assumes that there is no difference with // value that does not occurred in the sample. rowCount := defaultVal if cnt > 1 { rowCount = cnt * scaleRatio } - if enableTopN && cnt >= helper.lastVal { - h1, h2 := murmur3.Sum128(data) - c.topN[h1] = append(c.topN[h1], &TopNMeta{h2, data, rowCount}) - } else { - c.insertBytesByCount(data, rowCount) - } + c.insertBytesByCount(data, rowCount) } return } diff --git a/statistics/cmsketch_test.go b/statistics/cmsketch_test.go index 15decc68384c2..1f3d130f9e909 100644 --- a/statistics/cmsketch_test.go +++ b/statistics/cmsketch_test.go @@ -194,6 +194,7 @@ func (s *testStatisticsSuite) TestCMSketchTopN(c *C) { for _, t := range tests { lSketch, lMap, err := buildCMSketchTopNAndMap(d, w, 20, 1000, 0, total, imax, t.zipfFactor) c.Check(err, IsNil) + c.Assert(len(lSketch.TopN()), LessEqual, 40) avg, err := averageAbsoluteError(lSketch, lMap) c.Assert(err, IsNil) c.Check(avg, LessEqual, t.avgError) diff --git a/statistics/sample.go b/statistics/sample.go index 96cc22dead8a8..58460b003d196 100644 --- a/statistics/sample.go +++ b/statistics/sample.go @@ -25,7 +25,6 @@ import ( "github.com/pingcap/tidb/sessionctx/stmtctx" "github.com/pingcap/tidb/types" "github.com/pingcap/tidb/util/chunk" - "github.com/pingcap/tidb/util/hack" "github.com/pingcap/tidb/util/sqlexec" "github.com/pingcap/tipb/go-tipb" "github.com/spaolacci/murmur3" @@ -272,19 +271,13 @@ func (c *SampleCollector) ExtractTopN(numTop uint32) { helper := newTopNHelper(values, numTop) cms := c.CMSketch cms.topN = make(map[uint64][]*TopNMeta) - dataCnts := make([]dataCnt, 0, len(helper.counter)) - for key, cnt := range helper.counter { - if cnt >= helper.lastVal { - dataCnts = append(dataCnts, dataCnt{hack.Slice(string(key)), cnt}) - } - } - // Sort them decreasingly so we can handle most frequent values first and reduce the probability of hash collision + // Process them decreasingly so we can handle most frequent values first and reduce the probability of hash collision // by small values. - sort.SliceStable(dataCnts, func(i, j int) bool { return dataCnts[i].cnt >= dataCnts[j].cnt }) - for _, dc := range dataCnts { - h1, h2 := murmur3.Sum128(dc.data) + for i := uint32(0); i < helper.actualNumTop; i++ { + data := helper.sorted[i].data + h1, h2 := murmur3.Sum128(data) realCnt := cms.queryHashValue(h1, h2) cms.subValue(h1, h2, realCnt) - cms.topN[h1] = append(cms.topN[h1], &TopNMeta{h2, dc.data, realCnt}) + cms.topN[h1] = append(cms.topN[h1], &TopNMeta{h2, data, realCnt}) } }