Skip to content

Commit

Permalink
stats: do not split excluded lower value ranges (pingcap#12009)
Browse files Browse the repository at this point in the history
  • Loading branch information
alivxxx authored and sre-bot committed Sep 11, 2019
1 parent 45fcba1 commit 440bb74
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 89 deletions.
14 changes: 10 additions & 4 deletions statistics/feedback.go
Original file line number Diff line number Diff line change
Expand Up @@ -313,15 +313,21 @@ func buildBucketFeedback(h *Histogram, feedback *QueryFeedback) (map[int]*Bucket
if skip {
continue
}
idx, _ := h.Bounds.LowerBound(0, fb.Lower)
idx := h.Bounds.UpperBound(0, fb.Lower)
bktIdx := 0
// The last bucket also stores the feedback that falls outside the upper bound.
if idx >= h.Bounds.NumRows()-2 {
if idx >= h.Bounds.NumRows()-1 {
bktIdx = h.Len() - 1
} else if h.Len() == 1 {
bktIdx = 0
} else {
bktIdx = idx / 2
if idx == 0 {
bktIdx = 0
} else {
bktIdx = (idx - 1) / 2
}
// Make sure that this feedback lies within the bucket.
if chunk.Compare(h.Bounds.GetRow(2*bktIdx+1), 0, fb.Upper) < 0 {
if chunk.Compare(h.Bounds.GetRow(2*(bktIdx+1)), 0, fb.Upper) < 0 {
continue
}
}
Expand Down
15 changes: 7 additions & 8 deletions statistics/feedback_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,14 +72,13 @@ func (s *testFeedbackSuite) TestUpdateHistogram(c *C) {
defaultBucketCount = 7
defer func() { defaultBucketCount = originBucketCount }()
c.Assert(UpdateHistogram(q.Hist, q).ToString(0), Equals,
"column:0 ndv:10058 totColSize:0\n"+
"num: 10000 lower_bound: 0 upper_bound: 1 repeats: 0\n"+
"num: 9 lower_bound: 2 upper_bound: 7 repeats: 0\n"+
"num: 11 lower_bound: 8 upper_bound: 19 repeats: 0\n"+
"num: 0 lower_bound: 20 upper_bound: 20 repeats: 0\n"+
"num: 18 lower_bound: 21 upper_bound: 39 repeats: 0\n"+
"num: 18 lower_bound: 40 upper_bound: 58 repeats: 0\n"+
"num: 2 lower_bound: 59 upper_bound: 60 repeats: 0")
"column:0 ndv:10053 totColSize:0\n"+
"num: 10001 lower_bound: 0 upper_bound: 2 repeats: 0\n"+
"num: 7 lower_bound: 2 upper_bound: 5 repeats: 0\n"+
"num: 4 lower_bound: 5 upper_bound: 7 repeats: 0\n"+
"num: 11 lower_bound: 10 upper_bound: 20 repeats: 0\n"+
"num: 19 lower_bound: 30 upper_bound: 49 repeats: 0\n"+
"num: 11 lower_bound: 50 upper_bound: 60 repeats: 0")
}

func (s *testFeedbackSuite) TestSplitBuckets(c *C) {
Expand Down
6 changes: 3 additions & 3 deletions statistics/handle/update.go
Original file line number Diff line number Diff line change
Expand Up @@ -766,11 +766,11 @@ func formatBuckets(hg *statistics.Histogram, lowBkt, highBkt, idxCols int) strin
return hg.BucketToString(lowBkt, idxCols)
}
if lowBkt+1 == highBkt {
return fmt.Sprintf("%s, %s", hg.BucketToString(lowBkt, 0), hg.BucketToString(highBkt, 0))
return fmt.Sprintf("%s, %s", hg.BucketToString(lowBkt, idxCols), hg.BucketToString(highBkt, idxCols))
}
// Do not care about the middle buckets; only summarize them.
return fmt.Sprintf("%s, (%d buckets, total count %d), %s", hg.BucketToString(lowBkt, 0),
highBkt-lowBkt-1, hg.Buckets[highBkt-1].Count-hg.Buckets[lowBkt].Count, hg.BucketToString(highBkt, 0))
return fmt.Sprintf("%s, (%d buckets, total count %d), %s", hg.BucketToString(lowBkt, idxCols),
highBkt-lowBkt-1, hg.Buckets[highBkt-1].Count-hg.Buckets[lowBkt].Count, hg.BucketToString(highBkt, idxCols))
}

func colRangeToStr(c *statistics.Column, ran *ranger.Range, actual int64, factor float64) string {
Expand Down
82 changes: 41 additions & 41 deletions statistics/handle/update_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -684,18 +684,18 @@ func (s *testStatsSuite) TestSplitRange(c *C) {
{
points: []int64{0, 1, 3, 8, 8, 20},
exclude: []bool{true, false, true, false, true, false},
result: "(0,1],(3,5],(5,7],(7,8],(8,20]",
result: "(0,1],(3,7),[7,8),[8,8],(8,10),[10,20]",
},
{
points: []int64{8, 10, 20, 30},
exclude: []bool{false, false, true, true},
result: "[8,8],(8,10],(20,30)",
result: "[8,10),[10,10],(20,30)",
},
{
// test remove invalid range
points: []int64{8, 9},
exclude: []bool{false, true},
result: "[8,8]",
result: "[8,9)",
},
}
for _, t := range tests {
Expand Down Expand Up @@ -743,25 +743,25 @@ func (s *testStatsSuite) TestQueryFeedback(c *C) {
// test primary key feedback
sql: "select * from t where t.a <= 5 order by a desc",
hist: "column:1 ndv:4 totColSize:0\n" +
"num: 1 lower_bound: -9223372036854775808 upper_bound: 1 repeats: 0\n" +
"num: 1 lower_bound: 2 upper_bound: 2 repeats: 1\n" +
"num: 2 lower_bound: 3 upper_bound: 5 repeats: 0",
"num: 1 lower_bound: -9223372036854775808 upper_bound: 2 repeats: 0\n" +
"num: 2 lower_bound: 2 upper_bound: 4 repeats: 0\n" +
"num: 1 lower_bound: 4 upper_bound: 4 repeats: 1",
idxCols: 0,
},
{
// test index feedback by double read
sql: "select * from t use index(idx) where t.b <= 5",
hist: "index:1 ndv:2\n" +
"num: 2 lower_bound: -inf upper_bound: 2 repeats: 0\n" +
"num: 2 lower_bound: 3 upper_bound: 6 repeats: 0",
"num: 3 lower_bound: -inf upper_bound: 5 repeats: 0\n" +
"num: 1 lower_bound: 5 upper_bound: 5 repeats: 1",
idxCols: 1,
},
{
// test index feedback by single read
sql: "select b from t use index(idx) where t.b <= 5",
hist: "index:1 ndv:2\n" +
"num: 2 lower_bound: -inf upper_bound: 2 repeats: 0\n" +
"num: 2 lower_bound: 3 upper_bound: 6 repeats: 0",
"num: 3 lower_bound: -inf upper_bound: 5 repeats: 0\n" +
"num: 1 lower_bound: 5 upper_bound: 5 repeats: 1",
idxCols: 1,
},
}
Expand Down Expand Up @@ -855,7 +855,7 @@ func (s *testStatsSuite) TestQueryFeedbackForPartition(c *C) {
// test primary key feedback
sql: "select * from t where t.a <= 5",
hist: "column:1 ndv:2 totColSize:0\n" +
"num: 1 lower_bound: -9223372036854775808 upper_bound: 1 repeats: 0\n" +
"num: 1 lower_bound: -9223372036854775808 upper_bound: 2 repeats: 0\n" +
"num: 1 lower_bound: 2 upper_bound: 5 repeats: 0",
idxCols: 0,
},
Expand Down Expand Up @@ -987,17 +987,17 @@ func (s *testStatsSuite) TestUpdateStatsByLocalFeedback(c *C) {

c.Assert(tbl.Columns[tblInfo.Columns[0].ID].ToString(0), Equals, "column:1 ndv:3 totColSize:0\n"+
"num: 1 lower_bound: 1 upper_bound: 1 repeats: 1\n"+
"num: 1 lower_bound: 2 upper_bound: 2 repeats: 1\n"+
"num: 2 lower_bound: 3 upper_bound: 9223372036854775807 repeats: 0")
"num: 2 lower_bound: 2 upper_bound: 4 repeats: 0\n"+
"num: 1 lower_bound: 4 upper_bound: 9223372036854775807 repeats: 0")
sc := &stmtctx.StatementContext{TimeZone: time.Local}
low, err := codec.EncodeKey(sc, nil, types.NewIntDatum(5))
c.Assert(err, IsNil)

c.Assert(tbl.Indices[tblInfo.Indices[0].ID].CMSketch.QueryBytes(low), Equals, uint64(2))

c.Assert(tbl.Indices[tblInfo.Indices[0].ID].ToString(1), Equals, "index:1 ndv:2\n"+
"num: 2 lower_bound: -inf upper_bound: 2 repeats: 0\n"+
"num: 2 lower_bound: 3 upper_bound: 6 repeats: 0")
"num: 2 lower_bound: -inf upper_bound: 5 repeats: 0\n"+
"num: 1 lower_bound: 5 upper_bound: 5 repeats: 1")

// Test that it won't cause panic after update.
testKit.MustQuery("select * from t use index(idx) where b > 0")
Expand Down Expand Up @@ -1038,8 +1038,8 @@ func (s *testStatsSuite) TestUpdatePartitionStatsByLocalFeedback(c *C) {

c.Assert(tbl.Columns[tblInfo.Columns[0].ID].ToString(0), Equals, "column:1 ndv:3 totColSize:0\n"+
"num: 1 lower_bound: 1 upper_bound: 1 repeats: 1\n"+
"num: 1 lower_bound: 2 upper_bound: 2 repeats: 1\n"+
"num: 2 lower_bound: 3 upper_bound: 9223372036854775807 repeats: 0")
"num: 2 lower_bound: 2 upper_bound: 4 repeats: 0\n"+
"num: 1 lower_bound: 4 upper_bound: 9223372036854775807 repeats: 0")
}

type logHook struct {
Expand Down Expand Up @@ -1112,13 +1112,13 @@ func (s *testStatsSuite) TestLogDetailedInfo(c *C) {
}{
{
sql: "select * from t where t.a <= 15",
result: "[stats-feedback] test.t, column=a, rangeStr=range: [-inf,7), actual: 8, expected: 7, buckets: {num: 8 lower_bound: 0 upper_bound: 7 repeats: 1}" +
result: "[stats-feedback] test.t, column=a, rangeStr=range: [-inf,8), actual: 8, expected: 8, buckets: {num: 8 lower_bound: 0 upper_bound: 7 repeats: 1, num: 8 lower_bound: 8 upper_bound: 15 repeats: 1}" +
"[stats-feedback] test.t, column=a, rangeStr=range: [8,15), actual: 8, expected: 7, buckets: {num: 8 lower_bound: 8 upper_bound: 15 repeats: 1}",
},
{
sql: "select * from t use index(idx) where t.b <= 15",
result: "[stats-feedback] test.t, index=idx, rangeStr=range: [-inf,7), actual: 8, expected: 7, histogram: {num: 8 lower_bound: 0 upper_bound: 7 repeats: 1}" +
"[stats-feedback] test.t, index=idx, rangeStr=range: [8,15), actual: 8, expected: 7, histogram: {num: 8 lower_bound: 8 upper_bound: 15 repeats: 1}",
result: "[stats-feedback] test.t, index=idx, rangeStr=range: [-inf,8), actual: 8, expected: 8, histogram: {num: 8 lower_bound: 0 upper_bound: 7 repeats: 1, num: 8 lower_bound: 8 upper_bound: 15 repeats: 1}" +
"[stats-feedback] test.t, index=idx, rangeStr=range: [8,16), actual: 8, expected: 8, histogram: {num: 8 lower_bound: 8 upper_bound: 15 repeats: 1, num: 4 lower_bound: 16 upper_bound: 19 repeats: 1}",
},
{
sql: "select b from t use index(idx_ba) where b = 1 and a <= 5",
Expand Down Expand Up @@ -1466,9 +1466,9 @@ func (s *testStatsSuite) TestAbnormalIndexFeedback(c *C) {
// The real count of `a = 1` is 0.
sql: "select * from t where a = 1 and b < 21",
hist: "column:2 ndv:20 totColSize:20\n" +
"num: 4 lower_bound: -9223372036854775808 upper_bound: 6 repeats: 0\n" +
"num: 3 lower_bound: 7 upper_bound: 13 repeats: 0\n" +
"num: 6 lower_bound: 14 upper_bound: 19 repeats: 1",
"num: 5 lower_bound: -9223372036854775808 upper_bound: 7 repeats: 0\n" +
"num: 4 lower_bound: 7 upper_bound: 14 repeats: 0\n" +
"num: 4 lower_bound: 14 upper_bound: 21 repeats: 0",
rangeID: tblInfo.Columns[1].ID,
idxID: tblInfo.Indices[0].ID,
eqCount: 3,
Expand All @@ -1477,9 +1477,9 @@ func (s *testStatsSuite) TestAbnormalIndexFeedback(c *C) {
// The real count of `b > 10` is 0.
sql: "select * from t where a = 2 and b > 10",
hist: "column:2 ndv:20 totColSize:20\n" +
"num: 4 lower_bound: -9223372036854775808 upper_bound: 6 repeats: 0\n" +
"num: 2 lower_bound: 7 upper_bound: 13 repeats: 0\n" +
"num: 6 lower_bound: 14 upper_bound: 19 repeats: 1",
"num: 5 lower_bound: -9223372036854775808 upper_bound: 7 repeats: 0\n" +
"num: 6 lower_bound: 7 upper_bound: 14 repeats: 0\n" +
"num: 7 lower_bound: 14 upper_bound: 9223372036854775807 repeats: 0",
rangeID: tblInfo.Columns[1].ID,
idxID: tblInfo.Indices[0].ID,
eqCount: 3,
Expand Down Expand Up @@ -1531,25 +1531,25 @@ func (s *testStatsSuite) TestFeedbackRanges(c *C) {
{
sql: "select * from t where a <= 50 or (a > 130 and a < 140)",
hist: "column:1 ndv:30 totColSize:0\n" +
"num: 8 lower_bound: -128 upper_bound: 7 repeats: 0\n" +
"num: 8 lower_bound: 8 upper_bound: 15 repeats: 0\n" +
"num: 8 lower_bound: -128 upper_bound: 8 repeats: 0\n" +
"num: 8 lower_bound: 8 upper_bound: 16 repeats: 0\n" +
"num: 14 lower_bound: 16 upper_bound: 50 repeats: 0",
colID: 1,
},
{
sql: "select * from t where a >= 10",
hist: "column:1 ndv:30 totColSize:0\n" +
"num: 8 lower_bound: -128 upper_bound: 7 repeats: 0\n" +
"num: 8 lower_bound: 8 upper_bound: 15 repeats: 0\n" +
"num: 8 lower_bound: -128 upper_bound: 8 repeats: 0\n" +
"num: 8 lower_bound: 8 upper_bound: 16 repeats: 0\n" +
"num: 14 lower_bound: 16 upper_bound: 127 repeats: 0",
colID: 1,
},
{
sql: "select * from t use index(idx) where a = 1 and (b <= 50 or (b > 130 and b < 140))",
hist: "column:2 ndv:20 totColSize:30\n" +
"num: 7 lower_bound: -128 upper_bound: 6 repeats: 0\n" +
"num: 7 lower_bound: 7 upper_bound: 13 repeats: 1\n" +
"num: 6 lower_bound: 14 upper_bound: 19 repeats: 1",
"num: 8 lower_bound: -128 upper_bound: 7 repeats: 0\n" +
"num: 8 lower_bound: 7 upper_bound: 14 repeats: 0\n" +
"num: 7 lower_bound: 14 upper_bound: 51 repeats: 0",
colID: 2,
},
}
Expand Down Expand Up @@ -1604,32 +1604,32 @@ func (s *testStatsSuite) TestUnsignedFeedbackRanges(c *C) {
{
sql: "select * from t where a <= 50",
hist: "column:1 ndv:30 totColSize:10\n" +
"num: 8 lower_bound: 0 upper_bound: 7 repeats: 0\n" +
"num: 8 lower_bound: 8 upper_bound: 15 repeats: 0\n" +
"num: 8 lower_bound: 0 upper_bound: 8 repeats: 0\n" +
"num: 8 lower_bound: 8 upper_bound: 16 repeats: 0\n" +
"num: 14 lower_bound: 16 upper_bound: 50 repeats: 0",
tblName: "t",
},
{
sql: "select count(*) from t",
hist: "column:1 ndv:30 totColSize:10\n" +
"num: 8 lower_bound: 0 upper_bound: 7 repeats: 0\n" +
"num: 8 lower_bound: 8 upper_bound: 15 repeats: 0\n" +
"num: 8 lower_bound: 0 upper_bound: 8 repeats: 0\n" +
"num: 8 lower_bound: 8 upper_bound: 16 repeats: 0\n" +
"num: 14 lower_bound: 16 upper_bound: 255 repeats: 0",
tblName: "t",
},
{
sql: "select * from t1 where a <= 50",
hist: "column:1 ndv:30 totColSize:10\n" +
"num: 8 lower_bound: 0 upper_bound: 7 repeats: 0\n" +
"num: 8 lower_bound: 8 upper_bound: 15 repeats: 0\n" +
"num: 8 lower_bound: 0 upper_bound: 8 repeats: 0\n" +
"num: 8 lower_bound: 8 upper_bound: 16 repeats: 0\n" +
"num: 14 lower_bound: 16 upper_bound: 50 repeats: 0",
tblName: "t1",
},
{
sql: "select count(*) from t1",
hist: "column:1 ndv:30 totColSize:10\n" +
"num: 8 lower_bound: 0 upper_bound: 7 repeats: 0\n" +
"num: 8 lower_bound: 8 upper_bound: 15 repeats: 0\n" +
"num: 8 lower_bound: 0 upper_bound: 8 repeats: 0\n" +
"num: 8 lower_bound: 8 upper_bound: 16 repeats: 0\n" +
"num: 14 lower_bound: 16 upper_bound: 18446744073709551615 repeats: 0",
tblName: "t1",
},
Expand Down
57 changes: 31 additions & 26 deletions statistics/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -432,41 +432,43 @@ func (hg *Histogram) typeMatch(ranges []*ranger.Range) bool {
return true
}

// SplitRange splits the range according to the histogram upper bound. Note that we treat last bucket's upper bound
// as inf, so all the split Ranges will totally fall in one of the (-inf, u(0)], (u(0), u(1)],...(u(n-3), u(n-2)],
// (u(n-2), +inf), where n is the number of buckets, u(i) is the i-th bucket's upper bound.
// SplitRange splits the range according to the histogram lower bound. Note that we treat first bucket's lower bound
// as -inf and last bucket's upper bound as +inf, so all the split ranges will totally fall in one of the (-inf, l(1)),
// [l(1), l(2)),...[l(n-2), l(n-1)), [l(n-1), +inf), where n is the number of buckets, l(i) is the i-th bucket's lower bound.
func (hg *Histogram) SplitRange(sc *stmtctx.StatementContext, oldRanges []*ranger.Range, encoded bool) ([]*ranger.Range, bool) {
if !hg.typeMatch(oldRanges) {
return oldRanges, false
}
// Treat the only bucket as (-inf, +inf), so we do not need to split it.
if hg.Len() == 1 {
return oldRanges, true
}
ranges := make([]*ranger.Range, 0, len(oldRanges))
for _, ran := range oldRanges {
ranges = append(ranges, ran.Clone())
}
split := make([]*ranger.Range, 0, len(ranges))
for len(ranges) > 0 {
// Find the last bound that greater or equal to the LowVal.
// Find the first bound that is greater than the LowVal.
idx := hg.Bounds.UpperBound(0, &ranges[0].LowVal[0])
if !ranges[0].LowExclude && idx > 0 {
cmp := chunk.Compare(hg.Bounds.GetRow(idx-1), 0, &ranges[0].LowVal[0])
if cmp == 0 {
idx--
}
}
// Treat last bucket's upper bound as inf, so we do not need split any more.
if idx >= hg.Bounds.NumRows()-2 {
// Treat the last bucket's upper bound as +inf, so we do not need to split any more.
if idx >= hg.Bounds.NumRows()-1 {
split = append(split, ranges...)
break
}
// Get the corresponding upper bound.
if idx%2 == 0 {
// Treat the first bucket's lower bound as -inf, so just advance to the next lower bound.
if idx == 0 {
idx = 2
}
// Get the next lower bound.
if idx%2 == 1 {
idx++
}
upperBound := hg.Bounds.GetRow(idx)
lowerBound := hg.Bounds.GetRow(idx)
var i int
// Find the first range that need to be split by the upper bound.
// Find the first range that needs to be split by the lower bound.
for ; i < len(ranges); i++ {
if chunk.Compare(upperBound, 0, &ranges[i].HighVal[0]) < 0 {
if chunk.Compare(lowerBound, 0, &ranges[i].HighVal[0]) <= 0 {
break
}
}
Expand All @@ -475,17 +477,20 @@ func (hg *Histogram) SplitRange(sc *stmtctx.StatementContext, oldRanges []*range
if len(ranges) == 0 {
break
}
// Split according to the upper bound.
cmp := chunk.Compare(upperBound, 0, &ranges[0].LowVal[0])
if cmp > 0 || (cmp == 0 && !ranges[0].LowExclude) {
upper := upperBound.GetDatum(0, hg.Tp)
split = append(split, &ranger.Range{
// Split according to the lower bound.
cmp := chunk.Compare(lowerBound, 0, &ranges[0].LowVal[0])
if cmp > 0 {
lower := lowerBound.GetDatum(0, hg.Tp)
newRange := &ranger.Range{
LowExclude: ranges[0].LowExclude,
LowVal: []types.Datum{ranges[0].LowVal[0]},
HighVal: []types.Datum{upper},
HighExclude: false})
ranges[0].LowVal[0] = upper
ranges[0].LowExclude = true
HighVal: []types.Datum{lower},
HighExclude: true}
if validRange(sc, newRange, encoded) {
split = append(split, newRange)
}
ranges[0].LowVal[0] = lower
ranges[0].LowExclude = false
if !validRange(sc, ranges[0], encoded) {
ranges = ranges[1:]
}
Expand Down
12 changes: 5 additions & 7 deletions statistics/histogram_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,9 @@ func (s *testStatisticsSuite) TestNewHistogramBySelectivity(c *C) {
node.Ranges = append(node.Ranges, &ranger.Range{LowVal: types.MakeDatums(25), HighVal: []types.Datum{types.MaxValueDatum()}})
intColResult := `column:1 ndv:16 totColSize:0
num: 30 lower_bound: 0 upper_bound: 2 repeats: 10
num: 10 lower_bound: 3 upper_bound: 5 repeats: 10
num: 20 lower_bound: 6 upper_bound: 8 repeats: 10
num: 20 lower_bound: 9 upper_bound: 11 repeats: 0
num: 20 lower_bound: 6 upper_bound: 8 repeats: 0
num: 30 lower_bound: 9 upper_bound: 11 repeats: 0
num: 10 lower_bound: 12 upper_bound: 14 repeats: 0
num: 20 lower_bound: 24 upper_bound: 26 repeats: 10
num: 30 lower_bound: 27 upper_bound: 29 repeats: 0`

stringCol := &Column{}
Expand Down Expand Up @@ -85,9 +83,9 @@ num: 30 lower_bound: 27 upper_bound: 29 repeats: 0`
node2.Ranges = append(node2.Ranges, &ranger.Range{LowVal: types.MakeDatums("ggg"), HighVal: []types.Datum{types.MaxValueDatum()}})
stringColResult := `column:2 ndv:9 totColSize:0
num: 60 lower_bound: a upper_bound: aaaabbbb repeats: 0
num: 60 lower_bound: bbbb upper_bound: fdsfdsfds repeats: 20
num: 60 lower_bound: kkkkk upper_bound: ooooo repeats: 20
num: 60 lower_bound: oooooo upper_bound: sssss repeats: 20
num: 52 lower_bound: bbbb upper_bound: fdsfdsfds repeats: 0
num: 54 lower_bound: kkkkk upper_bound: ooooo repeats: 0
num: 60 lower_bound: oooooo upper_bound: sssss repeats: 0
num: 60 lower_bound: ssssssu upper_bound: yyyyy repeats: 0`

newColl := coll.NewHistCollBySelectivity(sc, []*StatsNode{node, node2})
Expand Down

0 comments on commit 440bb74

Please sign in to comment.