stats: convert small range to points in selectivity (pingcap#11524)

matthewli968 · Aug 5, 2019 · 03bb8d7 · 03bb8d7
1 parent 3bd64ba
commit 03bb8d7
Show file tree

Hide file tree

Showing 7 changed files with 277 additions and 19 deletions.
diff --git a/cmd/explaintest/r/explain_easy.result b/cmd/explaintest/r/explain_easy.result
@@ -703,3 +703,12 @@ TableReader_7	1.00	root	data:Selection_6
 └─Selection_6	1.00	cop	eq(test.t.b, 1000-01-01 00:00:00.000000)
   └─TableScan_5	3.00	cop	table:t, range:[-inf,+inf], keep order:false
 drop table t;
+create table t(a int);
+insert into t values (1),(2),(2),(2),(9),(9),(9),(10);
+analyze table t with 1 buckets;
+explain select * from t where a >= 3 and a <= 8;
+id	count	task	operator info
+TableReader_7	0.00	root	data:Selection_6
+└─Selection_6	0.00	cop	ge(test.t.a, 3), le(test.t.a, 8)
+  └─TableScan_5	8.00	cop	table:t, range:[-inf,+inf], keep order:false
+drop table t;
diff --git a/cmd/explaintest/t/explain_easy.test b/cmd/explaintest/t/explain_easy.test
@@ -159,3 +159,9 @@ analyze table t;
 explain select * from t where a = 1;
 explain select * from t where b = "1000-01-01";
 drop table t;
+
+create table t(a int);
+insert into t values (1),(2),(2),(2),(9),(9),(9),(10);
+analyze table t with 1 buckets;
+explain select * from t where a >= 3 and a <= 8;
+drop table t;
diff --git a/statistics/handle/update_test.go b/statistics/handle/update_test.go
@@ -1300,11 +1300,11 @@ func (s *testStatsSuite) TestIndexQueryFeedback(c *C) {
 			hist: "index:1 ndv:20\n" +
 				"num: 16 lower_bound: -inf upper_bound: 7 repeats: 0\n" +
 				"num: 16 lower_bound: 8 upper_bound: 15 repeats: 0\n" +
-				"num: 8 lower_bound: 16 upper_bound: 21 repeats: 0",
+				"num: 9 lower_bound: 16 upper_bound: 21 repeats: 0",
 			rangeID: tblInfo.Indices[0].ID,
 			idxID:   tblInfo.Indices[1].ID,
 			idxCols: 1,
-			eqCount: 39,
+			eqCount: 32,
 		},
 		{
 			sql: "select * from t use index(idx_ac) where a = 1 and c < 21",
@@ -1315,7 +1315,7 @@ func (s *testStatsSuite) TestIndexQueryFeedback(c *C) {
 			rangeID: tblInfo.Columns[2].ID,
 			idxID:   tblInfo.Indices[2].ID,
 			idxCols: 0,
-			eqCount: 35,
+			eqCount: 32,
 		},
 		{
 			sql: "select * from t use index(idx_ad) where a = 1 and d < 21",
@@ -1359,7 +1359,7 @@ func (s *testStatsSuite) TestIndexQueryFeedback(c *C) {
 			rangeID: tblInfo.Columns[6].ID,
 			idxID:   tblInfo.Indices[6].ID,
 			idxCols: 0,
-			eqCount: 32,
+			eqCount: 30,
 		},
 		{
 			sql: `select * from t use index(idx_ah) where a = 1 and h < "1000-01-21"`,

diff --git a/statistics/histogram.go b/statistics/histogram.go
@@ -742,6 +742,18 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
 			}
 			continue
 		}
+		rangeVals := enumRangeValues(rg.LowVal[0], rg.HighVal[0], rg.LowExclude, rg.HighExclude)
+		// The small range case.
+		if rangeVals != nil {
+			for _, val := range rangeVals {
+				cnt, err := c.equalRowCount(sc, val, modifyCount)
+				if err != nil {
+					return 0, err
+				}
+				rowCount += cnt
+			}
+			continue
+		}
 		// The interval case.
 		cnt := c.BetweenRowCount(rg.LowVal[0], rg.HighVal[0])
 		if (c.outOfRange(rg.LowVal[0]) && !rg.LowVal[0].IsNull()) || c.outOfRange(rg.HighVal[0]) {

diff --git a/statistics/scalar.go b/statistics/scalar.go
@@ -16,7 +16,9 @@ package statistics
 import (
 	"encoding/binary"
 	"math"
+	"time"
 
+	"github.com/cznic/mathutil"
 	"github.com/pingcap/parser/mysql"
 	"github.com/pingcap/tidb/sessionctx/stmtctx"
 	"github.com/pingcap/tidb/types"
@@ -175,3 +177,122 @@ func calcFraction4Datums(lower, upper, value *types.Datum) float64 {
 	}
 	return 0.5
 }
+
+const maxNumStep = 10
+
+func enumRangeValues(low, high types.Datum, lowExclude, highExclude bool) []types.Datum { // nil means the range is not enumerated
+	if low.Kind() != high.Kind() {
+		return nil
+	}
+	exclude := 0 // number of excluded endpoints; subtracted from the value count below
+	if lowExclude {
+		exclude++
+	}
+	if highExclude {
+		exclude++
+	}
+	switch low.Kind() { // kinds without a case below fall through to the final `return nil`
+	case types.KindInt64:
+		// Overflow check: if the range touches or straddles zero and either bound is further than maxNumStep from it, highVal-lowVal may wrap around.
+		lowVal, highVal := low.GetInt64(), high.GetInt64()
+		if lowVal <= 0 && highVal >= 0 {
+			if lowVal < -maxNumStep || highVal > maxNumStep {
+				return nil
+			}
+		}
+		remaining := highVal - lowVal
+		if remaining >= maxNumStep+1 {
+			return nil
+		}
+		remaining = remaining + 1 - int64(exclude)
+		if remaining < 0 || remaining >= maxNumStep { // a negative count would make `make` panic
+			return nil
+		}
+		values := make([]types.Datum, 0, remaining)
+		startValue := lowVal
+		if lowExclude {
+			startValue++
+		}
+		for i := int64(0); i < remaining; i++ {
+			values = append(values, types.NewIntDatum(startValue+i))
+		}
+		return values
+	case types.KindUint64:
+		remaining := high.GetUint64() - low.GetUint64()
+		if remaining >= maxNumStep+1 {
+			return nil
+		}
+		remaining = remaining + 1 - uint64(exclude) // unsigned: an underflow wraps to a huge value and is rejected below
+		if remaining >= maxNumStep {
+			return nil
+		}
+		values := make([]types.Datum, 0, remaining)
+		startValue := low.GetUint64()
+		if lowExclude {
+			startValue++
+		}
+		for i := uint64(0); i < remaining; i++ {
+			values = append(values, types.NewUintDatum(startValue+i))
+		}
+		return values
+	case types.KindMysqlDuration:
+		lowDur, highDur := low.GetMysqlDuration(), high.GetMysqlDuration()
+		fsp := mathutil.Max(lowDur.Fsp, highDur.Fsp)
+		stepSize := int64(math.Pow10(types.MaxFsp-fsp)) * int64(time.Microsecond)
+		lowDur.Duration = lowDur.Duration.Round(time.Duration(stepSize))
+		remaining := int64(highDur.Duration-lowDur.Duration)/stepSize + 1 - int64(exclude)
+		if remaining < 0 || remaining >= maxNumStep { // a negative count would make `make` panic
+			return nil
+		}
+		startValue := int64(lowDur.Duration)
+		if lowExclude {
+			startValue += stepSize
+		}
+		values := make([]types.Datum, 0, remaining)
+		for i := int64(0); i < remaining; i++ {
+			values = append(values, types.NewDurationDatum(types.Duration{Duration: time.Duration(startValue + i*stepSize), Fsp: fsp}))
+		}
+		return values
+	case types.KindMysqlTime:
+		lowTime, highTime := low.GetMysqlTime(), high.GetMysqlTime()
+		if lowTime.Type != highTime.Type {
+			return nil
+		}
+		fsp := mathutil.Max(lowTime.Fsp, highTime.Fsp)
+		var stepSize int64
+		sc := &stmtctx.StatementContext{TimeZone: time.UTC}
+		if lowTime.Type == mysql.TypeDate {
+			stepSize = 24 * int64(time.Hour)
+			lowTime.Time = types.FromDate(lowTime.Time.Year(), lowTime.Time.Month(), lowTime.Time.Day(), 0, 0, 0, 0)
+		} else {
+			var err error
+			lowTime, err = lowTime.RoundFrac(sc, fsp)
+			if err != nil {
+				return nil
+			}
+			stepSize = int64(math.Pow10(types.MaxFsp-fsp)) * int64(time.Microsecond)
+		}
+		remaining := int64(highTime.Sub(sc, &lowTime).Duration)/stepSize + 1 - int64(exclude)
+		if remaining < 0 || remaining >= maxNumStep { // a negative count would make `make` panic
+			return nil
+		}
+		startValue := lowTime
+		var err error
+		if lowExclude {
+			startValue, err = lowTime.Add(sc, types.Duration{Duration: time.Duration(stepSize), Fsp: fsp})
+			if err != nil {
+				return nil
+			}
+		}
+		values := make([]types.Datum, 0, remaining)
+		for i := int64(0); i < remaining; i++ {
+			value, err := startValue.Add(sc, types.Duration{Duration: time.Duration(i * stepSize), Fsp: fsp})
+			if err != nil {
+				return nil
+			}
+			values = append(values, types.NewTimeDatum(value))
+		}
+		return values
+	}
+	return nil
+}
diff --git a/statistics/scalar_test.go b/statistics/scalar_test.go
@@ -42,6 +42,14 @@ func getTime(year, month, day int, timeType byte) types.Time {
 	return ret
 }
 
+func getTimeStamp(hour, min, sec int, timeType byte) types.Time {
+	return types.Time{
+		Time: types.FromDate(2017, 1, 1, hour, min, sec, 0), // date part is fixed to 2017-01-01
+		Type: timeType,
+		Fsp:  0,
+	}
+}
+
 func getBinaryLiteral(value string) types.BinaryLiteral {
 	b, _ := types.ParseBitStr(value)
 	return b
@@ -168,3 +176,76 @@ func (s *testStatisticsSuite) TestCalcFraction(c *C) {
 		c.Check(math.Abs(fraction-test.fraction) < eps, IsTrue)
 	}
 }
+
+func (s *testStatisticsSuite) TestEnumRangeValues(c *C) {
+	tests := []struct {
+		low         types.Datum
+		high        types.Datum
+		lowExclude  bool
+		highExclude bool
+		res         string
+	}{
+		{
+			low:         types.NewIntDatum(0),
+			high:        types.NewIntDatum(5),
+			lowExclude:  false,
+			highExclude: true,
+			res:         "(0, 1, 2, 3, 4)",
+		},
+		{
+			low:         types.NewIntDatum(math.MinInt64),
+			high:        types.NewIntDatum(math.MaxInt64),
+			lowExclude:  false,
+			highExclude: false,
+			res:         "",
+		},
+		{
+			low:         types.NewUintDatum(0),
+			high:        types.NewUintDatum(5),
+			lowExclude:  false,
+			highExclude: true,
+			res:         "(0, 1, 2, 3, 4)",
+		},
+		{
+			low:         types.NewDurationDatum(getDuration("0:00:00")),
+			high:        types.NewDurationDatum(getDuration("0:00:05")),
+			lowExclude:  false,
+			highExclude: true,
+			res:         "(00:00:00, 00:00:01, 00:00:02, 00:00:03, 00:00:04)",
+		},
+		{
+			low:         types.NewDurationDatum(getDuration("0:00:00")),
+			high:        types.NewDurationDatum(getDuration("0:00:05")),
+			lowExclude:  true,
+			highExclude: false,
+			res:         "(00:00:01, 00:00:02, 00:00:03, 00:00:04, 00:00:05)",
+		},
+		{
+			low:         types.NewTimeDatum(getTime(2017, 1, 1, mysql.TypeDate)),
+			high:        types.NewTimeDatum(getTime(2017, 1, 5, mysql.TypeDate)),
+			lowExclude:  false,
+			highExclude: true,
+			res:         "(2017-01-01, 2017-01-02, 2017-01-03, 2017-01-04)",
+		},
+		{
+			low:         types.NewTimeDatum(getTimeStamp(0, 0, 0, mysql.TypeTimestamp)),
+			high:        types.NewTimeDatum(getTimeStamp(0, 0, 5, mysql.TypeTimestamp)),
+			lowExclude:  false,
+			highExclude: true,
+			res:         "(2017-01-01 00:00:00, 2017-01-01 00:00:01, 2017-01-01 00:00:02, 2017-01-01 00:00:03, 2017-01-01 00:00:04)",
+		},
+		{
+			low:         types.NewTimeDatum(getTimeStamp(0, 0, 0, mysql.TypeDatetime)),
+			high:        types.NewTimeDatum(getTimeStamp(0, 0, 5, mysql.TypeDatetime)),
+			lowExclude:  false,
+			highExclude: true,
+			res:         "(2017-01-01 00:00:00, 2017-01-01 00:00:01, 2017-01-01 00:00:02, 2017-01-01 00:00:03, 2017-01-01 00:00:04)",
+		},
+	}
+	for _, t := range tests {
+		vals := enumRangeValues(t.low, t.high, t.lowExclude, t.highExclude)
+		str, err := types.DatumsToString(vals, true)
+		c.Assert(err, IsNil)
+		c.Assert(str, Equals, t.res) // obtained value first, expected value last
+	}
+}
diff --git a/statistics/table.go b/statistics/table.go
@@ -361,11 +361,36 @@ func isSingleColIdxNullRange(idx *Index, ran *ranger.Range) bool {
 	return false
 }
 
+// getEqualCondSelectivity estimates the selectivity of the equal conditions identified by the encoded key `bytes`.
+// `coverAll` reports whether the conditions cover all the index columns.
+func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, coverAll bool) float64 {
+	val := types.NewBytesDatum(bytes)
+	if idx.outOfRange(val) {
+		// When the value is out of range, we cannot find it in the CM Sketch,
+		// so we use heuristic methods to estimate the selectivity.
+		if idx.NDV > 0 && coverAll {
+			// for equality queries
+			return float64(coll.ModifyCount) / float64(idx.NDV) / idx.TotalRowCount()
+		}
+		// for range queries
+		return float64(coll.ModifyCount) / outOfRangeBetweenRate / idx.TotalRowCount()
+	}
+	return float64(idx.CMSketch.QueryBytes(bytes)) / float64(idx.TotalRowCount())
+}
+
 func (coll *HistColl) getIndexRowCount(sc *stmtctx.StatementContext, idxID int64, indexRanges []*ranger.Range) (float64, error) {
 	idx := coll.Indices[idxID]
 	totalCount := float64(0)
 	for _, ran := range indexRanges {
 		rangePosition := GetOrdinalOfRangeCond(sc, ran)
+		var rangeVals []types.Datum
+		// Try to enum the last range values.
+		if rangePosition != len(ran.LowVal) {
+			rangeVals = enumRangeValues(ran.LowVal[rangePosition], ran.HighVal[rangePosition], ran.LowExclude, ran.HighExclude)
+			if rangeVals != nil {
+				rangePosition++
+			}
+		}
 		// If first one is range, just use the previous way to estimate; if it is [NULL, NULL] range
 		// on single-column index, use previous way as well, because CMSketch does not contain null
 		// values in this case.
@@ -378,24 +403,28 @@ func (coll *HistColl) getIndexRowCount(sc *stmtctx.StatementContext, idxID int64
 			continue
 		}
 		var selectivity float64
+		coverAll := len(ran.LowVal) == len(idx.Info.Columns) && rangePosition == len(ran.LowVal)
 		// use CM Sketch to estimate the equal conditions
-		bytes, err := codec.EncodeKey(sc, nil, ran.LowVal[:rangePosition]...)
-		if err != nil {
-			return 0, errors.Trace(err)
-		}
-		val := types.NewBytesDatum(bytes)
-		if idx.outOfRange(val) {
-			// When the value is out of range, we could not found this value in the CM Sketch,
-			// so we use heuristic methods to estimate the selectivity.
-			if idx.NDV > 0 && len(ran.LowVal) == len(idx.Info.Columns) && rangePosition == len(ran.LowVal) {
-				// for equality queries
-				selectivity = float64(coll.ModifyCount) / float64(idx.NDV) / idx.TotalRowCount()
-			} else {
-				// for range queries
-				selectivity = float64(coll.ModifyCount) / outOfRangeBetweenRate / idx.TotalRowCount()
+		if rangeVals == nil {
+			bytes, err := codec.EncodeKey(sc, nil, ran.LowVal[:rangePosition]...)
+			if err != nil {
+				return 0, errors.Trace(err)
 			}
+			selectivity = coll.getEqualCondSelectivity(idx, bytes, coverAll)
 		} else {
-			selectivity = float64(idx.CMSketch.QueryBytes(bytes)) / float64(idx.TotalRowCount())
+			bytes, err := codec.EncodeKey(sc, nil, ran.LowVal[:rangePosition-1]...)
+			if err != nil {
+				return 0, errors.Trace(err)
+			}
+			prefixLen := len(bytes)
+			for _, val := range rangeVals {
+				bytes = bytes[:prefixLen]
+				bytes, err = codec.EncodeKey(sc, bytes, val)
+				if err != nil {
+					return 0, err
+				}
+				selectivity += coll.getEqualCondSelectivity(idx, bytes, coverAll)
+			}
 		}
 		// use histogram to estimate the range condition
 		if rangePosition != len(ran.LowVal) {