Skip to content

Commit

Permalink
stats: convert small range to points in selectivity (pingcap#11524)
Browse files Browse the repository at this point in the history
  • Loading branch information
alivxxx authored and zz-jason committed Aug 5, 2019
1 parent 3bd64ba commit 03bb8d7
Show file tree
Hide file tree
Showing 7 changed files with 277 additions and 19 deletions.
9 changes: 9 additions & 0 deletions cmd/explaintest/r/explain_easy.result
Original file line number Diff line number Diff line change
Expand Up @@ -703,3 +703,12 @@ TableReader_7 1.00 root data:Selection_6
└─Selection_6 1.00 cop eq(test.t.b, 1000-01-01 00:00:00.000000)
└─TableScan_5 3.00 cop table:t, range:[-inf,+inf], keep order:false
drop table t;
create table t(a int);
insert into t values (1),(2),(2),(2),(9),(9),(9),(10);
analyze table t with 1 buckets;
explain select * from t where a >= 3 and a <= 8;
id count task operator info
TableReader_7 0.00 root data:Selection_6
└─Selection_6 0.00 cop ge(test.t.a, 3), le(test.t.a, 8)
└─TableScan_5 8.00 cop table:t, range:[-inf,+inf], keep order:false
drop table t;
6 changes: 6 additions & 0 deletions cmd/explaintest/t/explain_easy.test
Original file line number Diff line number Diff line change
Expand Up @@ -159,3 +159,9 @@ analyze table t;
explain select * from t where a = 1;
explain select * from t where b = "1000-01-01";
drop table t;

create table t(a int);
insert into t values (1),(2),(2),(2),(9),(9),(9),(10);
analyze table t with 1 buckets;
explain select * from t where a >= 3 and a <= 8;
drop table t;
8 changes: 4 additions & 4 deletions statistics/handle/update_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1300,11 +1300,11 @@ func (s *testStatsSuite) TestIndexQueryFeedback(c *C) {
hist: "index:1 ndv:20\n" +
"num: 16 lower_bound: -inf upper_bound: 7 repeats: 0\n" +
"num: 16 lower_bound: 8 upper_bound: 15 repeats: 0\n" +
"num: 8 lower_bound: 16 upper_bound: 21 repeats: 0",
"num: 9 lower_bound: 16 upper_bound: 21 repeats: 0",
rangeID: tblInfo.Indices[0].ID,
idxID: tblInfo.Indices[1].ID,
idxCols: 1,
eqCount: 39,
eqCount: 32,
},
{
sql: "select * from t use index(idx_ac) where a = 1 and c < 21",
Expand All @@ -1315,7 +1315,7 @@ func (s *testStatsSuite) TestIndexQueryFeedback(c *C) {
rangeID: tblInfo.Columns[2].ID,
idxID: tblInfo.Indices[2].ID,
idxCols: 0,
eqCount: 35,
eqCount: 32,
},
{
sql: "select * from t use index(idx_ad) where a = 1 and d < 21",
Expand Down Expand Up @@ -1359,7 +1359,7 @@ func (s *testStatsSuite) TestIndexQueryFeedback(c *C) {
rangeID: tblInfo.Columns[6].ID,
idxID: tblInfo.Indices[6].ID,
idxCols: 0,
eqCount: 32,
eqCount: 30,
},
{
sql: `select * from t use index(idx_ah) where a = 1 and h < "1000-01-21"`,
Expand Down
12 changes: 12 additions & 0 deletions statistics/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -742,6 +742,18 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
}
continue
}
rangeVals := enumRangeValues(rg.LowVal[0], rg.HighVal[0], rg.LowExclude, rg.HighExclude)
// The small range case.
if rangeVals != nil {
for _, val := range rangeVals {
cnt, err := c.equalRowCount(sc, val, modifyCount)
if err != nil {
return 0, err
}
rowCount += cnt
}
continue
}
// The interval case.
cnt := c.BetweenRowCount(rg.LowVal[0], rg.HighVal[0])
if (c.outOfRange(rg.LowVal[0]) && !rg.LowVal[0].IsNull()) || c.outOfRange(rg.HighVal[0]) {
Expand Down
121 changes: 121 additions & 0 deletions statistics/scalar.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ package statistics
import (
"encoding/binary"
"math"
"time"

"github.com/cznic/mathutil"
"github.com/pingcap/parser/mysql"
"github.com/pingcap/tidb/sessionctx/stmtctx"
"github.com/pingcap/tidb/types"
Expand Down Expand Up @@ -175,3 +177,122 @@ func calcFraction4Datums(lower, upper, value *types.Datum) float64 {
}
return 0.5
}

const maxNumStep = 10

// enumRangeValues lists every discrete value covered by the range between low
// and high, honoring lowExclude/highExclude, so that a small range can be
// estimated as a handful of point lookups instead of an interval.
//
// It returns nil when the range cannot be enumerated: the bounds have different
// kinds, the kind is not int64/uint64/duration/time, the two time values have
// different types, the bounds are inverted, an empty range is excluded at both
// ends, or the range would yield maxNumStep or more values. A non-nil empty
// slice means the range is enumerable but holds no value.
func enumRangeValues(low, high types.Datum, lowExclude, highExclude bool) []types.Datum {
	if low.Kind() != high.Kind() {
		return nil
	}
	// Number of endpoints that must be left out of the enumeration.
	exclude := 0
	if lowExclude {
		exclude++
	}
	if highExclude {
		exclude++
	}
	switch low.Kind() {
	case types.KindInt64:
		lowVal, highVal := low.GetInt64(), high.GetInt64()
		if lowVal > highVal {
			return nil
		}
		// With lowVal <= highVal the distance always fits in a uint64, whereas
		// highVal-lowVal computed in int64 wraps around for ranges such as
		// [MinInt64, 0] and would lead to a negative slice capacity below.
		span := uint64(highVal) - uint64(lowVal)
		if span >= maxNumStep+1 {
			return nil
		}
		remaining := int64(span) + 1 - int64(exclude)
		// remaining is negative only for a single-point range excluded at both ends.
		if remaining < 0 || remaining >= maxNumStep {
			return nil
		}
		startValue := lowVal
		if lowExclude {
			startValue++
		}
		values := make([]types.Datum, 0, remaining)
		for i := int64(0); i < remaining; i++ {
			values = append(values, types.NewIntDatum(startValue+i))
		}
		return values
	case types.KindUint64:
		lowVal, highVal := low.GetUint64(), high.GetUint64()
		if lowVal > highVal {
			// Unsigned subtraction would wrap around for inverted bounds.
			return nil
		}
		remaining := highVal - lowVal
		if remaining >= maxNumStep+1 {
			return nil
		}
		remaining++
		if remaining < uint64(exclude) {
			// Single-point range excluded at both ends; avoid unsigned wrap-around.
			return nil
		}
		remaining -= uint64(exclude)
		if remaining >= maxNumStep {
			return nil
		}
		startValue := lowVal
		if lowExclude {
			startValue++
		}
		values := make([]types.Datum, 0, remaining)
		for i := uint64(0); i < remaining; i++ {
			values = append(values, types.NewUintDatum(startValue+i))
		}
		return values
	case types.KindMysqlDuration:
		lowDur, highDur := low.GetMysqlDuration(), high.GetMysqlDuration()
		// Step by one unit of the finer of the two fractional-second precisions.
		fsp := mathutil.Max(lowDur.Fsp, highDur.Fsp)
		stepSize := int64(math.Pow10(types.MaxFsp-fsp)) * int64(time.Microsecond)
		lowDur.Duration = lowDur.Duration.Round(time.Duration(stepSize))
		remaining := int64(highDur.Duration-lowDur.Duration)/stepSize + 1 - int64(exclude)
		// A negative count (inverted or doubly excluded empty range) must not
		// reach make, which panics on a negative capacity.
		if remaining < 0 || remaining >= maxNumStep {
			return nil
		}
		startValue := int64(lowDur.Duration)
		if lowExclude {
			startValue += stepSize
		}
		values := make([]types.Datum, 0, remaining)
		for i := int64(0); i < remaining; i++ {
			values = append(values, types.NewDurationDatum(types.Duration{Duration: time.Duration(startValue + i*stepSize), Fsp: fsp}))
		}
		return values
	case types.KindMysqlTime:
		lowTime, highTime := low.GetMysqlTime(), high.GetMysqlTime()
		if lowTime.Type != highTime.Type {
			return nil
		}
		fsp := mathutil.Max(lowTime.Fsp, highTime.Fsp)
		var stepSize int64
		sc := &stmtctx.StatementContext{TimeZone: time.UTC}
		if lowTime.Type == mysql.TypeDate {
			// Dates advance one day at a time; drop any time-of-day part of the lower bound.
			stepSize = 24 * int64(time.Hour)
			lowTime.Time = types.FromDate(lowTime.Time.Year(), lowTime.Time.Month(), lowTime.Time.Day(), 0, 0, 0, 0)
		} else {
			var err error
			lowTime, err = lowTime.RoundFrac(sc, fsp)
			if err != nil {
				return nil
			}
			stepSize = int64(math.Pow10(types.MaxFsp-fsp)) * int64(time.Microsecond)
		}
		remaining := int64(highTime.Sub(sc, &lowTime).Duration)/stepSize + 1 - int64(exclude)
		// Same guard as the duration case: never hand make a negative capacity.
		if remaining < 0 || remaining >= maxNumStep {
			return nil
		}
		startValue := lowTime
		var err error
		if lowExclude {
			startValue, err = lowTime.Add(sc, types.Duration{Duration: time.Duration(stepSize), Fsp: fsp})
			if err != nil {
				return nil
			}
		}
		values := make([]types.Datum, 0, remaining)
		for i := int64(0); i < remaining; i++ {
			value, err := startValue.Add(sc, types.Duration{Duration: time.Duration(i * stepSize), Fsp: fsp})
			if err != nil {
				return nil
			}
			values = append(values, types.NewTimeDatum(value))
		}
		return values
	}
	return nil
}
81 changes: 81 additions & 0 deletions statistics/scalar_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ func getTime(year, month, day int, timeType byte) types.Time {
return ret
}

// getTimeStamp builds a types.Time of the given type on 2017-01-01 at
// hour:min:sec, with zero microseconds and Fsp 0.
func getTimeStamp(hour, min, sec int, timeType byte) types.Time {
	return types.Time{
		Time: types.FromDate(2017, 1, 1, hour, min, sec, 0),
		Type: timeType,
		Fsp:  0,
	}
}

func getBinaryLiteral(value string) types.BinaryLiteral {
b, _ := types.ParseBitStr(value)
return b
Expand Down Expand Up @@ -168,3 +176,76 @@ func (s *testStatisticsSuite) TestCalcFraction(c *C) {
c.Check(math.Abs(fraction-test.fraction) < eps, IsTrue)
}
}

// TestEnumRangeValues checks the values enumRangeValues produces for small
// ranges of each supported kind, comparing their string form with the expected
// list. An empty expected string stands for a range that is not enumerated.
func (s *testStatisticsSuite) TestEnumRangeValues(c *C) {
	tests := []struct {
		low         types.Datum
		high        types.Datum
		lowExclude  bool
		highExclude bool
		res         string
	}{
		{
			low:         types.NewIntDatum(0),
			high:        types.NewIntDatum(5),
			lowExclude:  false,
			highExclude: true,
			res:         "(0, 1, 2, 3, 4)",
		},
		{
			// The full int64 domain is far too wide to enumerate.
			low:         types.NewIntDatum(math.MinInt64),
			high:        types.NewIntDatum(math.MaxInt64),
			lowExclude:  false,
			highExclude: false,
			res:         "",
		},
		{
			low:         types.NewUintDatum(0),
			high:        types.NewUintDatum(5),
			lowExclude:  false,
			highExclude: true,
			res:         "(0, 1, 2, 3, 4)",
		},
		{
			low:         types.NewDurationDatum(getDuration("0:00:00")),
			high:        types.NewDurationDatum(getDuration("0:00:05")),
			lowExclude:  false,
			highExclude: true,
			res:         "(00:00:00, 00:00:01, 00:00:02, 00:00:03, 00:00:04)",
		},
		{
			// Same bounds with the exclusion flipped, to cover the lowExclude branch.
			low:         types.NewDurationDatum(getDuration("0:00:00")),
			high:        types.NewDurationDatum(getDuration("0:00:05")),
			lowExclude:  true,
			highExclude: false,
			res:         "(00:00:01, 00:00:02, 00:00:03, 00:00:04, 00:00:05)",
		},
		{
			low:         types.NewTimeDatum(getTime(2017, 1, 1, mysql.TypeDate)),
			high:        types.NewTimeDatum(getTime(2017, 1, 5, mysql.TypeDate)),
			lowExclude:  false,
			highExclude: true,
			res:         "(2017-01-01, 2017-01-02, 2017-01-03, 2017-01-04)",
		},
		{
			low:         types.NewTimeDatum(getTimeStamp(0, 0, 0, mysql.TypeTimestamp)),
			high:        types.NewTimeDatum(getTimeStamp(0, 0, 5, mysql.TypeTimestamp)),
			lowExclude:  false,
			highExclude: true,
			res:         "(2017-01-01 00:00:00, 2017-01-01 00:00:01, 2017-01-01 00:00:02, 2017-01-01 00:00:03, 2017-01-01 00:00:04)",
		},
		{
			low:         types.NewTimeDatum(getTimeStamp(0, 0, 0, mysql.TypeDatetime)),
			high:        types.NewTimeDatum(getTimeStamp(0, 0, 5, mysql.TypeDatetime)),
			lowExclude:  false,
			highExclude: true,
			res:         "(2017-01-01 00:00:00, 2017-01-01 00:00:01, 2017-01-01 00:00:02, 2017-01-01 00:00:03, 2017-01-01 00:00:04)",
		},
	}
	for _, tt := range tests {
		vals := enumRangeValues(tt.low, tt.high, tt.lowExclude, tt.highExclude)
		str, err := types.DatumsToString(vals, true)
		c.Assert(err, IsNil)
		// gocheck expects the obtained value first and the expected value last.
		c.Assert(str, Equals, tt.res)
	}
}
59 changes: 44 additions & 15 deletions statistics/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -361,11 +361,36 @@ func isSingleColIdxNullRange(idx *Index, ran *ranger.Range) bool {
return false
}

// getEqualCondSelectivity estimates the selectivity of the equal conditions whose encoded
// key is `bytes`. `coverAll` tells whether those conditions cover all the index columns.
func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, coverAll bool) float64 {
	if encoded := types.NewBytesDatum(bytes); !idx.outOfRange(encoded) {
		// The key is not out of range: use the count reported by the CM Sketch.
		return float64(idx.CMSketch.QueryBytes(bytes)) / float64(idx.TotalRowCount())
	}
	// An out-of-range key is not looked up in the CM Sketch; estimate the
	// selectivity heuristically from the modify count instead.
	modified := float64(coll.ModifyCount)
	if idx.NDV > 0 && coverAll {
		// Equality query: spread the modified rows over the distinct values.
		return modified / float64(idx.NDV) / idx.TotalRowCount()
	}
	// Range query: scale the modified rows by the out-of-range rate.
	return modified / outOfRangeBetweenRate / idx.TotalRowCount()
}

func (coll *HistColl) getIndexRowCount(sc *stmtctx.StatementContext, idxID int64, indexRanges []*ranger.Range) (float64, error) {
idx := coll.Indices[idxID]
totalCount := float64(0)
for _, ran := range indexRanges {
rangePosition := GetOrdinalOfRangeCond(sc, ran)
var rangeVals []types.Datum
// Try to enum the last range values.
if rangePosition != len(ran.LowVal) {
rangeVals = enumRangeValues(ran.LowVal[rangePosition], ran.HighVal[rangePosition], ran.LowExclude, ran.HighExclude)
if rangeVals != nil {
rangePosition++
}
}
// If first one is range, just use the previous way to estimate; if it is [NULL, NULL] range
// on single-column index, use previous way as well, because CMSketch does not contain null
// values in this case.
Expand All @@ -378,24 +403,28 @@ func (coll *HistColl) getIndexRowCount(sc *stmtctx.StatementContext, idxID int64
continue
}
var selectivity float64
coverAll := len(ran.LowVal) == len(idx.Info.Columns) && rangePosition == len(ran.LowVal)
// use CM Sketch to estimate the equal conditions
bytes, err := codec.EncodeKey(sc, nil, ran.LowVal[:rangePosition]...)
if err != nil {
return 0, errors.Trace(err)
}
val := types.NewBytesDatum(bytes)
if idx.outOfRange(val) {
// When the value is out of range, we cannot find it in the CM Sketch,
// so we use heuristic methods to estimate the selectivity.
if idx.NDV > 0 && len(ran.LowVal) == len(idx.Info.Columns) && rangePosition == len(ran.LowVal) {
// for equality queries
selectivity = float64(coll.ModifyCount) / float64(idx.NDV) / idx.TotalRowCount()
} else {
// for range queries
selectivity = float64(coll.ModifyCount) / outOfRangeBetweenRate / idx.TotalRowCount()
if rangeVals == nil {
bytes, err := codec.EncodeKey(sc, nil, ran.LowVal[:rangePosition]...)
if err != nil {
return 0, errors.Trace(err)
}
selectivity = coll.getEqualCondSelectivity(idx, bytes, coverAll)
} else {
selectivity = float64(idx.CMSketch.QueryBytes(bytes)) / float64(idx.TotalRowCount())
bytes, err := codec.EncodeKey(sc, nil, ran.LowVal[:rangePosition-1]...)
if err != nil {
return 0, errors.Trace(err)
}
prefixLen := len(bytes)
for _, val := range rangeVals {
bytes = bytes[:prefixLen]
bytes, err = codec.EncodeKey(sc, bytes, val)
if err != nil {
return 0, err
}
selectivity += coll.getEqualCondSelectivity(idx, bytes, coverAll)
}
}
// use histogram to estimate the range condition
if rangePosition != len(ran.LowVal) {
Expand Down

0 comments on commit 03bb8d7

Please sign in to comment.