
Commit a9acdae

zhixingheyi-tian authored and haoyangeng-db committed
[SPARK-52817][SQL] Fix Like Expression performance
### What changes were proposed in this pull request?
Allow the `contains` function to be used for LIKE expressions that contain multiple consecutive '%' wildcards.

### Why are the changes needed?
In some customer workloads, users write LIKE expressions with multiple consecutive '%'. For example:
```
SELECT * FROM testData where value not like '%%HotFocus%%'
SELECT * FROM testData where value not like '%%%HotFocus%%%'
```
These LIKE expressions cannot be converted to the `contains` function during logical planning, so performance is very poor.

### How was this patch tested?
Added new UTs; existing UTs still pass.

Closes apache#51510 from zhixingheyi-tian/fix-like.

Authored-by: zhixingheyi-tian <[email protected]>
Signed-off-by: Yuming Wang <[email protected]>
1 parent b0eb3e9 commit a9acdae
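For queries like the examples above, the fix can be verified by inspecting the optimized logical plan. A minimal usage sketch, assuming a running `SparkSession` named `spark` and a registered `testData` table with a string column `value` (none of these are defined in this commit):

```scala
// Sketch only: confirm that a LIKE predicate with repeated '%' gets simplified.
// `spark` and the `testData` table are assumed to already exist in the session.
val df = spark.sql("SELECT * FROM testData WHERE value NOT LIKE '%%HotFocus%%'")
df.explain(true)
// After this patch, the optimized logical plan is expected to contain a
// Contains(value, 'HotFocus') predicate (under NOT) instead of the LIKE expression.
```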

2 files changed: +53 −4 lines

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala

Lines changed: 5 additions & 4 deletions
@@ -743,10 +743,11 @@ object SupportedBinaryExpr {
 object LikeSimplification extends Rule[LogicalPlan] with PredicateHelper {
   // if guards below protect from escapes on trailing %.
   // Cases like "something\%" are not optimized, but this does not affect correctness.
-  private val startsWith = "([^_%]+)%".r
-  private val endsWith = "%([^_%]+)".r
-  private val startsAndEndsWith = "([^_%]+)%([^_%]+)".r
-  private val contains = "%([^_%]+)%".r
+  // Consecutive wildcard characters are equivalent to a single wildcard character.
+  private val startsWith = "([^_%]+)%+".r
+  private val endsWith = "%+([^_%]+)".r
+  private val startsAndEndsWith = "([^_%]+)%+([^_%]+)".r
+  private val contains = "%+([^_%]+)%+".r
   private val equalTo = "([^_%]*)".r
 
   private def simplifyLike(
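The only functional change is `%` becoming `%+` in these four patterns, so any run of consecutive '%' wildcards is treated like a single one. Below is a minimal standalone sketch (plain Scala, outside Spark; the object name `LikeRegexDemo` and the `classify` helper are made up for illustration) showing how the updated regexes classify a few LIKE patterns:

```scala
// Standalone sketch (not Spark code): the four regexes copy the updated patterns
// from LikeSimplification; the surrounding object and helper are illustrative only.
object LikeRegexDemo {
  private val startsWith = "([^_%]+)%+".r
  private val endsWith = "%+([^_%]+)".r
  private val startsAndEndsWith = "([^_%]+)%+([^_%]+)".r
  private val contains = "%+([^_%]+)%+".r

  // Return the string predicate a LIKE pattern could be rewritten to, if any.
  def classify(pattern: String): String = pattern match {
    case startsWith(prefix)                 => s"StartsWith('$prefix')"
    case endsWith(postfix)                  => s"EndsWith('$postfix')"
    case startsAndEndsWith(prefix, postfix) => s"StartsWith('$prefix') && EndsWith('$postfix')"
    case contains(infix)                    => s"Contains('$infix')"
    case _                                  => "no simplification (kept as LIKE)"
  }

  def main(args: Array[String]): Unit = {
    // '%%' and '%%%' now collapse to the same simplification as a single '%'.
    Seq("abc%%", "%%xyz", "abc%%def", "%%HotFocus%%", "%%%HotFocus%%%", "a_c%")
      .foreach(p => println(s"$p -> ${classify(p)}"))
  }
}
```

As in the original rule, patterns containing the '_' wildcard (for example `a_c%`) are still left untouched.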

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala

Lines changed: 48 additions & 0 deletions
@@ -165,6 +165,54 @@ class LikeSimplificationSuite extends PlanTest {
     comparePlans(optimized5, correctAnswer5)
   }
 
+  test("SPARK-52817: Spark SQL LIKE expressions show poor performance when using multiple '%'") {
+    val originalQuery1 =
+      testRelation
+        .where($"a" like "abc%%")
+    val optimized1 = Optimize.execute(originalQuery1.analyze)
+    val correctAnswer1 = testRelation
+      .where(StartsWith($"a", "abc"))
+      .analyze
+    comparePlans(optimized1, correctAnswer1)
+
+    val originalQuery2 =
+      testRelation
+        .where($"a" like "%%xyz")
+    val optimized2 = Optimize.execute(originalQuery2.analyze)
+    val correctAnswer2 = testRelation
+      .where(EndsWith($"a", "xyz"))
+      .analyze
+    comparePlans(optimized2, correctAnswer2)
+
+    val originalQuery3 =
+      testRelation
+        .where($"a" like "abc%%def")
+    val optimized3 = Optimize.execute(originalQuery3.analyze)
+    val correctAnswer3 = testRelation
+      .where(
+        (Length($"a") >= 6 && (StartsWith($"a", "abc") && EndsWith($"a", "def"))))
+      .analyze
+    comparePlans(optimized3, correctAnswer3)
+
+    val originalQuery4 =
+      testRelation
+        .where(($"a" like "%%mn%%"))
+    val optimized4 = Optimize.execute(originalQuery4.analyze)
+    val correctAnswer4 = testRelation
+      .where(Contains($"a", "mn"))
+      .analyze
+    comparePlans(optimized4, correctAnswer4)
+
+    val originalQuery5 =
+      testRelation
+        .where(($"a" like "%%%mn%%%"))
+    val optimized5 = Optimize.execute(originalQuery5.analyze)
+    val correctAnswer5 = testRelation
+      .where(Contains($"a", "mn"))
+      .analyze
+    comparePlans(optimized5, correctAnswer5)
+  }
+
   test("simplify LikeAll") {
     val originalQuery =
       testRelation
