Skip to content

Commit 225ae77

Browse files
authored
Fixes for NonBacktracking with some combinations of loops and anchors (#80880)
* Add tests for loop plus anchors bug * Fixes to loop derivative and pruning * Add a test for high-priority nullable loop bodies This is a simplified test case for a breaking test in the Rust patterns. * Fix for high-priority nullable loop bodies
1 parent a04aaeb commit 225ae77

File tree

2 files changed

+103
-28
lines changed

2 files changed

+103
-28
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs

Lines changed: 97 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1111,26 +1111,32 @@ private SymbolicRegexNode<TSet> PruneLowerPriorityThanNullability(SymbolicRegexB
11111111

11121112
case SymbolicRegexNodeKind.Concat:
11131113
Debug.Assert(_left is not null && _right is not null);
1114-
//in a concatenation (X|Y)Z priority is given to XZ when X is nullable
1115-
//observe that (X|Y)Z is equivalent to (XZ|YZ) and the branch YZ must be ignored
1116-
//when X is nullable (observe that XZ is nullable because this=(X|Y)Z is nullable)
1117-
//if X is not nullable then XZ is maintaned as is, and YZ is pruned
1118-
//e.g. (a{0,5}?|abc|b+)c* reduces to c*
1119-
//---
1120-
//in a concatenation XZ where X is not an alternation, both X and Z are pruned
1121-
//e.g. a{0,5}?b{0,5}? reduces to ()
1122-
prunedNode = _left._kind == SymbolicRegexNodeKind.Alternate ?
1123-
(_left._left!.IsNullableFor(context) ?
1114+
prunedNode = _left._kind switch
1115+
{
1116+
// If the left side is a concatenation, then we bring it to right associative form to expose
1117+
// the first element of the concatenation for the cases below.
1118+
SymbolicRegexNodeKind.Concat => CreateConcat(builder, _left._left!, CreateConcat(builder, _left._right!, _right))
1119+
.PruneLowerPriorityThanNullability(builder, context),
1120+
// In a concatenation (X|Y)Z priority is given to XZ when X is nullable.
1121+
// Observe that (X|Y)Z is equivalent to (XZ|YZ) and the branch YZ must be ignored
1122+
// when X is nullable (observe that XZ is nullable because this=(X|Y)Z is nullable).
1123+
// If X is not nullable then XZ is maintaned as is, and YZ is pruned. For example,
1124+
// (a{0,5}?|abc|b+)c* reduces to c*.
1125+
SymbolicRegexNodeKind.Alternate => (_left._left!.IsNullableFor(context) ?
11241126
CreateConcat(builder, _left._left, _right).PruneLowerPriorityThanNullability(builder, context) :
1125-
CreateAlternate(builder, CreateConcat(builder, _left._left, _right), CreateConcat(builder, _left._right!, _right).PruneLowerPriorityThanNullability(builder, context))) :
1126-
CreateConcat(builder, _left.PruneLowerPriorityThanNullability(builder, context), _right.PruneLowerPriorityThanNullability(builder, context));
1127+
CreateAlternate(builder, CreateConcat(builder, _left._left, _right),
1128+
CreateConcat(builder, _left._right!, _right).PruneLowerPriorityThanNullability(builder, context), deduplicated: true)),
1129+
// Loops have various cases that are handled uniformly for concatenations and standalone loops.
1130+
SymbolicRegexNodeKind.Loop => PruneLoop(builder, context, _left, _right),
1131+
// The previous cases handle all the ways that the left side of the concatenation could
1132+
// contain branching. Thus if we get here it is safe to only prune the right side.
1133+
_ => CreateConcat(builder, _left, _right.PruneLowerPriorityThanNullability(builder, context)),
1134+
};
11271135
break;
11281136

11291137
case SymbolicRegexNodeKind.Loop:
1130-
Debug.Assert(_left is not null);
1131-
// Lazy nullable loop reduces to (), i.e., the loop body is just forgotten
1132-
prunedNode = _info.IsLazyLoop && _lower == 0 ? builder.Epsilon :
1133-
CreateLoop(builder, _left.PruneLowerPriorityThanNullability(builder, context), _lower, _upper, _info.IsLazyLoop);
1138+
// Loops have various cases that are handled uniformly for standalone loops and concatenations.
1139+
prunedNode = PruneLoop(builder, context, this, builder.Epsilon);
11341140
break;
11351141

11361142
case SymbolicRegexNodeKind.Effect:
@@ -1147,6 +1153,73 @@ private SymbolicRegexNode<TSet> PruneLowerPriorityThanNullability(SymbolicRegexB
11471153

11481154
builder._pruneLowerPriorityThanNullabilityCache[key] = prunedNode;
11491155
return prunedNode;
1156+
1157+
static SymbolicRegexNode<TSet> PruneLoop(SymbolicRegexBuilder<TSet> builder, uint context, SymbolicRegexNode<TSet> loop, SymbolicRegexNode<TSet> tail)
1158+
{
1159+
Debug.Assert(loop.Kind == SymbolicRegexNodeKind.Loop && loop._left is not null);
1160+
1161+
if (loop._lower == 0)
1162+
{
1163+
// Loop is nullable at least due to a zero lower bound and the cases below handle the different
1164+
// priorities of checking that lower bound.
1165+
if (loop.IsLazy)
1166+
{
1167+
// In a lazy loop nullability from the zero lower bound has highest priority, so the loop is skipped completely
1168+
return tail.PruneLowerPriorityThanNullability(builder, context);
1169+
}
1170+
else if (!loop._left.IsNullableFor(context))
1171+
{
1172+
// For an eager loop with a non-nullable body, the nullability from the zero lower bound has lowest priority.
1173+
// Handle by case splitting into (1) doing at least one iteration in the loop and (2) skipping the loop and
1174+
// continuing directly into the tail, which then must be pruned in the current context.
1175+
return CreateAlternate(builder,
1176+
CreateConcat(builder, CreateLoop(builder, loop._left, 1, loop._upper, loop.IsLazy), tail),
1177+
tail.PruneLowerPriorityThanNullability(builder, context));
1178+
}
1179+
else if (loop._upper == int.MaxValue)
1180+
{
1181+
// The special case of a R*S loop with a nullable body is handled separately, as the general case handler could cause
1182+
// infinite recursion. The paths that backtracking would explore before stopping here are (1) consuming a character in the
1183+
// first iteration of the loop and (2) skipping the loop and continuing with anything higher priority than nullability in S.
1184+
// For example, a pattern (a|b??)*c? would prune into essentially a?(a|b??)*c?|c?. Note that pruning only one peeled-out
1185+
// iteration of the loop is necessary, since anchors could change the priority of nullability in other locations in the input.
1186+
// Additionally, a high-priority nullable body will always be skipped, in which case only option (2) is included. Finally, in
1187+
// all cases the loop body must not be dropped, as doing so could affect whether some capture groups match zero characters or
1188+
// don't match at all.
1189+
SymbolicRegexNode<TSet> skipLoopCase = CreateConcat(builder, loop._left.PruneLowerPriorityThanNullability(builder, context),
1190+
tail.PruneLowerPriorityThanNullability(builder, context));
1191+
return loop._left.IsHighPriorityNullableFor(context) ? skipLoopCase : CreateAlternate(builder,
1192+
CreateConcat(builder, loop._left.PruneLowerPriorityThanNullability(builder, context), CreateConcat(builder, loop, tail)),
1193+
skipLoopCase);
1194+
}
1195+
// For an eager loop with finite upper bound and nullable body fall back to the general case handler
1196+
}
1197+
1198+
Debug.Assert(loop._left.IsNullableFor(context));
1199+
// The general case handler peels one iteration out of the loop and prunes the resulting concatenation
1200+
return CreateConcat(builder, loop._left, CreateConcat(builder, loop.CreateLoopContinuation(builder), tail))
1201+
.PruneLowerPriorityThanNullability(builder, context);
1202+
}
1203+
}
1204+
1205+
/// <summary>
1206+
/// Creates the remainder of a loop when one iteration is peeled out of it. In other words, for a loop R with
1207+
/// a body B returns a new regex S such that R is equivalent to BS.
1208+
/// </summary>
1209+
/// <param name="builder">the builder that owns this node</param>
1210+
/// <returns>the loop continuation regex</returns>
1211+
private SymbolicRegexNode<TSet> CreateLoopContinuation(SymbolicRegexBuilder<TSet> builder)
1212+
{
1213+
Debug.Assert(_kind == SymbolicRegexNodeKind.Loop && _left is not null);
1214+
1215+
// Note that the upper bound is guaranteed to be greater than zero, since otherwise the loop would have
1216+
// been simplified to nothing, and int.MaxValue is treated as infinity.
1217+
int newupper = _upper == int.MaxValue ? int.MaxValue : _upper - 1;
1218+
// Do not decrement the lower bound if it equals int.MaxValue
1219+
int newlower = _lower == 0 || _lower == int.MaxValue ? _lower : _lower - 1;
1220+
1221+
// The continued loop becomes epsilon when newlower == newupper == 0
1222+
return builder.CreateLoop(_left, IsLazy, newlower, newupper);
11501223
}
11511224

11521225
/// <summary>
@@ -1244,22 +1317,18 @@ private SymbolicRegexNode<TSet> CreateDerivative(SymbolicRegexBuilder<TSet> buil
12441317
Debug.Assert(_left is not null);
12451318
Debug.Assert(_upper > 0);
12461319

1247-
SymbolicRegexNode<TSet> bodyDerivative = _left.CreateDerivative(builder, elem, context);
1248-
if (bodyDerivative.IsNothing(builder._solver))
1320+
if (_lower == 0 || _left.IsNullable || !_left.IsNullableFor(context))
12491321
{
1250-
derivative = builder._nothing;
1322+
// In these special cases the derivative concatenates the body's derivative with the decremented loop, so
1323+
// d(R{m,n}) = d(R)R{max(0,m-1),n-1}.
1324+
derivative = builder.CreateConcat(_left.CreateDerivative(builder, elem, context), CreateLoopContinuation(builder));
12511325
}
12521326
else
12531327
{
1254-
// The loop derivative peels out one iteration and concatenates the body's derivative with the decremented loop,
1255-
// so d(R{m,n}) = d(R)R{max(0,m-1),n-1}. Note that n is guaranteed to be greater than zero, since otherwise the
1256-
// loop would have been simplified to nothing, and int.MaxValue is treated as infinity.
1257-
int newupper = _upper == int.MaxValue ? int.MaxValue : _upper - 1;
1258-
// do not decrement the lower bound if it equals int.MaxValue
1259-
int newlower = _lower == 0 || _lower == int.MaxValue ? _lower : _lower - 1;
1260-
// the continued loop becomes epsilon when newlower == newupper == 0
1261-
// in which case the returned concatenation will be just bodyDerivative
1262-
derivative = builder.CreateConcat(bodyDerivative, builder.CreateLoop(_left, IsLazy, newlower, newupper));
1328+
// In the general case the concatenation must be created first and the derivative taken on that. For example,
1329+
// the first derivative for (a|\b){2} with an input "ac" is (a|\b)|epsilon, but the rule above would produce
1330+
// just (a|\b).
1331+
derivative = builder.CreateConcat(_left, CreateLoopContinuation(builder)).CreateDerivative(builder, elem, context);
12631332
}
12641333
break;
12651334
}

src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -960,6 +960,12 @@ public static IEnumerable<object[]> Match_MemberData()
960960

961961
// Test for a bug in NonBacktracking's subsumption rule for XY subsuming X??Y, which didn't check that X is nullable
962962
yield return (@"XY|X??Y", "Y", RegexOptions.None, 0, 1, true, "Y");
963+
964+
// Tests for bugs in NonBacktracking, which didn't properly handle some combinations of loops and anchors
965+
yield return (@"(a|\b){2}", "ac", RegexOptions.None, 0, 2, true, "a");
966+
yield return (@"a?(\b|c)", "ac", RegexOptions.None, 0, 2, true, "ac");
967+
yield return (@"(a|())*(\b|c)", "ac", RegexOptions.None, 0, 2, true, "ac");
968+
yield return (@"(\b|a)*", "a", RegexOptions.None, 0, 1, true, "");
963969
}
964970

965971
[OuterLoop("Takes several seconds to run")]

0 commit comments

Comments
 (0)