@@ -1111,26 +1111,32 @@ private SymbolicRegexNode<TSet> PruneLowerPriorityThanNullability(SymbolicRegexB
1111
1111
1112
1112
case SymbolicRegexNodeKind . Concat :
1113
1113
Debug . Assert ( _left is not null && _right is not null ) ;
1114
- //in a concatenation (X|Y)Z priority is given to XZ when X is nullable
1115
- //observe that (X|Y)Z is equivalent to (XZ|YZ) and the branch YZ must be ignored
1116
- //when X is nullable (observe that XZ is nullable because this=(X|Y)Z is nullable)
1117
- //if X is not nullable then XZ is maintaned as is, and YZ is pruned
1118
- //e.g. (a{0,5}?|abc|b+)c* reduces to c*
1119
- //---
1120
- //in a concatenation XZ where X is not an alternation, both X and Z are pruned
1121
- //e.g. a{0,5}?b{0,5}? reduces to ()
1122
- prunedNode = _left . _kind == SymbolicRegexNodeKind . Alternate ?
1123
- ( _left . _left ! . IsNullableFor ( context ) ?
1114
+ prunedNode = _left . _kind switch
1115
+ {
1116
+ // If the left side is a concatenation, then we bring it to right associative form to expose
1117
+ // the first element of the concatenation for the cases below.
1118
+ SymbolicRegexNodeKind . Concat => CreateConcat ( builder , _left . _left ! , CreateConcat ( builder , _left . _right ! , _right ) )
1119
+ . PruneLowerPriorityThanNullability ( builder , context ) ,
1120
+ // In a concatenation (X|Y)Z priority is given to XZ when X is nullable.
1121
+ // Observe that (X|Y)Z is equivalent to (XZ|YZ) and the branch YZ must be ignored
1122
+ // when X is nullable (observe that XZ is nullable because this=(X|Y)Z is nullable).
1123
+ // If X is not nullable then XZ is maintaned as is, and YZ is pruned. For example,
1124
+ // (a{0,5}?|abc|b+)c* reduces to c*.
1125
+ SymbolicRegexNodeKind . Alternate => ( _left . _left ! . IsNullableFor ( context ) ?
1124
1126
CreateConcat ( builder , _left . _left , _right ) . PruneLowerPriorityThanNullability ( builder , context ) :
1125
- CreateAlternate ( builder , CreateConcat ( builder , _left . _left , _right ) , CreateConcat ( builder , _left . _right ! , _right ) . PruneLowerPriorityThanNullability ( builder , context ) ) ) :
1126
- CreateConcat ( builder , _left . PruneLowerPriorityThanNullability ( builder , context ) , _right . PruneLowerPriorityThanNullability ( builder , context ) ) ;
1127
+ CreateAlternate ( builder , CreateConcat ( builder , _left . _left , _right ) ,
1128
+ CreateConcat ( builder , _left . _right ! , _right ) . PruneLowerPriorityThanNullability ( builder , context ) , deduplicated : true ) ) ,
1129
+ // Loops have various cases that are handled uniformly for concatenations and standalone loops.
1130
+ SymbolicRegexNodeKind . Loop => PruneLoop ( builder , context , _left , _right ) ,
1131
+ // The previous cases handle all the ways that the left side of the concatenation could
1132
+ // contain branching. Thus if we get here it is safe to only prune the right side.
1133
+ _ => CreateConcat ( builder , _left , _right . PruneLowerPriorityThanNullability ( builder , context ) ) ,
1134
+ } ;
1127
1135
break ;
1128
1136
1129
1137
case SymbolicRegexNodeKind . Loop :
1130
- Debug . Assert ( _left is not null ) ;
1131
- // Lazy nullable loop reduces to (), i.e., the loop body is just forgotten
1132
- prunedNode = _info . IsLazyLoop && _lower == 0 ? builder . Epsilon :
1133
- CreateLoop ( builder , _left . PruneLowerPriorityThanNullability ( builder , context ) , _lower , _upper , _info . IsLazyLoop ) ;
1138
+ // Loops have various cases that are handled uniformly for standalone loops and concatenations.
1139
+ prunedNode = PruneLoop ( builder , context , this , builder . Epsilon ) ;
1134
1140
break ;
1135
1141
1136
1142
case SymbolicRegexNodeKind . Effect :
@@ -1147,6 +1153,73 @@ private SymbolicRegexNode<TSet> PruneLowerPriorityThanNullability(SymbolicRegexB
1147
1153
1148
1154
builder . _pruneLowerPriorityThanNullabilityCache [ key ] = prunedNode ;
1149
1155
return prunedNode ;
1156
+
1157
+ static SymbolicRegexNode < TSet > PruneLoop ( SymbolicRegexBuilder < TSet > builder , uint context , SymbolicRegexNode < TSet > loop , SymbolicRegexNode < TSet > tail )
1158
+ {
1159
+ Debug . Assert ( loop . Kind == SymbolicRegexNodeKind . Loop && loop . _left is not null ) ;
1160
+
1161
+ if ( loop . _lower == 0 )
1162
+ {
1163
+ // Loop is nullable at least due to a zero lower bound and the cases below handle the different
1164
+ // priorities of checking that lower bound.
1165
+ if ( loop . IsLazy )
1166
+ {
1167
+ // In a lazy loop nullability from the zero lower bound has highest priority, so the loop is skipped completely
1168
+ return tail . PruneLowerPriorityThanNullability ( builder , context ) ;
1169
+ }
1170
+ else if ( ! loop . _left . IsNullableFor ( context ) )
1171
+ {
1172
+ // For an eager loop with a non-nullable body, the nullability from the zero lower bound has lowest priority.
1173
+ // Handle by case splitting into (1) doing at least one iteration in the loop and (2) skipping the loop and
1174
+ // continuing directly into the tail, which then must be pruned in the current context.
1175
+ return CreateAlternate ( builder ,
1176
+ CreateConcat ( builder , CreateLoop ( builder , loop . _left , 1 , loop . _upper , loop . IsLazy ) , tail ) ,
1177
+ tail . PruneLowerPriorityThanNullability ( builder , context ) ) ;
1178
+ }
1179
+ else if ( loop . _upper == int . MaxValue )
1180
+ {
1181
+ // The special case of a R*S loop with a nullable body is handled separately, as the general case handler could cause
1182
+ // infinite recursion. The paths that backtracking would explore before stopping here are (1) consuming a character in the
1183
+ // first iteration of the loop and (2) skipping the loop and continuing with anything higher priority than nullability in S.
1184
+ // For example, a pattern (a|b??)*c? would prune into essentially a?(a|b??)*c?|c?. Note that pruning only one peeled-out
1185
+ // iteration of the loop is necessary, since anchors could change the priority of nullability in other locations in the input.
1186
+ // Additionally, a high-priority nullable body will always be skipped, in which case only option (2) is included. Finally, in
1187
+ // all cases the loop body must not be dropped, as doing so could affect whether some capture groups match zero characters or
1188
+ // don't match at all.
1189
+ SymbolicRegexNode < TSet > skipLoopCase = CreateConcat ( builder , loop . _left . PruneLowerPriorityThanNullability ( builder , context ) ,
1190
+ tail . PruneLowerPriorityThanNullability ( builder , context ) ) ;
1191
+ return loop . _left . IsHighPriorityNullableFor ( context ) ? skipLoopCase : CreateAlternate ( builder ,
1192
+ CreateConcat ( builder , loop . _left . PruneLowerPriorityThanNullability ( builder , context ) , CreateConcat ( builder , loop , tail ) ) ,
1193
+ skipLoopCase ) ;
1194
+ }
1195
+ // For an eager loop with finite upper bound and nullable body fall back to the general case handler
1196
+ }
1197
+
1198
+ Debug . Assert ( loop . _left . IsNullableFor ( context ) ) ;
1199
+ // The general case handler peels one iteration out of the loop and prunes the resulting concatenation
1200
+ return CreateConcat ( builder , loop . _left , CreateConcat ( builder , loop . CreateLoopContinuation ( builder ) , tail ) )
1201
+ . PruneLowerPriorityThanNullability ( builder , context ) ;
1202
+ }
1203
+ }
1204
+
1205
+ /// <summary>
1206
+ /// Creates the remainder of a loop when one iteration is peeled out of it. In other words, for a loop R with
1207
+ /// a body B returns a new regex S such that R is equivalent to BS.
1208
+ /// </summary>
1209
+ /// <param name="builder">the builder that owns this node</param>
1210
+ /// <returns>the loop continuation regex</returns>
1211
+ private SymbolicRegexNode < TSet > CreateLoopContinuation ( SymbolicRegexBuilder < TSet > builder )
1212
+ {
1213
+ Debug . Assert ( _kind == SymbolicRegexNodeKind . Loop && _left is not null ) ;
1214
+
1215
+ // Note that the upper bound is guaranteed to be greater than zero, since otherwise the loop would have
1216
+ // been simplified to nothing, and int.MaxValue is treated as infinity.
1217
+ int newupper = _upper == int . MaxValue ? int . MaxValue : _upper - 1 ;
1218
+ // Do not decrement the lower bound if it equals int.MaxValue
1219
+ int newlower = _lower == 0 || _lower == int . MaxValue ? _lower : _lower - 1 ;
1220
+
1221
+ // The continued loop becomes epsilon when newlower == newupper == 0
1222
+ return builder . CreateLoop ( _left , IsLazy , newlower , newupper ) ;
1150
1223
}
1151
1224
1152
1225
/// <summary>
@@ -1244,22 +1317,18 @@ private SymbolicRegexNode<TSet> CreateDerivative(SymbolicRegexBuilder<TSet> buil
1244
1317
Debug . Assert ( _left is not null ) ;
1245
1318
Debug . Assert ( _upper > 0 ) ;
1246
1319
1247
- SymbolicRegexNode < TSet > bodyDerivative = _left . CreateDerivative ( builder , elem , context ) ;
1248
- if ( bodyDerivative . IsNothing ( builder . _solver ) )
1320
+ if ( _lower == 0 || _left . IsNullable || ! _left . IsNullableFor ( context ) )
1249
1321
{
1250
- derivative = builder . _nothing ;
1322
+ // In these special cases the derivative concatenates the body's derivative with the decremented loop, so
1323
+ // d(R{m,n}) = d(R)R{max(0,m-1),n-1}.
1324
+ derivative = builder . CreateConcat ( _left . CreateDerivative ( builder , elem , context ) , CreateLoopContinuation ( builder ) ) ;
1251
1325
}
1252
1326
else
1253
1327
{
1254
- // The loop derivative peels out one iteration and concatenates the body's derivative with the decremented loop,
1255
- // so d(R{m,n}) = d(R)R{max(0,m-1),n-1}. Note that n is guaranteed to be greater than zero, since otherwise the
1256
- // loop would have been simplified to nothing, and int.MaxValue is treated as infinity.
1257
- int newupper = _upper == int . MaxValue ? int . MaxValue : _upper - 1 ;
1258
- // do not decrement the lower bound if it equals int.MaxValue
1259
- int newlower = _lower == 0 || _lower == int . MaxValue ? _lower : _lower - 1 ;
1260
- // the continued loop becomes epsilon when newlower == newupper == 0
1261
- // in which case the returned concatenation will be just bodyDerivative
1262
- derivative = builder . CreateConcat ( bodyDerivative , builder . CreateLoop ( _left , IsLazy , newlower , newupper ) ) ;
1328
+ // In the general case the concatenation must be created first and the derivative taken on that. For example,
1329
+ // the first derivative for (a|\b){2} with an input "ac" is (a|\b)|epsilon, but the rule above would produce
1330
+ // just (a|\b).
1331
+ derivative = builder . CreateConcat ( _left , CreateLoopContinuation ( builder ) ) . CreateDerivative ( builder , elem , context ) ;
1263
1332
}
1264
1333
break ;
1265
1334
}
0 commit comments