@@ -73,7 +73,6 @@ const CACHE_LIMIT: usize = 2 * (1<<20);
73
73
/// of tracking multi-byte assertions in the DFA.
74
74
pub fn can_exec ( insts : & Program ) -> bool {
75
75
use prog:: Inst :: * ;
76
- use prog:: EmptyLook :: * ;
77
76
// If for some reason we manage to allocate a regex program with more
78
77
// than STATE_MAX instructions, then we can't execute the DFA because we
79
78
// use 32 bit pointers with some of the bits reserved for special use.
@@ -83,14 +82,7 @@ pub fn can_exec(insts: &Program) -> bool {
83
82
for inst in insts {
84
83
match * inst {
85
84
Char ( _) | Ranges ( _) => return false ,
86
- EmptyLook ( ref inst) => {
87
- match inst. look {
88
- WordBoundary | NotWordBoundary => return false ,
89
- WordBoundaryAscii | NotWordBoundaryAscii => { }
90
- StartLine | EndLine | StartText | EndText => { }
91
- }
92
- }
93
- Match ( _) | Save ( _) | Split ( _) | Bytes ( _) => { }
85
+ EmptyLook ( _) | Match ( _) | Save ( _) | Split ( _) | Bytes ( _) => { }
94
86
}
95
87
}
96
88
true
@@ -296,17 +288,22 @@ const STATE_UNKNOWN: StatePtr = 1<<31;
296
288
/// once it is entered, no match can ever occur.
297
289
const STATE_DEAD : StatePtr = 1 <<30 ;
298
290
291
+ /// A quit state means that the DFA came across some input that it doesn't
292
+ /// know how to process correctly. The DFA should quit and another matching
293
+ /// engine should be run in its place. Like the unknown and dead sentinels, it is stored directly in a transition slot and compared as a bare value; it carries no state pointer in its lower bits.
294
+ const STATE_QUIT : StatePtr = 1 <<29 ;
295
+
299
296
/// A start state is a state that the DFA can start in.
300
297
///
301
298
/// Note that unlike unknown, dead and quit states, start states have their lower
302
299
/// bits set to a state pointer.
303
- const STATE_START : StatePtr = 1 <<29 ;
300
+ const STATE_START : StatePtr = 1 <<28 ;
304
301
305
302
/// A match state means that the regex has successfully matched.
306
303
///
307
304
/// Note that unlike unknown, dead and quit states, match states have their lower
308
305
/// bits set to a state pointer.
309
- const STATE_MATCH : StatePtr = 1 <<28 ;
306
+ const STATE_MATCH : StatePtr = 1 <<27 ;
310
307
311
308
/// The maximum state pointer. Every bit below the match flag is set, so `ptr & STATE_MAX` masks a flagged pointer down to the bits below STATE_MATCH (the lowest of the flag constants visible here).
312
309
const STATE_MAX : StatePtr = STATE_MATCH - 1 ;
@@ -591,7 +588,10 @@ impl<'a> Fsm<'a> {
591
588
None => return Result :: NoMatch ,
592
589
Some ( i) => i,
593
590
} ;
594
- } else if next_si >= STATE_DEAD {
591
+ } else if next_si >= STATE_QUIT {
592
+ if next_si & STATE_QUIT > 0 {
593
+ return Result :: Quit ;
594
+ }
595
595
// Finally, this corresponds to the case where the transition
596
596
// entered a state that can never lead to a match or a state
597
597
// that hasn't been computed yet. The latter being the "slow"
@@ -697,7 +697,10 @@ impl<'a> Fsm<'a> {
697
697
if self . at < cur {
698
698
result = Result :: Match ( self . at + 2 ) ;
699
699
}
700
- } else if next_si >= STATE_DEAD {
700
+ } else if next_si >= STATE_QUIT {
701
+ if next_si & STATE_QUIT > 0 {
702
+ return Result :: Quit ;
703
+ }
701
704
let byte = Byte :: byte ( text[ self . at ] ) ;
702
705
prev_si &= STATE_MAX ;
703
706
next_si = match self . next_state ( qcur, qnext, prev_si, byte) {
@@ -986,10 +989,15 @@ impl<'a> Fsm<'a> {
986
989
NotWordBoundaryAscii if flags. not_word_boundary => {
987
990
self . cache . stack . push ( inst. goto as InstPtr ) ;
988
991
}
992
+ WordBoundary if flags. word_boundary => {
993
+ self . cache . stack . push ( inst. goto as InstPtr ) ;
994
+ }
995
+ NotWordBoundary if flags. not_word_boundary => {
996
+ self . cache . stack . push ( inst. goto as InstPtr ) ;
997
+ }
989
998
StartLine | EndLine | StartText | EndText => { }
990
999
WordBoundaryAscii | NotWordBoundaryAscii => { }
991
- // The DFA doesn't support Unicode word boundaries. :-(
992
- WordBoundary | NotWordBoundary => unreachable ! ( ) ,
1000
+ WordBoundary | NotWordBoundary => { }
993
1001
}
994
1002
}
995
1003
Save ( ref inst) => self . cache . stack . push ( inst. goto as InstPtr ) ,
@@ -1057,7 +1065,12 @@ impl<'a> Fsm<'a> {
1057
1065
1058
1066
// OK, now there's enough room to push our new state.
1059
1067
// We do this even if the cache size is set to 0!
1060
- let trans = Transitions :: new ( self . num_byte_classes ( ) ) ;
1068
+ let mut trans = Transitions :: new ( self . num_byte_classes ( ) ) ;
1069
+ if self . prog . has_unicode_word_boundary {
1070
+ for b in 128 ..256 {
1071
+ trans[ self . byte_class ( Byte :: byte ( b as u8 ) ) ] = STATE_QUIT ;
1072
+ }
1073
+ }
1061
1074
let si = usize_to_u32 ( self . cache . states . len ( ) ) ;
1062
1075
self . cache . states . push ( State {
1063
1076
insts : key. insts . clone ( ) ,
@@ -1120,15 +1133,14 @@ impl<'a> Fsm<'a> {
1120
1133
state_flags. set_empty ( ) ;
1121
1134
insts. push ( ip) ;
1122
1135
}
1123
- WordBoundaryAscii => {
1136
+ WordBoundary | WordBoundaryAscii => {
1124
1137
state_flags. set_empty ( ) ;
1125
1138
insts. push ( ip) ;
1126
1139
}
1127
- NotWordBoundaryAscii => {
1140
+ NotWordBoundary | NotWordBoundaryAscii => {
1128
1141
state_flags. set_empty ( ) ;
1129
1142
insts. push ( ip) ;
1130
1143
}
1131
- WordBoundary | NotWordBoundary => unreachable ! ( ) ,
1132
1144
}
1133
1145
}
1134
1146
Match ( _) => {
@@ -1226,7 +1238,12 @@ impl<'a> Fsm<'a> {
1226
1238
return si;
1227
1239
}
1228
1240
let si = usize_to_u32 ( self . cache . states . len ( ) ) ;
1229
- let trans = Transitions :: new ( self . num_byte_classes ( ) ) ;
1241
+ let mut trans = Transitions :: new ( self . num_byte_classes ( ) ) ;
1242
+ if self . prog . has_unicode_word_boundary {
1243
+ for b in 128 ..256 {
1244
+ trans[ self . byte_class ( Byte :: byte ( b as u8 ) ) ] = STATE_QUIT ;
1245
+ }
1246
+ }
1230
1247
self . cache . states . push ( state) ;
1231
1248
self . cache . trans . push ( trans) ;
1232
1249
self . cache . compiled . insert ( key, si) ;
@@ -1257,8 +1274,9 @@ impl<'a> Fsm<'a> {
1257
1274
}
1258
1275
match self . cache . trans [ si as usize ] [ self . byte_class ( b) ] {
1259
1276
STATE_UNKNOWN => self . exec_byte ( qcur, qnext, si, b) ,
1260
- STATE_DEAD => return Some ( STATE_DEAD ) ,
1261
- nsi => return Some ( nsi) ,
1277
+ STATE_QUIT => None ,
1278
+ STATE_DEAD => Some ( STATE_DEAD ) ,
1279
+ nsi => Some ( nsi) ,
1262
1280
}
1263
1281
}
1264
1282
0 commit comments