Skip to content

Commit 0a55280

Browse files
committed
Update UCP handling of \b and \B to match recent changes to \w
1 parent a0b4ee0 commit 0a55280

File tree

7 files changed

+56
-20
lines changed

7 files changed

+56
-20
lines changed

ChangeLog

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,9 @@ matches the "fullwidth" versions of the hex digits. Just like it is done for
134134

135135
34. GitHub PR305 fixes a potential integer overflow in pcre2_dfa_match().
136136

137+
35. Updated handling of \b and \B in UCP mode to match the changes to \w in item 32
138+
above because \b and \B are defined in terms of \w.
139+
137140

138141
Version 10.42 11-December-2022
139142
------------------------------

src/pcre2_dfa_match.c

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1119,11 +1119,10 @@ for (;;)
11191119
if (codevalue == OP_UCP_WORD_BOUNDARY ||
11201120
codevalue == OP_NOT_UCP_WORD_BOUNDARY)
11211121
{
1122-
if (d == '_') left_word = TRUE; else
1123-
{
1124-
uint32_t cat = UCD_CATEGORY(d);
1125-
left_word = (cat == ucp_L || cat == ucp_N);
1126-
}
1122+
int chartype = UCD_CHARTYPE(d);
1123+
int category = PRIV(ucp_gentype)[chartype];
1124+
left_word = (category == ucp_L || category == ucp_N ||
1125+
chartype == ucp_Mn || chartype == ucp_Pc);
11271126
}
11281127
else
11291128
#endif
@@ -1145,11 +1144,10 @@ for (;;)
11451144
if (codevalue == OP_UCP_WORD_BOUNDARY ||
11461145
codevalue == OP_NOT_UCP_WORD_BOUNDARY)
11471146
{
1148-
if (c == '_') right_word = TRUE; else
1149-
{
1150-
uint32_t cat = UCD_CATEGORY(c);
1151-
right_word = (cat == ucp_L || cat == ucp_N);
1152-
}
1147+
int chartype = UCD_CHARTYPE(c);
1148+
int category = PRIV(ucp_gentype)[chartype];
1149+
right_word = (category == ucp_L || category == ucp_N ||
1150+
chartype == ucp_Mn || chartype == ucp_Pc);
11531151
}
11541152
else
11551153
#endif

src/pcre2_match.c

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6180,11 +6180,10 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
61806180
#ifdef SUPPORT_UNICODE
61816181
if (Fop == OP_UCP_WORD_BOUNDARY || Fop == OP_NOT_UCP_WORD_BOUNDARY)
61826182
{
6183-
if (fc == '_') prev_is_word = TRUE; else
6184-
{
6185-
int cat = UCD_CATEGORY(fc);
6186-
prev_is_word = (cat == ucp_L || cat == ucp_N);
6187-
}
6183+
int chartype = UCD_CHARTYPE(fc);
6184+
int category = PRIV(ucp_gentype)[chartype];
6185+
prev_is_word = (category == ucp_L || category == ucp_N ||
6186+
chartype == ucp_Mn || chartype == ucp_Pc);
61886187
}
61896188
else
61906189
#endif /* SUPPORT_UNICODE */
@@ -6214,11 +6213,10 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
62146213
#ifdef SUPPORT_UNICODE
62156214
if (Fop == OP_UCP_WORD_BOUNDARY || Fop == OP_NOT_UCP_WORD_BOUNDARY)
62166215
{
6217-
if (fc == '_') cur_is_word = TRUE; else
6218-
{
6219-
int cat = UCD_CATEGORY(fc);
6220-
cur_is_word = (cat == ucp_L || cat == ucp_N);
6221-
}
6216+
int chartype = UCD_CHARTYPE(fc);
6217+
int category = PRIV(ucp_gentype)[chartype];
6218+
cur_is_word = (category == ucp_L || category == ucp_N ||
6219+
chartype == ucp_Mn || chartype == ucp_Pc);
62226220
}
62236221
else
62246222
#endif /* SUPPORT_UNICODE */

testdata/testinput4

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2856,4 +2856,10 @@
28562856
/[[:xdigit:]]+/utf,ucp
28572857
--123ef\x{ff10}\x{ff19}\x{ff21}\x{ff26}\x{ff1a}\=no_jit
28582858

2859+
/\b.+?\b/utf,ucp
2860+
--cafe\x{300}_au\x{203f}lait!\=no_jit
2861+
2862+
/caf\B.+?\B/utf,ucp
2863+
--cafe\x{300}_au\x{203f}lait!\=no_jit
2864+
28592865
# End of testinput4

testdata/testinput7

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2478,4 +2478,10 @@
24782478
/[\w]+/utf,ucp
24792479
--cafe\x{300}_au\x{203f}lait!
24802480

2481+
/\b.+?\b/utf,ucp
2482+
--cafe\x{300}_au\x{203f}lait!
2483+
2484+
/caf\B.+?\B/utf,ucp
2485+
--cafe\x{300}_au\x{203f}lait!
2486+
24812487
# End of testinput7

testdata/testoutput4

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4567,4 +4567,12 @@ No match
45674567
--123ef\x{ff10}\x{ff19}\x{ff21}\x{ff26}\x{ff1a}\=no_jit
45684568
0: 123ef\x{ff10}\x{ff19}\x{ff21}\x{ff26}
45694569

4570+
/\b.+?\b/utf,ucp
4571+
--cafe\x{300}_au\x{203f}lait!\=no_jit
4572+
0: cafe\x{300}_au\x{203f}lait
4573+
4574+
/caf\B.+?\B/utf,ucp
4575+
--cafe\x{300}_au\x{203f}lait!\=no_jit
4576+
0: cafe
4577+
45704578
# End of testinput4

testdata/testoutput7

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4139,4 +4139,21 @@ No match
41394139
--cafe\x{300}_au\x{203f}lait!
41404140
0: cafe\x{300}_au\x{203f}lait
41414141

4142+
/\b.+?\b/utf,ucp
4143+
--cafe\x{300}_au\x{203f}lait!
4144+
0: cafe\x{300}_au\x{203f}lait
4145+
4146+
/caf\B.+?\B/utf,ucp
4147+
--cafe\x{300}_au\x{203f}lait!
4148+
0: cafe\x{300}_au\x{203f}lait!
4149+
1: cafe\x{300}_au\x{203f}lai
4150+
2: cafe\x{300}_au\x{203f}la
4151+
3: cafe\x{300}_au\x{203f}l
4152+
4: cafe\x{300}_au\x{203f}
4153+
5: cafe\x{300}_au
4154+
6: cafe\x{300}_a
4155+
7: cafe\x{300}_
4156+
8: cafe\x{300}
4157+
9: cafe
4158+
41424159
# End of testinput7

0 commit comments

Comments
 (0)