Skip to content

Commit 9aa1220

Browse files
author
Zoltan Herczeg
committed
Pre-compute unicode category list for xclasses
1 parent 2e03e32 commit 9aa1220

File tree

8 files changed

+305
-80
lines changed

8 files changed

+305
-80
lines changed

src/pcre2_compile_class.c

Lines changed: 112 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -115,10 +115,11 @@ while (TRUE)
115115

116116
#ifdef SUPPORT_UNICODE
117117

118-
#define PARSE_CLASS_UTF 0x1
119-
#define PARSE_CLASS_CASELESS_UTF 0x2
120-
#define PARSE_CLASS_RESTRICTED_UTF 0x4
121-
#define PARSE_CLASS_TURKISH_UTF 0x8
118+
#define PARSE_CLASS_UTF 0x01
119+
#define PARSE_CLASS_CASELESS_UTF 0x02
120+
#define PARSE_CLASS_RESTRICTED_UTF 0x04
121+
#define PARSE_CLASS_TURKISH_UTF 0x08
122+
#define PARSE_CLASS_COMPUTE_CATLIST 0x10
122123

123124
/* Get the range of nocase characters which includes the
124125
'c' character passed as argument, or directly follows 'c'. */
@@ -357,13 +358,28 @@ append_non_ascii_range(uint32_t options, uint32_t *buffer)
357358
return buffer + 2;
358359
}
359360

361+
/* The buffer may represent the category list pointer when utf is enabled. */
360362
static size_t
361363
parse_class(uint32_t *ptr, uint32_t options, uint32_t *buffer)
362364
{
363365
size_t total_size = 0;
364366
size_t size;
365367
uint32_t meta_arg;
366368
uint32_t start_char;
369+
uint32_t ptype;
370+
#ifdef SUPPORT_UNICODE
371+
uint32_t pdata;
372+
uint32_t category_list;
373+
uint32_t *pcategory_list = NULL;
374+
#endif
375+
376+
#ifdef SUPPORT_UNICODE
377+
if ((options & PARSE_CLASS_COMPUTE_CATLIST) != 0)
378+
{
379+
pcategory_list = buffer;
380+
buffer = NULL;
381+
}
382+
#endif
367383

368384
while (TRUE)
369385
{
@@ -407,7 +423,8 @@ while (TRUE)
407423
case ESC_p:
408424
case ESC_P:
409425
ptr++;
410-
if (meta_arg == ESC_p && (*ptr >> 16) == PT_ANY)
426+
ptype = (*ptr >> 16);
427+
if (meta_arg == ESC_p && ptype == PT_ANY)
411428
{
412429
if (buffer != NULL)
413430
{
@@ -417,6 +434,43 @@ while (TRUE)
417434
}
418435
total_size += 2;
419436
}
437+
#ifdef SUPPORT_UNICODE
438+
if (pcategory_list == NULL) break;
439+
440+
category_list = 0;
441+
442+
switch(ptype)
443+
{
444+
case PT_LAMP:
445+
category_list = UCPCAT3(ucp_Lu, ucp_Ll, ucp_Lt);
446+
break;
447+
448+
case PT_GC:
449+
pdata = *ptr & 0xffff;
450+
category_list = UCPCAT_RANGE(PRIV(ucp_typerange)[pdata],
451+
PRIV(ucp_typerange)[pdata + 1] - 1);
452+
break;
453+
454+
case PT_PC:
455+
pdata = *ptr & 0xffff;
456+
category_list = UCPCAT(pdata);
457+
break;
458+
459+
case PT_WORD:
460+
category_list = UCPCAT2(ucp_Mn, ucp_Pc) | UCPCAT_L | UCPCAT_N;
461+
break;
462+
463+
case PT_ALNUM:
464+
category_list = UCPCAT_L | UCPCAT_N;
465+
break;
466+
}
467+
468+
if (category_list > 0)
469+
{
470+
if (meta_arg == ESC_P) category_list ^= UCPCAT_ALL;
471+
*pcategory_list |= category_list;
472+
}
473+
#endif
420474
break;
421475
}
422476
ptr++;
@@ -511,6 +565,9 @@ const uint32_t *char_list_next;
511565
uint16_t *next_char;
512566
uint32_t char_list_start, char_list_end;
513567
uint32_t range_start, range_end;
568+
#ifdef SUPPORT_UNICODE
569+
uint32_t category_list = 0;
570+
#endif
514571

515572
#ifdef SUPPORT_UNICODE
516573
if (options & PCRE2_UTF)
@@ -531,11 +588,22 @@ if (xoptions & PCRE2_EXTRA_TURKISH_CASING)
531588

532589
/* Compute required space for the range. */
533590

591+
#ifdef SUPPORT_UNICODE
592+
range_list_size = parse_class(start_ptr,
593+
class_options | PARSE_CLASS_COMPUTE_CATLIST,
594+
&category_list);
595+
#else
534596
range_list_size = parse_class(start_ptr, class_options, NULL);
597+
#endif
535598
PCRE2_ASSERT((range_list_size & 0x1) == 0);
536599

537600
/* Allocate buffer. The total_size also represents the end of the buffer. */
538601

602+
#ifdef SUPPORT_UNICODE
603+
/* Replaced by an OP_ALLANY. */
604+
if (category_list == UCPCAT_ALL) range_list_size = 2;
605+
#endif
606+
539607
total_size = range_list_size +
540608
((range_list_size >= 2) ? CHAR_LIST_EXTRA_SIZE : 0);
541609

@@ -553,6 +621,21 @@ cranges->range_list_size = (uint16_t)range_list_size;
553621
cranges->char_lists_types = 0;
554622
cranges->char_lists_size = 0;
555623
cranges->char_lists_start = 0;
624+
#ifdef SUPPORT_UNICODE
625+
cranges->category_list = category_list;
626+
#endif
627+
628+
#ifdef SUPPORT_UNICODE
629+
if (category_list == UCPCAT_ALL)
630+
{
631+
/* Replace the xclass with OP_ALLANY. */
632+
cranges->category_list = 0;
633+
buffer = (uint32_t*)(cranges + 1);
634+
buffer[0] = 0;
635+
buffer[1] = get_highest_char(class_options);
636+
return cranges;
637+
}
638+
#endif
556639

557640
if (range_list_size == 0) return cranges;
558641

@@ -1087,6 +1170,7 @@ BOOL utf = FALSE;
10871170

10881171
#ifdef SUPPORT_WIDE_CHARS
10891172
uint32_t xclass_props;
1173+
uint32_t category_list;
10901174
PCRE2_UCHAR *class_uchardata;
10911175
class_ranges* cranges;
10921176
#else
@@ -1107,6 +1191,7 @@ should_flip_negation = FALSE;
11071191

11081192
#ifdef SUPPORT_WIDE_CHARS
11091193
xclass_props = 0;
1194+
category_list = 0;
11101195

11111196
#if PCRE2_CODE_UNIT_WIDTH == 8
11121197
cranges = NULL;
@@ -1140,6 +1225,9 @@ if (utf)
11401225
cb->first_data = cranges->header.next;
11411226
}
11421227

1228+
category_list = cranges->category_list;
1229+
PCRE2_ASSERT(category_list != UCPCAT_ALL);
1230+
11431231
if (cranges->range_list_size > 0)
11441232
{
11451233
const uint32_t *ranges = (const uint32_t*)(cranges + 1);
@@ -1154,6 +1242,13 @@ if (utf)
11541242
}
11551243

11561244
class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
1245+
1246+
if (cranges != NULL && category_list != 0 &&
1247+
(xclass_props & XCLASS_HIGH_ANY) == 0)
1248+
{
1249+
xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS;
1250+
class_uchardata += sizeof(uint32_t) / sizeof(PCRE2_UCHAR);
1251+
}
11571252
#endif /* SUPPORT_WIDE_CHARS */
11581253

11591254
/* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
@@ -1444,7 +1539,9 @@ while (TRUE)
14441539

14451540
PRIV(update_classbits)(ptype, pdata, (escape == ESC_P), classbits);
14461541

1447-
if ((xclass_props & XCLASS_HIGH_ANY) == 0)
1542+
if ((xclass_props & XCLASS_HIGH_ANY) == 0 &&
1543+
ptype != PT_LAMP && ptype != PT_GC && ptype != PT_PC &&
1544+
ptype != PT_WORD && ptype != PT_ALNUM)
14481545
{
14491546
if (lengthptr != NULL)
14501547
*lengthptr += 3;
@@ -1709,6 +1806,15 @@ if ((xclass_props & XCLASS_REQUIRED) != 0)
17091806
*code = negate_class? XCL_NOT:0;
17101807
if ((xclass_props & XCLASS_HAS_PROPS) != 0) *code |= XCL_HASPROP;
17111808

1809+
/* The category_list is placed after the class feature bitset.
1810+
The code pointer is not increased, because the bitset for the
1811+
first 256 characters may be injected after the feature bitset. */
1812+
if (category_list != 0)
1813+
{
1814+
*code |= XCL_HASCATLIST;
1815+
memmove(code + 1, &category_list, sizeof(uint32_t));
1816+
}
1817+
17121818
/* If the map is required, move up the extra data to make room for it;
17131819
otherwise just move the code pointer to the end of the extra data. */
17141820

src/pcre2_internal.h

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1515,9 +1515,10 @@ table. */
15151515
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
15161516
contain characters with values greater than 255. */
15171517

1518-
#define XCL_NOT 0x01 /* Flag: this is a negative class */
1519-
#define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
1520-
#define XCL_HASPROP 0x04 /* Flag: property checks are present. */
1518+
#define XCL_NOT 0x01 /* Flag: this is a negative class */
1519+
#define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
1520+
#define XCL_HASPROP 0x04 /* Flag: property checks are present */
1521+
#define XCL_HASCATLIST 0x08 /* Flag: category list is present */
15211522

15221523
#define XCL_END 0 /* Marks end of individual items */
15231524
#define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
@@ -2189,6 +2190,18 @@ typedef struct {
21892190
((uint32_t)(ch) == 0x0130u ? 0x69u : \
21902191
(uint32_t)(ch) == 0x49u ? 0x0131u : (uint32_t)(ch))
21912192

2193+
/* UCP bitset manipulation macros. */
2194+
2195+
#ifdef SUPPORT_UNICODE
2196+
#define UCPCAT(bit) (1 << (bit))
2197+
#define UCPCAT2(bit1, bit2) (UCPCAT(bit1) | UCPCAT(bit2))
2198+
#define UCPCAT3(bit1, bit2, bit3) (UCPCAT(bit1) | UCPCAT(bit2) | UCPCAT(bit3))
2199+
#define UCPCAT_RANGE(start, end) (((1 << ((end) + 1)) - 1) - ((1 << (start)) - 1))
2200+
#define UCPCAT_L UCPCAT_RANGE(ucp_Ll, ucp_Lu)
2201+
#define UCPCAT_N UCPCAT_RANGE(ucp_Nd, ucp_No)
2202+
#define UCPCAT_ALL ((1 << (ucp_Zs + 1)) - 1)
2203+
#endif
2204+
21922205
/* The "scriptx" and bprops fields contain offsets into vectors of 32-bit words
21932206
that form a bitmap representing a list of scripts or boolean properties. These
21942207
macros test or set a bit in the map by number. */

src/pcre2_intmodedep.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -754,6 +754,9 @@ typedef struct class_ranges {
754754
compile_data header; /* Common header */
755755
size_t char_lists_size; /* Total size of encoded char lists */
756756
size_t char_lists_start; /* Start offset of encoded char lists */
757+
#ifdef SUPPORT_UNICODE
758+
uint32_t category_list; /* Bitset of matching unicode categories. */
759+
#endif
757760
uint16_t range_list_size; /* Size of ranges array */
758761
uint16_t char_lists_types; /* The XCL_LIST header of char lists */
759762
/* Followed by the list of ranges (start/end pairs) */

src/pcre2_jit_char_inc.h

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -533,6 +533,13 @@ if (flags & XCL_MAP)
533533
cc += 32 / sizeof(PCRE2_UCHAR);
534534

535535
#ifdef SUPPORT_UNICODE
536+
if (flags & XCL_HASCATLIST)
537+
{
538+
memcpy(&category_list, cc, sizeof(uint32_t));
539+
status |= XCLASS_HAS_TYPE;
540+
cc += sizeof(uint32_t) / sizeof(PCRE2_UCHAR);
541+
}
542+
536543
while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
537544
{
538545
compares++;
@@ -542,12 +549,14 @@ while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
542549

543550
switch(*cc)
544551
{
552+
/* JIT compiles bare (not in class) escape sequences using
553+
this code path, so setting categories must be kept. */
545554
case PT_LAMP:
546555
items = UCPCAT3(ucp_Lu, ucp_Ll, ucp_Lt);
547556
break;
548557

549558
case PT_GC:
550-
items = UCPCAT_RANGE(PRIV(ucp_typerange)[(int)cc[1] * 2], PRIV(ucp_typerange)[(int)cc[1] * 2 + 1]);
559+
items = UCPCAT_RANGE(PRIV(ucp_typerange)[(int)cc[1]], PRIV(ucp_typerange)[(int)cc[1] + 1] - 1);
551560
break;
552561

553562
case PT_PC:
@@ -614,21 +623,7 @@ while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
614623
cc += 2;
615624
}
616625

617-
if (category_list == UCPCAT_ALL)
618-
{
619-
/* All or no characters are accepted, same as dotall. */
620-
if (status & XCLASS_IS_ECLASS)
621-
{
622-
if (list != backtracks)
623-
OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
624-
return;
625-
}
626-
627-
compile_char1_matchingpath(common, OP_ALLANY, cc, backtracks, FALSE);
628-
if (list == backtracks)
629-
add_jump(compiler, backtracks, JUMP(SLJIT_JUMP));
630-
return;
631-
}
626+
SLJIT_ASSERT(category_list != UCPCAT_ALL);
632627

633628
if (category_list != 0)
634629
compares++;
@@ -681,6 +676,9 @@ if ((flags & XCL_MAP) != 0)
681676
}
682677

683678
#ifdef SUPPORT_UNICODE
679+
if (flags & XCL_HASCATLIST)
680+
cc += sizeof(uint32_t) / sizeof(PCRE2_UCHAR);
681+
684682
if (status & XCLASS_NEEDS_UCD)
685683
{
686684
if ((status & (XCLASS_SAVE_CHAR | XCLASS_IS_ECLASS)) == XCLASS_SAVE_CHAR)

src/pcre2_jit_compile.c

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7147,16 +7147,6 @@ else
71477147
JUMPTO(SLJIT_JUMP, mainloop);
71487148
}
71497149

7150-
#ifdef SUPPORT_UNICODE
7151-
#define UCPCAT(bit) (1 << (bit))
7152-
#define UCPCAT2(bit1, bit2) (UCPCAT(bit1) | UCPCAT(bit2))
7153-
#define UCPCAT3(bit1, bit2, bit3) (UCPCAT(bit1) | UCPCAT(bit2) | UCPCAT(bit3))
7154-
#define UCPCAT_RANGE(start, end) (((1 << ((end) + 1)) - 1) - ((1 << (start)) - 1))
7155-
#define UCPCAT_L UCPCAT_RANGE(ucp_Ll, ucp_Lu)
7156-
#define UCPCAT_N UCPCAT_RANGE(ucp_Nd, ucp_No)
7157-
#define UCPCAT_ALL ((1 << (ucp_Zs + 1)) - 1)
7158-
#endif
7159-
71607150
static void check_wordboundary(compiler_common *common, BOOL ucp)
71617151
{
71627152
DEFINE_COMPILER;

0 commit comments

Comments
 (0)