Skip to content

Commit c537dcf

Browse files
author
Zoltan Herczeg
committed
Pre-compute unicode category list for xclasses
1 parent ff375d6 commit c537dcf

File tree

8 files changed

+244
-80
lines changed

8 files changed

+244
-80
lines changed

src/pcre2_compile_class.c

Lines changed: 108 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -116,10 +116,11 @@ while (TRUE)
116116

117117
#ifdef SUPPORT_UNICODE
118118

119-
#define PARSE_CLASS_UTF 0x1
120-
#define PARSE_CLASS_CASELESS_UTF 0x2
121-
#define PARSE_CLASS_RESTRICTED_UTF 0x4
122-
#define PARSE_CLASS_TURKISH_UTF 0x8
119+
#define PARSE_CLASS_UTF 0x01
120+
#define PARSE_CLASS_CASELESS_UTF 0x02
121+
#define PARSE_CLASS_RESTRICTED_UTF 0x04
122+
#define PARSE_CLASS_TURKISH_UTF 0x08
123+
#define PARSE_CLASS_COMPUTE_CATLIST 0x10
123124

124125
/* Get the range of nocase characters which includes the
125126
'c' character passed as argument, or directly follows 'c'. */
@@ -358,13 +359,28 @@ append_non_ascii_range(uint32_t options, uint32_t *buffer)
358359
return buffer + 2;
359360
}
360361

362+
/* When PARSE_CLASS_COMPUTE_CATLIST is set in options (Unicode support only), buffer points to the category list to update instead of a range buffer. */
361363
static size_t
362364
parse_class(uint32_t *ptr, uint32_t options, uint32_t *buffer)
363365
{
364366
size_t total_size = 0;
365367
size_t size;
366368
uint32_t meta_arg;
367369
uint32_t start_char;
370+
uint32_t ptype;
371+
#ifdef SUPPORT_UNICODE
372+
uint32_t pdata;
373+
uint32_t category_list;
374+
uint32_t *pcategory_list = NULL;
375+
#endif
376+
377+
#ifdef SUPPORT_UNICODE
378+
if ((options & PARSE_CLASS_COMPUTE_CATLIST) != 0)
379+
{
380+
pcategory_list = buffer;
381+
buffer = NULL;
382+
}
383+
#endif
368384

369385
while (TRUE)
370386
{
@@ -408,7 +424,8 @@ while (TRUE)
408424
case ESC_p:
409425
case ESC_P:
410426
ptr++;
411-
if (meta_arg == ESC_p && (*ptr >> 16) == PT_ANY)
427+
ptype = (*ptr >> 16);
428+
if (meta_arg == ESC_p && ptype == PT_ANY)
412429
{
413430
if (buffer != NULL)
414431
{
@@ -418,6 +435,43 @@ while (TRUE)
418435
}
419436
total_size += 2;
420437
}
438+
#ifdef SUPPORT_UNICODE
439+
if (pcategory_list == NULL) break;
440+
441+
category_list = 0;
442+
443+
switch(ptype)
444+
{
445+
case PT_LAMP:
446+
category_list = UCPCAT3(ucp_Lu, ucp_Ll, ucp_Lt);
447+
break;
448+
449+
case PT_GC:
450+
pdata = *ptr & 0xffff;
451+
category_list = UCPCAT_RANGE(PRIV(ucp_typerange)[pdata],
452+
PRIV(ucp_typerange)[pdata + 1] - 1);
453+
break;
454+
455+
case PT_PC:
456+
pdata = *ptr & 0xffff;
457+
category_list = UCPCAT(pdata);
458+
break;
459+
460+
case PT_WORD:
461+
category_list = UCPCAT2(ucp_Mn, ucp_Pc) | UCPCAT_L | UCPCAT_N;
462+
break;
463+
464+
case PT_ALNUM:
465+
category_list = UCPCAT_L | UCPCAT_N;
466+
break;
467+
}
468+
469+
if (category_list > 0)
470+
{
471+
if (meta_arg == ESC_P) category_list ^= UCPCAT_ALL;
472+
*pcategory_list |= category_list;
473+
}
474+
#endif
421475
break;
422476
}
423477
ptr++;
@@ -512,6 +566,9 @@ const uint32_t *char_list_next;
512566
uint16_t *next_char;
513567
uint32_t char_list_start, char_list_end;
514568
uint32_t range_start, range_end;
569+
#ifdef SUPPORT_UNICODE
570+
uint32_t category_list = 0;
571+
#endif
515572

516573
#ifdef SUPPORT_UNICODE
517574
if (options & PCRE2_UTF)
@@ -529,11 +586,21 @@ if (xoptions & PCRE2_EXTRA_TURKISH_CASING)
529586

530587
/* Compute required space for the range. */
531588

589+
#ifdef SUPPORT_UNICODE
590+
range_list_size = parse_class(start_ptr,
591+
class_options | PARSE_CLASS_COMPUTE_CATLIST,
592+
&category_list);
593+
#else
532594
range_list_size = parse_class(start_ptr, class_options, NULL);
595+
#endif
533596
PCRE2_ASSERT((range_list_size & 0x1) == 0);
534597

535598
/* Allocate buffer. The total_size also represents the end of the buffer. */
536599

600+
#ifdef SUPPORT_UNICODE
601+
if (category_list == UCPCAT_ALL) range_list_size = 2;
602+
#endif
603+
537604
total_size = range_list_size +
538605
((range_list_size >= 2) ? CHAR_LIST_EXTRA_SIZE : 0);
539606

@@ -548,6 +615,21 @@ cranges->range_list_size = (uint16_t)range_list_size;
548615
cranges->char_lists_types = 0;
549616
cranges->char_lists_size = 0;
550617
cranges->char_lists_start = 0;
618+
#ifdef SUPPORT_UNICODE
619+
cranges->category_list = category_list;
620+
#endif
621+
622+
#ifdef SUPPORT_UNICODE
623+
if (category_list == UCPCAT_ALL)
624+
{
625+
/* Replace the xclass with OP_ALLANY. */
626+
cranges->category_list = 0;
627+
buffer = (uint32_t*)(cranges + 1);
628+
buffer[0] = 0;
629+
buffer[1] = get_highest_char(options);
630+
return cranges;
631+
}
632+
#endif
551633

552634
if (range_list_size == 0) return cranges;
553635

@@ -1042,6 +1124,7 @@ BOOL utf = FALSE;
10421124

10431125
#ifdef SUPPORT_WIDE_CHARS
10441126
uint32_t xclass_props;
1127+
uint32_t category_list;
10451128
PCRE2_UCHAR *class_uchardata;
10461129
class_ranges* cranges;
10471130
#endif
@@ -1058,6 +1141,7 @@ should_flip_negation = FALSE;
10581141

10591142
#ifdef SUPPORT_WIDE_CHARS
10601143
xclass_props = 0;
1144+
category_list = 0;
10611145

10621146
#if PCRE2_CODE_UNIT_WIDTH == 8
10631147
cranges = NULL;
@@ -1091,6 +1175,9 @@ if (utf)
10911175
cb->cranges = cranges->next;
10921176
}
10931177

1178+
category_list = cranges->category_list;
1179+
PCRE2_ASSERT(category_list != UCPCAT_ALL);
1180+
10941181
if (cranges->range_list_size > 0)
10951182
{
10961183
const uint32_t *ranges = (const uint32_t*)(cranges + 1);
@@ -1105,6 +1192,13 @@ if (utf)
11051192
}
11061193

11071194
class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
1195+
1196+
if (cranges != NULL && category_list != 0 &&
1197+
(xclass_props & XCLASS_HIGH_ANY) == 0)
1198+
{
1199+
xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS;
1200+
class_uchardata += sizeof(uint32_t) / sizeof(PCRE2_UCHAR);
1201+
}
11081202
#endif /* SUPPORT_WIDE_CHARS */
11091203

11101204
/* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
@@ -1380,7 +1474,9 @@ while (TRUE)
13801474

13811475
PRIV(update_classbits)(ptype, pdata, (escape == ESC_P), classbits);
13821476

1383-
if ((xclass_props & XCLASS_HIGH_ANY) == 0)
1477+
if ((xclass_props & XCLASS_HIGH_ANY) == 0 &&
1478+
ptype != PT_LAMP && ptype != PT_GC && ptype != PT_PC &&
1479+
ptype != PT_WORD && ptype != PT_ALNUM)
13841480
{
13851481
if (lengthptr != NULL)
13861482
*lengthptr += 3;
@@ -1640,6 +1736,12 @@ if ((xclass_props & XCLASS_REQUIRED) != 0)
16401736
code += LINK_SIZE;
16411737
*code = negate_class? XCL_NOT:0;
16421738
if ((xclass_props & XCLASS_HAS_PROPS) != 0) *code |= XCL_HASPROP;
1739+
/* This should be the last one. */
1740+
if (category_list != 0)
1741+
{
1742+
*code |= XCL_HASCATLIST;
1743+
memmove(code + 1, &category_list, sizeof(uint32_t));
1744+
}
16431745

16441746
/* If the map is required, move up the extra data to make room for it;
16451747
otherwise just move the code pointer to the end of the extra data. */

src/pcre2_internal.h

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1357,9 +1357,10 @@ table. */
13571357
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
13581358
contain characters with values greater than 255. */
13591359

1360-
#define XCL_NOT 0x01 /* Flag: this is a negative class */
1361-
#define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
1362-
#define XCL_HASPROP 0x04 /* Flag: property checks are present. */
1360+
#define XCL_NOT 0x01 /* Flag: this is a negative class */
1361+
#define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
1362+
#define XCL_HASPROP 0x04 /* Flag: property checks are present */
1363+
#define XCL_HASCATLIST 0x08 /* Flag: category list is present */
13631364

13641365
#define XCL_END 0 /* Marks end of individual items */
13651366
#define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
@@ -2031,6 +2032,18 @@ typedef struct {
20312032
((uint32_t)(ch) == 0x0130u ? 0x69u : \
20322033
(uint32_t)(ch) == 0x49u ? 0x0131u : (uint32_t)(ch))
20332034

2035+
/* UCP bitset manipulating macros. */
2036+
2037+
#ifdef SUPPORT_UNICODE
2038+
#define UCPCAT(bit) (1 << (bit))
2039+
#define UCPCAT2(bit1, bit2) (UCPCAT(bit1) | UCPCAT(bit2))
2040+
#define UCPCAT3(bit1, bit2, bit3) (UCPCAT(bit1) | UCPCAT(bit2) | UCPCAT(bit3))
2041+
#define UCPCAT_RANGE(start, end) (((1 << ((end) + 1)) - 1) - ((1 << (start)) - 1))
2042+
#define UCPCAT_L UCPCAT_RANGE(ucp_Ll, ucp_Lu)
2043+
#define UCPCAT_N UCPCAT_RANGE(ucp_Nd, ucp_No)
2044+
#define UCPCAT_ALL ((1 << (ucp_Zs + 1)) - 1)
2045+
#endif
2046+
20342047
/* The "scriptx" and bprops fields contain offsets into vectors of 32-bit words
20352048
that form a bitmap representing a list of scripts or boolean properties. These
20362049
macros test or set a bit in the map by number. */

src/pcre2_intmodedep.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -733,6 +733,9 @@ typedef struct class_ranges {
733733
struct class_ranges *next; /* Next class ranges */
734734
size_t char_lists_size; /* Total size of encoded char lists */
735735
size_t char_lists_start; /* Start offset of encoded char lists */
736+
#ifdef SUPPORT_UNICODE
737+
uint32_t category_list; /* Bitset of matching Unicode categories. */
738+
#endif
736739
uint16_t range_list_size; /* Size of ranges array */
737740
uint16_t char_lists_types; /* The XCL_LIST header of char lists */
738741
/* Followed by the list of ranges (start/end pairs) */

src/pcre2_jit_char_inc.h

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,13 @@ if (flags & XCL_MAP)
531531
cc += 32 / sizeof(PCRE2_UCHAR);
532532

533533
#ifdef SUPPORT_UNICODE
534+
if (flags & XCL_HASCATLIST)
535+
{
536+
memcpy(&category_list, cc, sizeof(uint32_t));
537+
status |= XCLASS_HAS_TYPE;
538+
cc += sizeof(uint32_t) / sizeof(PCRE2_UCHAR);
539+
}
540+
534541
while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
535542
{
536543
compares++;
@@ -545,7 +552,7 @@ while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
545552
break;
546553

547554
case PT_GC:
548-
items = UCPCAT_RANGE(PRIV(ucp_typerange)[(int)cc[1] * 2], PRIV(ucp_typerange)[(int)cc[1] * 2 + 1]);
555+
items = UCPCAT_RANGE(PRIV(ucp_typerange)[(int)cc[1]], PRIV(ucp_typerange)[(int)cc[1] + 1] - 1);
549556
break;
550557

551558
case PT_PC:
@@ -612,21 +619,7 @@ while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
612619
cc += 2;
613620
}
614621

615-
if (category_list == UCPCAT_ALL)
616-
{
617-
/* All or no characters are accepted, same as dotall. */
618-
if (status & XCLASS_IS_ECLASS)
619-
{
620-
if (list != backtracks)
621-
OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
622-
return;
623-
}
624-
625-
compile_char1_matchingpath(common, OP_ALLANY, cc, backtracks, FALSE);
626-
if (list == backtracks)
627-
add_jump(compiler, backtracks, JUMP(SLJIT_JUMP));
628-
return;
629-
}
622+
SLJIT_ASSERT(category_list != UCPCAT_ALL);
630623

631624
if (category_list != 0)
632625
compares++;
@@ -679,6 +672,9 @@ if ((flags & XCL_MAP) != 0)
679672
}
680673

681674
#ifdef SUPPORT_UNICODE
675+
if (flags & XCL_HASCATLIST)
676+
cc += sizeof(uint32_t) / sizeof(PCRE2_UCHAR);
677+
682678
if (status & XCLASS_NEEDS_UCD)
683679
{
684680
if ((status & (XCLASS_SAVE_CHAR | XCLASS_IS_ECLASS)) == XCLASS_SAVE_CHAR)

src/pcre2_jit_compile.c

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7007,16 +7007,6 @@ else
70077007
JUMPTO(SLJIT_JUMP, mainloop);
70087008
}
70097009

7010-
#ifdef SUPPORT_UNICODE
7011-
#define UCPCAT(bit) (1 << (bit))
7012-
#define UCPCAT2(bit1, bit2) (UCPCAT(bit1) | UCPCAT(bit2))
7013-
#define UCPCAT3(bit1, bit2, bit3) (UCPCAT(bit1) | UCPCAT(bit2) | UCPCAT(bit3))
7014-
#define UCPCAT_RANGE(start, end) (((1 << ((end) + 1)) - 1) - ((1 << (start)) - 1))
7015-
#define UCPCAT_L UCPCAT_RANGE(ucp_Ll, ucp_Lu)
7016-
#define UCPCAT_N UCPCAT_RANGE(ucp_Nd, ucp_No)
7017-
#define UCPCAT_ALL ((1 << (ucp_Zs + 1)) - 1)
7018-
#endif
7019-
70207010
static void check_wordboundary(compiler_common *common, BOOL ucp)
70217011
{
70227012
DEFINE_COMPILER;

0 commit comments

Comments
 (0)