@@ -115,10 +115,11 @@ while (TRUE)
115115
116116#ifdef  SUPPORT_UNICODE 
117117
118- #define  PARSE_CLASS_UTF                0x1
119- #define  PARSE_CLASS_CASELESS_UTF       0x2
120- #define  PARSE_CLASS_RESTRICTED_UTF     0x4
121- #define  PARSE_CLASS_TURKISH_UTF        0x8
118+ #define  PARSE_CLASS_UTF                0x01
119+ #define  PARSE_CLASS_CASELESS_UTF       0x02
120+ #define  PARSE_CLASS_RESTRICTED_UTF     0x04
121+ #define  PARSE_CLASS_TURKISH_UTF        0x08
122+ #define  PARSE_CLASS_COMPUTE_CATLIST    0x10
122123
123124/* Get the range of nocase characters which includes the 
124125'c' character passed as argument, or directly follows 'c'. */ 
@@ -357,13 +358,28 @@ append_non_ascii_range(uint32_t options, uint32_t *buffer)
357358  return  buffer  +  2 ;
358359}
359360
361+ /* The buffer may represent the category list pointer when utf is enabled. */ 
360362static  size_t 
361363parse_class (uint32_t  * ptr , uint32_t  options , uint32_t  * buffer )
362364{
363365size_t  total_size  =  0 ;
364366size_t  size ;
365367uint32_t  meta_arg ;
366368uint32_t  start_char ;
369+ uint32_t  ptype ;
370+ #ifdef  SUPPORT_UNICODE 
371+ uint32_t  pdata ;
372+ uint32_t  category_list ;
373+ uint32_t  * pcategory_list  =  NULL ;
374+ #endif 
375+ 
376+ #ifdef  SUPPORT_UNICODE 
377+ if  ((options  &  PARSE_CLASS_COMPUTE_CATLIST ) !=  0 )
378+   {
379+   pcategory_list  =  buffer ;
380+   buffer  =  NULL ;
381+   }
382+ #endif 
367383
368384while  (TRUE)
369385  {
@@ -407,7 +423,8 @@ while (TRUE)
407423        case  ESC_p :
408424        case  ESC_P :
409425        ptr ++ ;
410-         if  (meta_arg  ==  ESC_p  &&  (* ptr  >> 16 ) ==  PT_ANY )
426+         ptype  =  (* ptr  >> 16 );
427+         if  (meta_arg  ==  ESC_p  &&  ptype  ==  PT_ANY )
411428          {
412429          if  (buffer  !=  NULL )
413430            {
@@ -417,6 +434,43 @@ while (TRUE)
417434            }
418435          total_size  +=  2 ;
419436          }
437+ #ifdef  SUPPORT_UNICODE 
438+         if  (pcategory_list  ==  NULL ) break ;
439+ 
440+         category_list  =  0 ;
441+ 
442+         switch (ptype )
443+           {
444+           case  PT_LAMP :
445+           category_list  =  UCPCAT3 (ucp_Lu , ucp_Ll , ucp_Lt );
446+           break ;
447+ 
448+           case  PT_GC :
449+           pdata  =  * ptr  &  0xffff ;
450+           category_list  =  UCPCAT_RANGE (PRIV (ucp_typerange )[pdata ],
451+                                        PRIV (ucp_typerange )[pdata  +  1 ] -  1 );
452+           break ;
453+ 
454+           case  PT_PC :
455+           pdata  =  * ptr  &  0xffff ;
456+           category_list  =  UCPCAT (pdata );
457+           break ;
458+ 
459+           case  PT_WORD :
460+           category_list  =  UCPCAT2 (ucp_Mn , ucp_Pc ) | UCPCAT_L  | UCPCAT_N ;
461+           break ;
462+ 
463+           case  PT_ALNUM :
464+           category_list  =  UCPCAT_L  | UCPCAT_N ;
465+           break ;
466+           }
467+ 
468+         if  (category_list  >  0 )
469+           {
470+           if  (meta_arg  ==  ESC_P ) category_list  ^= UCPCAT_ALL ;
471+           * pcategory_list  |= category_list ;
472+           }
473+ #endif 
420474        break ;
421475        }
422476      ptr ++ ;
@@ -511,6 +565,9 @@ const uint32_t *char_list_next;
511565uint16_t  * next_char ;
512566uint32_t  char_list_start , char_list_end ;
513567uint32_t  range_start , range_end ;
568+ #ifdef  SUPPORT_UNICODE 
569+ uint32_t  category_list  =  0 ;
570+ #endif 
514571
515572#ifdef  SUPPORT_UNICODE 
516573if  (options  &  PCRE2_UTF )
@@ -531,11 +588,22 @@ if (xoptions & PCRE2_EXTRA_TURKISH_CASING)
531588
532589/* Compute required space for the range. */ 
533590
591+ #ifdef  SUPPORT_UNICODE 
592+ range_list_size  =  parse_class (start_ptr ,
593+                               class_options  | PARSE_CLASS_COMPUTE_CATLIST ,
594+                               & category_list );
595+ #else 
534596range_list_size  =  parse_class (start_ptr , class_options , NULL );
597+ #endif 
535598PCRE2_ASSERT ((range_list_size  &  0x1 ) ==  0 );
536599
537600/* Allocate buffer. The total_size also represents the end of the buffer. */ 
538601
602+ #ifdef  SUPPORT_UNICODE 
603+ /* Replaced by an OP_ALLANY. */ 
604+ if  (category_list  ==  UCPCAT_ALL ) range_list_size  =  2 ;
605+ #endif 
606+ 
539607total_size  =  range_list_size  + 
540608   ((range_list_size  >= 2 ) ? CHAR_LIST_EXTRA_SIZE  : 0 );
541609
@@ -553,6 +621,21 @@ cranges->range_list_size = (uint16_t)range_list_size;
553621cranges -> char_lists_types  =  0 ;
554622cranges -> char_lists_size  =  0 ;
555623cranges -> char_lists_start  =  0 ;
624+ #ifdef  SUPPORT_UNICODE 
625+ cranges -> category_list  =  category_list ;
626+ #endif 
627+ 
628+ #ifdef  SUPPORT_UNICODE 
629+ if  (category_list  ==  UCPCAT_ALL )
630+   {
631+   /* Replace the xclass with OP_ALLANY. */ 
632+   cranges -> category_list  =  0 ;
633+   buffer  =  (uint32_t * )(cranges  +  1 );
634+   buffer [0 ] =  0 ;
635+   buffer [1 ] =  get_highest_char (class_options );
636+   return  cranges ;
637+   }
638+ #endif 
556639
557640if  (range_list_size  ==  0 ) return  cranges ;
558641
@@ -1087,6 +1170,7 @@ BOOL utf = FALSE;
10871170
10881171#ifdef  SUPPORT_WIDE_CHARS 
10891172uint32_t  xclass_props ;
1173+ uint32_t  category_list ;
10901174PCRE2_UCHAR  * class_uchardata ;
10911175class_ranges *  cranges ;
10921176#else 
@@ -1107,6 +1191,7 @@ should_flip_negation = FALSE;
11071191
11081192#ifdef  SUPPORT_WIDE_CHARS 
11091193xclass_props  =  0 ;
1194+ category_list  =  0 ;
11101195
11111196#if  PCRE2_CODE_UNIT_WIDTH  ==  8 
11121197cranges  =  NULL ;
@@ -1140,6 +1225,9 @@ if (utf)
11401225    cb -> first_data  =  cranges -> header .next ;
11411226    }
11421227
1228+   category_list  =  cranges -> category_list ;
1229+   PCRE2_ASSERT (category_list  !=  UCPCAT_ALL );
1230+ 
11431231  if  (cranges -> range_list_size  >  0 )
11441232    {
11451233    const  uint32_t  * ranges  =  (const  uint32_t * )(cranges  +  1 );
@@ -1154,6 +1242,13 @@ if (utf)
11541242  }
11551243
11561244class_uchardata  =  code  +  LINK_SIZE  +  2 ;   /* For XCLASS items */ 
1245+ 
1246+ if  (cranges  !=  NULL  &&  category_list  !=  0  && 
1247+     (xclass_props  &  XCLASS_HIGH_ANY ) ==  0 )
1248+   {
1249+   xclass_props  |= XCLASS_REQUIRED  | XCLASS_HAS_PROPS ;
1250+   class_uchardata  +=  sizeof (uint32_t ) / sizeof (PCRE2_UCHAR );
1251+   }
11571252#endif  /* SUPPORT_WIDE_CHARS */ 
11581253
11591254/* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map 
@@ -1444,7 +1539,9 @@ while (TRUE)
14441539
14451540        PRIV (update_classbits )(ptype , pdata , (escape  ==  ESC_P ), classbits );
14461541
1447-         if  ((xclass_props  &  XCLASS_HIGH_ANY ) ==  0 )
1542+         if  ((xclass_props  &  XCLASS_HIGH_ANY ) ==  0  && 
1543+             ptype  !=  PT_LAMP  &&  ptype  !=  PT_GC  &&  ptype  !=  PT_PC  && 
1544+             ptype  !=  PT_WORD  &&  ptype  !=  PT_ALNUM )
14481545          {
14491546          if  (lengthptr  !=  NULL )
14501547            * lengthptr  +=  3 ;
@@ -1709,6 +1806,15 @@ if ((xclass_props & XCLASS_REQUIRED) != 0)
17091806  * code  =  negate_class ? XCL_NOT :0 ;
17101807  if  ((xclass_props  &  XCLASS_HAS_PROPS ) !=  0 ) * code  |= XCL_HASPROP ;
17111808
1809+   /* The category_list is placed after the class feature bitset. 
1810+   The code pointer is not increased, because the bitset for the 
1811+   first 256 characters may be injected after the feature bitset. */ 
1812+   if  (category_list  !=  0 )
1813+     {
1814+     * code  |= XCL_HASCATLIST ;
1815+     memmove (code  +  1 , & category_list , sizeof (uint32_t ));
1816+     }
1817+ 
17121818  /* If the map is required, move up the extra data to make room for it; 
17131819  otherwise just move the code pointer to the end of the extra data. */ 
17141820
0 commit comments