@@ -116,10 +116,11 @@ while (TRUE)
116116
117117#ifdef  SUPPORT_UNICODE 
118118
119- #define  PARSE_CLASS_UTF                0x1
120- #define  PARSE_CLASS_CASELESS_UTF       0x2
121- #define  PARSE_CLASS_RESTRICTED_UTF     0x4
122- #define  PARSE_CLASS_TURKISH_UTF        0x8
119+ #define  PARSE_CLASS_UTF                0x01
120+ #define  PARSE_CLASS_CASELESS_UTF       0x02
121+ #define  PARSE_CLASS_RESTRICTED_UTF     0x04
122+ #define  PARSE_CLASS_TURKISH_UTF        0x08
123+ #define  PARSE_CLASS_COMPUTE_CATLIST    0x10
123124
124125/* Get the range of nocase characters which includes the 
125126'c' character passed as argument, or directly follows 'c'. */ 
@@ -358,13 +359,28 @@ append_non_ascii_range(uint32_t options, uint32_t *buffer)
358359  return  buffer  +  2 ;
359360}
360361
362+ /* The buffer may represent the category list pointer when utf is enabled. */ 
361363static  size_t 
362364parse_class (uint32_t  * ptr , uint32_t  options , uint32_t  * buffer )
363365{
364366size_t  total_size  =  0 ;
365367size_t  size ;
366368uint32_t  meta_arg ;
367369uint32_t  start_char ;
370+ uint32_t  ptype ;
371+ #ifdef  SUPPORT_UNICODE 
372+ uint32_t  pdata ;
373+ uint32_t  category_list ;
374+ uint32_t  * pcategory_list  =  NULL ;
375+ #endif 
376+ 
377+ #ifdef  SUPPORT_UNICODE 
378+ if  ((options  &  PARSE_CLASS_COMPUTE_CATLIST ) !=  0 )
379+   {
380+   pcategory_list  =  buffer ;
381+   buffer  =  NULL ;
382+   }
383+ #endif 
368384
369385while  (TRUE)
370386  {
@@ -408,7 +424,8 @@ while (TRUE)
408424        case  ESC_p :
409425        case  ESC_P :
410426        ptr ++ ;
411-         if  (meta_arg  ==  ESC_p  &&  (* ptr  >> 16 ) ==  PT_ANY )
427+         ptype  =  (* ptr  >> 16 );
428+         if  (meta_arg  ==  ESC_p  &&  ptype  ==  PT_ANY )
412429          {
413430          if  (buffer  !=  NULL )
414431            {
@@ -418,6 +435,43 @@ while (TRUE)
418435            }
419436          total_size  +=  2 ;
420437          }
438+ #ifdef  SUPPORT_UNICODE 
439+         if  (pcategory_list  ==  NULL ) break ;
440+ 
441+         category_list  =  0 ;
442+ 
443+         switch (ptype )
444+           {
445+           case  PT_LAMP :
446+           category_list  =  UCPCAT3 (ucp_Lu , ucp_Ll , ucp_Lt );
447+           break ;
448+ 
449+           case  PT_GC :
450+           pdata  =  * ptr  &  0xffff ;
451+           category_list  =  UCPCAT_RANGE (PRIV (ucp_typerange )[pdata ],
452+                                        PRIV (ucp_typerange )[pdata  +  1 ] -  1 );
453+           break ;
454+ 
455+           case  PT_PC :
456+           pdata  =  * ptr  &  0xffff ;
457+           category_list  =  UCPCAT (pdata );
458+           break ;
459+ 
460+           case  PT_WORD :
461+           category_list  =  UCPCAT2 (ucp_Mn , ucp_Pc ) | UCPCAT_L  | UCPCAT_N ;
462+           break ;
463+ 
464+           case  PT_ALNUM :
465+           category_list  =  UCPCAT_L  | UCPCAT_N ;
466+           break ;
467+           }
468+ 
469+         if  (category_list  >  0 )
470+           {
471+           if  (meta_arg  ==  ESC_P ) category_list  ^= UCPCAT_ALL ;
472+           * pcategory_list  |= category_list ;
473+           }
474+ #endif 
421475        break ;
422476        }
423477      ptr ++ ;
@@ -512,6 +566,9 @@ const uint32_t *char_list_next;
512566uint16_t  * next_char ;
513567uint32_t  char_list_start , char_list_end ;
514568uint32_t  range_start , range_end ;
569+ #ifdef  SUPPORT_UNICODE 
570+ uint32_t  category_list  =  0 ;
571+ #endif 
515572
516573#ifdef  SUPPORT_UNICODE 
517574if  (options  &  PCRE2_UTF )
@@ -529,11 +586,21 @@ if (xoptions & PCRE2_EXTRA_TURKISH_CASING)
529586
530587/* Compute required space for the range. */ 
531588
589+ #ifdef  SUPPORT_UNICODE 
590+ range_list_size  =  parse_class (start_ptr ,
591+                               class_options  | PARSE_CLASS_COMPUTE_CATLIST ,
592+                               & category_list );
593+ #else 
532594range_list_size  =  parse_class (start_ptr , class_options , NULL );
595+ #endif 
533596PCRE2_ASSERT ((range_list_size  &  0x1 ) ==  0 );
534597
535598/* Allocate buffer. The total_size also represents the end of the buffer. */ 
536599
600+ #ifdef  SUPPORT_UNICODE 
601+ if  (category_list  ==  UCPCAT_ALL ) range_list_size  =  2 ;
602+ #endif 
603+ 
537604total_size  =  range_list_size  + 
538605   ((range_list_size  >= 2 ) ? CHAR_LIST_EXTRA_SIZE  : 0 );
539606
@@ -548,6 +615,21 @@ cranges->range_list_size = (uint16_t)range_list_size;
548615cranges -> char_lists_types  =  0 ;
549616cranges -> char_lists_size  =  0 ;
550617cranges -> char_lists_start  =  0 ;
618+ #ifdef  SUPPORT_UNICODE 
619+ cranges -> category_list  =  category_list ;
620+ #endif 
621+ 
622+ #ifdef  SUPPORT_UNICODE 
623+ if  (category_list  ==  UCPCAT_ALL )
624+   {
625+   /* Replace the xclass with OP_ALLANY. */ 
626+   cranges -> category_list  =  0 ;
627+   buffer  =  (uint32_t * )(cranges  +  1 );
628+   buffer [0 ] =  0 ;
629+   buffer [1 ] =  get_highest_char (options );
630+   return  cranges ;
631+   }
632+ #endif 
551633
552634if  (range_list_size  ==  0 ) return  cranges ;
553635
@@ -1042,6 +1124,7 @@ BOOL utf = FALSE;
10421124
10431125#ifdef  SUPPORT_WIDE_CHARS 
10441126uint32_t  xclass_props ;
1127+ uint32_t  category_list ;
10451128PCRE2_UCHAR  * class_uchardata ;
10461129class_ranges *  cranges ;
10471130#endif 
@@ -1058,6 +1141,7 @@ should_flip_negation = FALSE;
10581141
10591142#ifdef  SUPPORT_WIDE_CHARS 
10601143xclass_props  =  0 ;
1144+ category_list  =  0 ;
10611145
10621146#if  PCRE2_CODE_UNIT_WIDTH  ==  8 
10631147cranges  =  NULL ;
@@ -1091,6 +1175,9 @@ if (utf)
10911175    cb -> cranges  =  cranges -> next ;
10921176    }
10931177
1178+   category_list  =  cranges -> category_list ;
1179+   PCRE2_ASSERT (category_list  !=  UCPCAT_ALL );
1180+ 
10941181  if  (cranges -> range_list_size  >  0 )
10951182    {
10961183    const  uint32_t  * ranges  =  (const  uint32_t * )(cranges  +  1 );
@@ -1105,6 +1192,13 @@ if (utf)
11051192  }
11061193
11071194class_uchardata  =  code  +  LINK_SIZE  +  2 ;   /* For XCLASS items */ 
1195+ 
1196+ if  (cranges  !=  NULL  &&  category_list  !=  0  && 
1197+     (xclass_props  &  XCLASS_HIGH_ANY ) ==  0 )
1198+   {
1199+   xclass_props  |= XCLASS_REQUIRED  | XCLASS_HAS_PROPS ;
1200+   class_uchardata  +=  sizeof (uint32_t ) / sizeof (PCRE2_UCHAR );
1201+   }
11081202#endif  /* SUPPORT_WIDE_CHARS */ 
11091203
11101204/* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map 
@@ -1380,7 +1474,9 @@ while (TRUE)
13801474
13811475        PRIV (update_classbits )(ptype , pdata , (escape  ==  ESC_P ), classbits );
13821476
1383-         if  ((xclass_props  &  XCLASS_HIGH_ANY ) ==  0 )
1477+         if  ((xclass_props  &  XCLASS_HIGH_ANY ) ==  0  && 
1478+             ptype  !=  PT_LAMP  &&  ptype  !=  PT_GC  &&  ptype  !=  PT_PC  && 
1479+             ptype  !=  PT_WORD  &&  ptype  !=  PT_ALNUM )
13841480          {
13851481          if  (lengthptr  !=  NULL )
13861482            * lengthptr  +=  3 ;
@@ -1640,6 +1736,12 @@ if ((xclass_props & XCLASS_REQUIRED) != 0)
16401736  code  +=  LINK_SIZE ;
16411737  * code  =  negate_class ? XCL_NOT :0 ;
16421738  if  ((xclass_props  &  XCLASS_HAS_PROPS ) !=  0 ) * code  |= XCL_HASPROP ;
1739+   /* This should be the last one. */ 
1740+   if  (category_list  !=  0 )
1741+     {
1742+     * code  |= XCL_HASCATLIST ;
1743+     memmove (code  +  1 , & category_list , sizeof (uint32_t ));
1744+     }
16431745
16441746  /* If the map is required, move up the extra data to make room for it; 
16451747  otherwise just move the code pointer to the end of the extra data. */ 
0 commit comments