fix: regex for multi-languages

cy-moi · cy-moi · commit 1de71452fbaa · 2025-06-25T14:34:53.000+02:00
diff --git a/packages/rum-core/src/domain/action/actionCollection.spec.ts b/packages/rum-core/src/domain/action/actionCollection.spec.ts
@@ -207,7 +207,7 @@ describe('actionCollection', () => {
         type: ActionType.CUSTOM,
       })
 
-      expect((rawRumEvents[0].rawRumEvent as RawRumActionEvent).action.target.name).toBe('foo bar ***')
+      expect((rawRumEvents[0].rawRumEvent as RawRumActionEvent).action.target.name).toBe('foo bar xxx')
       expect((rawRumEvents[0].rawRumEvent as RawRumActionEvent)._dd?.action?.name_source).toBe('mask_disallowed')
     })
 
@@ -229,7 +229,7 @@ describe('actionCollection', () => {
         startClocks: { relative: 0 as RelativeTime, timeStamp: 0 as TimeStamp },
         type: ActionType.CLICK,
       })
-      expect((rawRumEvents[0].rawRumEvent as RawRumActionEvent).action.target.name).toBe('foo bar ***')
+      expect((rawRumEvents[0].rawRumEvent as RawRumActionEvent).action.target.name).toBe('foo bar xxx')
       expect((rawRumEvents[0].rawRumEvent as RawRumActionEvent)._dd?.action?.name_source).toBe('mask_disallowed')
     })
   })
diff --git a/packages/rum-core/src/domain/action/privacy/allowedDictionary.spec.ts b/packages/rum-core/src/domain/action/privacy/allowedDictionary.spec.ts
@@ -6,6 +6,129 @@ const TEST_STRINGS = {
   PARAGRAPH_MIXED: 'This is a test paragraph with various symbols: 💥, $$$, 123, and more.',
 }
 
+const LANGUAGES_TEST_STRINGS = {
+  FRENCH_MIXED_SENTENCE: "C'est un test avec des mots français et des symboles: 💥, $$$, 123, et plus. Bonjour!",
+  SPANISH_MIXED_SENTENCE: 'Este es un test con palabras en español y símbolos: 💥, $$$, 123, y más. ¡Hola!',
+  GERMAN_MIXED_SENTENCE: 'Das ist ein Test mit deutschen Wörtern und Symbolen: 💥, $$$, 123, und mehr. Hallo!',
+  ITALIAN_MIXED_SENTENCE: 'Questo è un test con parole in italiano e simboli: 💥, $$$, 123, e altro. Ciao!',
+  PORTUGUESE_MIXED_SENTENCE: 'Este é um teste com palavras em português e símbolos: 💥, $$$, 123, e mais. Olá!',
+}
+
+describe('getMatchRegex', () => {
+  it('should handle emojis when Browser supports unicode regex', () => {
+    const matchRegex = getMatchRegex()
+    if (matchRegex.flags.includes('gu')) {
+      const paragraphMixedMatches = TEST_STRINGS.PARAGRAPH_MIXED.match(matchRegex)
+      expect(paragraphMixedMatches).toContain('💥')
+      expect(paragraphMixedMatches).toContain('$$$')
+      expect(paragraphMixedMatches).toContain('123')
+    }
+  })
+
+  /**
+   * This test ensures that the match regex works as expected in all browsers.
+   * With unicode regex, we can support symbols and emojis OOTB.
+   * But in older versions of browsers, we need to use a minimal fallback regex which does
+   * not support many symbols, to avoid bloating the bundle size.
+   *
+   * Only French, Spanish, German and Portuguese sentences are asserted here.
+   * We can't test Russian because the fallback regex only matches a-z and à-ÿ (case-insensitive).
+   * Asian languages are not supported by our current tokenizer strategy.
+   */
+  it('MATCH_REGEX matches words and symbols in TEST_STRINGS', () => {
+    const complexMixedMatches = TEST_STRINGS.COMPLEX_MIXED.match(getMatchRegex())
+    const expectedComplexMixed = ['test', 'user', 'name', 'test', 'user', 'id', 'hello', 'world']
+    expectedComplexMixed.forEach((expected) => {
+      expect(complexMixedMatches).toContain(expected)
+    })
+
+    const paragraphMixedMatches = TEST_STRINGS.PARAGRAPH_MIXED.match(getMatchRegex())
+    const expectedParagraphMixed = ['This', 'is', 'a', 'test', 'paragraph', 'with', 'various', 'symbols', 'and', 'more']
+    expectedParagraphMixed.forEach((expected) => {
+      expect(paragraphMixedMatches).toContain(expected)
+    })
+    const frenchMatches = LANGUAGES_TEST_STRINGS.FRENCH_MIXED_SENTENCE.match(getMatchRegex())
+    const expectedFrench = [
+      'C',
+      'est',
+      'un',
+      'test',
+      'avec',
+      'des',
+      'mots',
+      'français',
+      'et',
+      'des',
+      'symboles',
+      'et',
+      'plus',
+      'Bonjour',
+    ]
+    expectedFrench.forEach((expected) => {
+      expect(frenchMatches).toContain(expected)
+    })
+
+    const spanishMatches = LANGUAGES_TEST_STRINGS.SPANISH_MIXED_SENTENCE.match(getMatchRegex())
+    const expectedSpanish = [
+      'Este',
+      'es',
+      'un',
+      'test',
+      'con',
+      'palabras',
+      'en',
+      'español',
+      'y',
+      'símbolos',
+      'y',
+      'más',
+      'Hola',
+    ]
+    expectedSpanish.forEach((expected) => {
+      expect(spanishMatches).toContain(expected)
+    })
+
+    const germanMatches = LANGUAGES_TEST_STRINGS.GERMAN_MIXED_SENTENCE.match(getMatchRegex())
+    const expectedGerman = [
+      'Das',
+      'ist',
+      'ein',
+      'Test',
+      'mit',
+      'deutschen',
+      'Wörtern',
+      'und',
+      'Symbolen',
+      'und',
+      'mehr',
+      'Hallo',
+    ]
+    expectedGerman.forEach((expected) => {
+      expect(germanMatches).toContain(expected)
+    })
+
+    const portugueseMatches = LANGUAGES_TEST_STRINGS.PORTUGUESE_MIXED_SENTENCE.match(getMatchRegex())
+    const expectedPortuguese = [
+      'Este',
+      'é',
+      'um',
+      'teste',
+      'com',
+      'palavras',
+      'em',
+      'português',
+      'e',
+      'símbolos',
+      'e',
+      'mais',
+      'Olá',
+    ]
+    expectedPortuguese.forEach((expected) => {
+      expect(portugueseMatches).toContain(expected)
+    })
+  })
+})
+
 describe('createActionAllowList', () => {
   beforeAll(() => {
     window.$DD_ALLOW = new Set([TEST_STRINGS.COMPLEX_MIXED, TEST_STRINGS.PARAGRAPH_MIXED])
@@ -17,7 +140,7 @@ describe('createActionAllowList', () => {
 
   it('should create an action name dictionary', () => {
     const actionNameDictionary = createActionAllowList()
-    expect(actionNameDictionary.allowlist.size).toBe(20)
+    expect(actionNameDictionary.allowlist.size).toBeGreaterThan(0)
     expect(actionNameDictionary.rawStringIterator).toBeDefined()
   })
 
@@ -48,41 +171,14 @@ describe('actionNameDictionary processing', () => {
     clearActionNameDictionary()
   })
 
-  it('MATCH_REGEX matches words and symbols in TEST_STRINGS', () => {
-    expect(TEST_STRINGS.COMPLEX_MIXED.match(getMatchRegex())).toEqual(
-      jasmine.arrayContaining(['test', 'user', 'name', '💥$$$', 'test', 'user', 'id', 'hello', '>=42', 'world'])
-    )
-    expect(TEST_STRINGS.PARAGRAPH_MIXED.match(getMatchRegex())).toEqual(
-      jasmine.arrayContaining([
-        'This',
-        'is',
-        'a',
-        'test',
-        'paragraph',
-        'with',
-        'various',
-        'symbols',
-        '💥',
-        '$$$',
-        '123',
-        'and',
-        'more',
-      ])
-    )
-  })
-
   it('initializes allowlist with normalized words from $DD_ALLOW', () => {
-    // EMOJI and EMOJI_WITH_NUMBERS
-    expect(actionNameDictionary.allowlist.has('123')).toBeTrue()
-    // COMPLEX_MIXED
     expect(actionNameDictionary.allowlist.has('test')).toBeTrue()
     expect(actionNameDictionary.allowlist.has('hello')).toBeTrue()
-    expect(actionNameDictionary.allowlist.has('>=42')).toBeTrue()
     expect(actionNameDictionary.allowlist.has('world')).toBeTrue()
   })
 
   it('updates dictionary when $DD_ALLOW changes', () => {
-    expect(actionNameDictionary.allowlist.size).toBe(20)
+    const initialAllowlistSize = actionNameDictionary.allowlist.size
 
     // Simulate a change in $DD_ALLOW
     window.$DD_ALLOW?.add('new-Word')
@@ -95,7 +191,7 @@ describe('actionNameDictionary processing', () => {
     expect(actionNameDictionary.allowlist.has('new')).toBeTrue()
     expect(actionNameDictionary.allowlist.has('another')).toBeTrue()
     // Old words should still be present
-    expect(actionNameDictionary.allowlist.size).toBe(23)
+    expect(actionNameDictionary.allowlist.size).toBe(initialAllowlistSize + 3)
   })
 })
 
@@ -122,12 +218,23 @@ describe('maskActionName', () => {
   })
 
   it('masks words not in allowlist (with dictionary from $DD_ALLOW)', () => {
+    const matchRegex = getMatchRegex()
+    let expected = 'test-💥-xxxxxx-xxx'
+    if (!matchRegex.flags.includes('gu')) {
+      expected = 'test-💥-$>xxxx-xxx'
+    }
+
     const testString1 = maskActionName('test-💥-$>=123-pii', actionNameDictionary.allowlist)
     expect(testString1.masked).toBeTrue()
-    expect(testString1.name).toBe('test-💥-***-***')
+    expect(testString1.name).toBe(expected)
+
+    expected = 'test-xxxxxx*hello xxxx'
+    if (!matchRegex.flags.includes('gu')) {
+      expected = 'test-💥xxxx*hello xxxx'
+    }
     const testString2 = maskActionName('test-💥+123*hello wild', actionNameDictionary.allowlist)
     expect(testString2.masked).toBeTrue()
-    expect(testString2.name).toBe('test-****hello ***')
+    expect(testString2.name).toBe(expected)
   })
 
   it('handles empty string', () => {
diff --git a/packages/rum-core/src/domain/action/privacy/allowedDictionary.ts b/packages/rum-core/src/domain/action/privacy/allowedDictionary.ts
@@ -1,4 +1,4 @@
-import { CENSORED_STRING_MARK } from '../../privacy'
+import { TEXT_MASKING_CHAR } from '../../privacy'
 
 declare global {
   interface Window {
@@ -13,8 +13,8 @@ export function getMatchRegex(): RegExp {
     try {
       matchRegex = new RegExp('\\p{Letter}+|[\\p{Symbol}\\p{Number}]+', 'gu')
     } catch {
-      // Fallback to support european letters and apostrophes
-      matchRegex = /(?:(?![×Þß÷þø])[a-zÀ-ÿ’])+|(?:(?!(?:(?![×Þß÷þø])[a-zÀ-ÿ’]))[^\s])+/gi
+      // Fallback: runs of a-z / à-ÿ characters (case-insensitive), or one non-space, non-word character followed by digits
+      matchRegex = /[a-zà-ÿ]+|[^\s\w][0-9]+/gi
     }
   }
   return matchRegex
@@ -109,7 +109,7 @@ export function maskActionName(
     name: name.replace(getMatchRegex(), (word: string) => {
       if (!processedAllowlist.has(word.toLocaleLowerCase())) {
         masked = true
-        return CENSORED_STRING_MARK
+        return TEXT_MASKING_CHAR.repeat(word.length)
       }
       return word
     }),
diff --git a/packages/rum-core/src/domain/privacy.ts b/packages/rum-core/src/domain/privacy.ts
@@ -35,7 +35,7 @@ export const FORM_PRIVATE_TAG_NAMES: { [tagName: string]: true } = {
   OPTGROUP: true,
 }
 
-const TEXT_MASKING_CHAR = 'x'
+export const TEXT_MASKING_CHAR = 'x'
 
 export type NodePrivacyLevelCache = Map<Node, NodePrivacyLevel>
 

Original file line number	Diff line number	Diff line change
`@@ -35,7 +35,7 @@ export const FORM_PRIVATE_TAG_NAMES: { [tagName: string]: true } = {`
`35`	`35`	`OPTGROUP: true,`
`36`	`36`	`}`
`37`	`37`
`38`		`-const TEXT_MASKING_CHAR = 'x'`
	`38`	`+export const TEXT_MASKING_CHAR = 'x'`
`39`	`39`
`40`	`40`	`export type NodePrivacyLevelCache = Map<Node, NodePrivacyLevel>`
`41`	`41`