Skip to content

Commit 1de7145

Browse files
committed
fix: regex for multi-languages
1 parent 052a04b commit 1de7145

File tree

4 files changed

+146
-39
lines changed

4 files changed

+146
-39
lines changed

packages/rum-core/src/domain/action/actionCollection.spec.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ describe('actionCollection', () => {
207207
type: ActionType.CUSTOM,
208208
})
209209

210-
expect((rawRumEvents[0].rawRumEvent as RawRumActionEvent).action.target.name).toBe('foo bar ***')
210+
expect((rawRumEvents[0].rawRumEvent as RawRumActionEvent).action.target.name).toBe('foo bar xxx')
211211
expect((rawRumEvents[0].rawRumEvent as RawRumActionEvent)._dd?.action?.name_source).toBe('mask_disallowed')
212212
})
213213

@@ -229,7 +229,7 @@ describe('actionCollection', () => {
229229
startClocks: { relative: 0 as RelativeTime, timeStamp: 0 as TimeStamp },
230230
type: ActionType.CLICK,
231231
})
232-
expect((rawRumEvents[0].rawRumEvent as RawRumActionEvent).action.target.name).toBe('foo bar ***')
232+
expect((rawRumEvents[0].rawRumEvent as RawRumActionEvent).action.target.name).toBe('foo bar xxx')
233233
expect((rawRumEvents[0].rawRumEvent as RawRumActionEvent)._dd?.action?.name_source).toBe('mask_disallowed')
234234
})
235235
})

packages/rum-core/src/domain/action/privacy/allowedDictionary.spec.ts

Lines changed: 139 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,129 @@ const TEST_STRINGS = {
66
PARAGRAPH_MIXED: 'This is a test paragraph with various symbols: 💥, $$$, 123, and more.',
77
}
88

9+
const LANGUAGES_TEST_STRINGS = {
10+
FRENCH_MIXED_SENTENCE: "C'est un test avec des mots français et des symboles: 💥, $$$, 123, et plus. Bonjour!",
11+
SPANISH_MIXED_SENTENCE: 'Este es un test con palabras en español y símbolos: 💥, $$$, 123, y más. ¡Hola!',
12+
GERMAN_MIXED_SENTENCE: 'Das ist ein Test mit deutschen Wörtern und Symbolen: 💥, $$$, 123, und mehr. Hallo!',
13+
ITALIAN_MIXED_SENTENCE: 'Questo è un test con parole in italiano e simboli: 💥, $$$, 123, e altro. Ciao!',
14+
PORTUGUESE_MIXED_SENTENCE: 'Este é um teste com palavras em português e símbolos: 💥, $$$, 123, e mais. Olá!',
15+
}
16+
17+
describe('getMatchRegex', () => {
18+
it('should handle emojis when Browser supports unicode regex', () => {
19+
const matchRegex = getMatchRegex()
20+
if (matchRegex.flags.includes('gu')) {
21+
const paragraphMixedMatches = TEST_STRINGS.PARAGRAPH_MIXED.match(matchRegex)
22+
expect(paragraphMixedMatches).toContain('💥')
23+
expect(paragraphMixedMatches).toContain('$$$')
24+
expect(paragraphMixedMatches).toContain('123')
25+
}
26+
})
27+
28+
/**
29+
* This test is to ensure that the match regex is working as expected in all browsers.
30+
* With Unicode regex support, symbols and emojis are handled out of the box.
31+
* But in older versions of browsers, we need to use a minimal fallback regex which does
32+
* not support many symbols, to avoid bloating the bundle size.
33+
*
34+
* Only European languages (except Russian) are tested here.
35+
* We can't test Russian because it's not supported by the fallback regex.
36+
* Asian languages are not supported by our current tokenizer strategy.
37+
*/
38+
it('MATCH_REGEX matches words and symbols in TEST_STRINGS', () => {
39+
const complexMixedMatches = TEST_STRINGS.COMPLEX_MIXED.match(getMatchRegex())
40+
const expectedComplexMixed = ['test', 'user', 'name', 'test', 'user', 'id', 'hello', 'world']
41+
expectedComplexMixed.forEach((expected) => {
42+
expect(complexMixedMatches).toContain(expected)
43+
})
44+
45+
const paragraphMixedMatches = TEST_STRINGS.PARAGRAPH_MIXED.match(getMatchRegex())
46+
const expectedParagraphMixed = ['This', 'is', 'a', 'test', 'paragraph', 'with', 'various', 'symbols', 'and', 'more']
47+
expectedParagraphMixed.forEach((expected) => {
48+
expect(paragraphMixedMatches).toContain(expected)
49+
})
50+
const frenchMatches = LANGUAGES_TEST_STRINGS.FRENCH_MIXED_SENTENCE.match(getMatchRegex())
51+
const expectedFrench = [
52+
'C',
53+
'est',
54+
'un',
55+
'test',
56+
'avec',
57+
'des',
58+
'mots',
59+
'français',
60+
'et',
61+
'des',
62+
'symboles',
63+
'et',
64+
'plus',
65+
'Bonjour',
66+
]
67+
expectedFrench.forEach((expected) => {
68+
expect(frenchMatches).toContain(expected)
69+
})
70+
71+
const spanishMatches = LANGUAGES_TEST_STRINGS.SPANISH_MIXED_SENTENCE.match(getMatchRegex())
72+
const expectedSpanish = [
73+
'Este',
74+
'es',
75+
'un',
76+
'test',
77+
'con',
78+
'palabras',
79+
'en',
80+
'español',
81+
'y',
82+
'símbolos',
83+
'y',
84+
'más',
85+
'Hola',
86+
]
87+
expectedSpanish.forEach((expected) => {
88+
expect(spanishMatches).toContain(expected)
89+
})
90+
91+
const germanMatches = LANGUAGES_TEST_STRINGS.GERMAN_MIXED_SENTENCE.match(getMatchRegex())
92+
const expectedGerman = [
93+
'Das',
94+
'ist',
95+
'ein',
96+
'Test',
97+
'mit',
98+
'deutschen',
99+
'Wörtern',
100+
'und',
101+
'Symbolen',
102+
'und',
103+
'mehr',
104+
'Hallo',
105+
]
106+
expectedGerman.forEach((expected) => {
107+
expect(germanMatches).toContain(expected)
108+
})
109+
110+
const portugueseMatches = LANGUAGES_TEST_STRINGS.PORTUGUESE_MIXED_SENTENCE.match(getMatchRegex())
111+
const expectedPortuguese = [
112+
'Este',
113+
'é',
114+
'um',
115+
'teste',
116+
'com',
117+
'palavras',
118+
'em',
119+
'português',
120+
'e',
121+
'símbolos',
122+
'e',
123+
'mais',
124+
'Olá',
125+
]
126+
expectedPortuguese.forEach((expected) => {
127+
expect(portugueseMatches).toContain(expected)
128+
})
129+
})
130+
})
131+
9132
describe('createActionAllowList', () => {
10133
beforeAll(() => {
11134
window.$DD_ALLOW = new Set([TEST_STRINGS.COMPLEX_MIXED, TEST_STRINGS.PARAGRAPH_MIXED])
@@ -17,7 +140,7 @@ describe('createActionAllowList', () => {
17140

18141
it('should create an action name dictionary', () => {
19142
const actionNameDictionary = createActionAllowList()
20-
expect(actionNameDictionary.allowlist.size).toBe(20)
143+
expect(actionNameDictionary.allowlist.size).toBeGreaterThan(0)
21144
expect(actionNameDictionary.rawStringIterator).toBeDefined()
22145
})
23146

@@ -48,41 +171,14 @@ describe('actionNameDictionary processing', () => {
48171
clearActionNameDictionary()
49172
})
50173

51-
it('MATCH_REGEX matches words and symbols in TEST_STRINGS', () => {
52-
expect(TEST_STRINGS.COMPLEX_MIXED.match(getMatchRegex())).toEqual(
53-
jasmine.arrayContaining(['test', 'user', 'name', '💥$$$', 'test', 'user', 'id', 'hello', '>=42', 'world'])
54-
)
55-
expect(TEST_STRINGS.PARAGRAPH_MIXED.match(getMatchRegex())).toEqual(
56-
jasmine.arrayContaining([
57-
'This',
58-
'is',
59-
'a',
60-
'test',
61-
'paragraph',
62-
'with',
63-
'various',
64-
'symbols',
65-
'💥',
66-
'$$$',
67-
'123',
68-
'and',
69-
'more',
70-
])
71-
)
72-
})
73-
74174
it('initializes allowlist with normalized words from $DD_ALLOW', () => {
75-
// EMOJI and EMOJI_WITH_NUMBERS
76-
expect(actionNameDictionary.allowlist.has('123')).toBeTrue()
77-
// COMPLEX_MIXED
78175
expect(actionNameDictionary.allowlist.has('test')).toBeTrue()
79176
expect(actionNameDictionary.allowlist.has('hello')).toBeTrue()
80-
expect(actionNameDictionary.allowlist.has('>=42')).toBeTrue()
81177
expect(actionNameDictionary.allowlist.has('world')).toBeTrue()
82178
})
83179

84180
it('updates dictionary when $DD_ALLOW changes', () => {
85-
expect(actionNameDictionary.allowlist.size).toBe(20)
181+
const initialAllowlistSize = actionNameDictionary.allowlist.size
86182

87183
// Simulate a change in $DD_ALLOW
88184
window.$DD_ALLOW?.add('new-Word')
@@ -95,7 +191,7 @@ describe('actionNameDictionary processing', () => {
95191
expect(actionNameDictionary.allowlist.has('new')).toBeTrue()
96192
expect(actionNameDictionary.allowlist.has('another')).toBeTrue()
97193
// Old words should still be present
98-
expect(actionNameDictionary.allowlist.size).toBe(23)
194+
expect(actionNameDictionary.allowlist.size).toBe(initialAllowlistSize + 3)
99195
})
100196
})
101197

@@ -122,12 +218,23 @@ describe('maskActionName', () => {
122218
})
123219

124220
it('masks words not in allowlist (with dictionary from $DD_ALLOW)', () => {
221+
const matchRegex = getMatchRegex()
222+
let expected = 'test-💥-xxxxxx-xxx'
223+
if (!matchRegex.flags.includes('gu')) {
224+
expected = 'test-💥-$>xxxx-xxx'
225+
}
226+
125227
const testString1 = maskActionName('test-💥-$>=123-pii', actionNameDictionary.allowlist)
126228
expect(testString1.masked).toBeTrue()
127-
expect(testString1.name).toBe('test-💥-***-***')
229+
expect(testString1.name).toBe(expected)
230+
231+
expected = 'test-xxxxxx*hello xxxx'
232+
if (!matchRegex.flags.includes('gu')) {
233+
expected = 'test-💥xxxx*hello xxxx'
234+
}
128235
const testString2 = maskActionName('test-💥+123*hello wild', actionNameDictionary.allowlist)
129236
expect(testString2.masked).toBeTrue()
130-
expect(testString2.name).toBe('test-****hello ***')
237+
expect(testString2.name).toBe(expected)
131238
})
132239

133240
it('handles empty string', () => {

packages/rum-core/src/domain/action/privacy/allowedDictionary.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { CENSORED_STRING_MARK } from '../../privacy'
1+
import { TEXT_MASKING_CHAR } from '../../privacy'
22

33
declare global {
44
interface Window {
@@ -13,8 +13,8 @@ export function getMatchRegex(): RegExp {
1313
try {
1414
matchRegex = new RegExp('\\p{Letter}+|[\\p{Symbol}\\p{Number}]+', 'gu')
1515
} catch {
16-
// Fallback to support european letters and apostrophes
17-
matchRegex = /(?:(?![×Þß÷þø])[a-zÀ-ÿ])+|(?:(?!(?:(?![×Þß÷þø])[a-zÀ-ÿ]))[^\s])+/gi
16+
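// Fallback when the Unicode property-escape regex above cannot be constructed: matches runs of a-z / à-ÿ characters (case-insensitive), or one non-word, non-space character followed by digits. Apostrophes are not part of a word, so they split tokens.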
17+
matchRegex = /[a-zà-ÿ]+|[^\s\w][0-9]+/gi
1818
}
1919
}
2020
return matchRegex
@@ -109,7 +109,7 @@ export function maskActionName(
109109
name: name.replace(getMatchRegex(), (word: string) => {
110110
if (!processedAllowlist.has(word.toLocaleLowerCase())) {
111111
masked = true
112-
return CENSORED_STRING_MARK
112+
return TEXT_MASKING_CHAR.repeat(word.length)
113113
}
114114
return word
115115
}),

packages/rum-core/src/domain/privacy.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ export const FORM_PRIVATE_TAG_NAMES: { [tagName: string]: true } = {
3535
OPTGROUP: true,
3636
}
3737

38-
const TEXT_MASKING_CHAR = 'x'
38+
export const TEXT_MASKING_CHAR = 'x'
3939

4040
export type NodePrivacyLevelCache = Map<Node, NodePrivacyLevel>
4141

0 commit comments

Comments
 (0)