Skip to content

Commit 20633f1

Browse files
fix: Do not double count secrets on /explain copilot function
Closes: #519 When using the `/explain` function in copilot we were double counting the secrets. The problem was that we were getting several `user` messages after the last `assistant` message. We use the last `assistant` message as a means to identify the user messages. Here is an example of what the request looked like for `/explain`: ``` [ {"role": "assistant", "content": "some content"}, {"role": "user", "content": content_with_secrets}, {"role": "user", "content": content_with_secrets}, ] ``` To avoid double counting, we now check which secrets matched after the last `assistant` message and only consider the unique ones.
1 parent c00dc18 commit 20633f1

File tree

2 files changed

+19
-19
lines changed

2 files changed

+19
-19
lines changed

src/codegate/pipeline/secrets/secrets.py

+13-13
Original file line numberDiff line numberDiff line change
@@ -106,10 +106,10 @@ def _get_surrounding_secret_lines(
106106
end_line = min(secret_line + surrounding_lines, len(lines))
107107
return "\n".join(lines[start_line:end_line])
108108

109-
def obfuscate(self, text: str) -> tuple[str, int]:
109+
def obfuscate(self, text: str) -> tuple[str, List[Match]]:
110110
matches = CodegateSignatures.find_in_string(text)
111111
if not matches:
112-
return text, 0
112+
return text, []
113113

114114
logger.debug(f"Found {len(matches)} secrets in the user message")
115115

@@ -133,16 +133,16 @@ def obfuscate(self, text: str) -> tuple[str, int]:
133133
protected_text = list(text)
134134

135135
# Store matches for logging
136-
found_secrets = 0
136+
found_secrets = []
137137

138138
# First pass. Replace each match with its encrypted value
139-
logger.info("\nFound secrets:")
139+
logger.info(f"\nFound {len(absolute_matches)} secrets:")
140140
for start, end, match in absolute_matches:
141141
hidden_secret = self._hide_secret(match)
142142

143143
# Replace the secret in the text
144144
protected_text[start:end] = hidden_secret
145-
found_secrets += 1
145+
found_secrets.append(match)
146146
# Log the findings
147147
logger.info(
148148
f"\nService: {match.service}"
@@ -228,7 +228,7 @@ def name(self) -> str:
228228

229229
def _redact_text(
230230
self, text: str, secrets_manager: SecretsManager, session_id: str, context: PipelineContext
231-
) -> tuple[str, int]:
231+
) -> tuple[str, List[Match]]:
232232
"""
233233
Find and encrypt secrets in the given text.
234234
@@ -269,7 +269,7 @@ async def process(
269269
raise ValueError("Session ID not found in context")
270270

271271
new_request = request.copy()
272-
total_redacted = 0
272+
total_matches = []
273273

274274
# Process all messages
275275
last_assistant_idx = -1
@@ -281,15 +281,18 @@ async def process(
281281
for i, message in enumerate(new_request["messages"]):
282282
if "content" in message and message["content"]:
283283
# Protect the text
284-
protected_string, redacted_count = self._redact_text(
284+
protected_string, secrets_matched = self._redact_text(
285285
str(message["content"]), secrets_manager, session_id, context
286286
)
287287
new_request["messages"][i]["content"] = protected_string
288288

289-
# Sum redacted count for messages after the last assistant message
289+
# Append the matches for messages after the last assistant message
290290
if i > last_assistant_idx:
291-
total_redacted += redacted_count
291+
total_matches += secrets_matched
292292

293+
# Not count repeated secret matches
294+
set_secrets_value = set(match.value for match in total_matches)
295+
total_redacted = len(set_secrets_value)
293296
context.secrets_found = total_redacted > 0
294297
logger.info(f"Total secrets redacted since last assistant message: {total_redacted}")
295298

@@ -362,7 +365,6 @@ async def process_chunk(
362365
if match:
363366
# Found a complete marker, process it
364367
encrypted_value = match.group(1)
365-
print("----> encrypted_value: ", encrypted_value)
366368
original_value = input_context.sensitive.manager.get_original_value(
367369
encrypted_value,
368370
input_context.sensitive.session_id,
@@ -371,8 +373,6 @@ async def process_chunk(
371373
if original_value is None:
372374
# If value not found, leave as is
373375
original_value = match.group(0) # Keep the REDACTED marker
374-
else:
375-
print("----> original_value: ", original_value)
376376

377377
# Post an alert with the redacted content
378378
input_context.add_alert(self.name, trigger_string=encrypted_value)

tests/pipeline/secrets/test_secrets.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,9 @@ def test_hide_secret(self):
9797
def test_obfuscate(self):
9898
# Test text with a secret
9999
text = "API_KEY=AKIAIOSFODNN7EXAMPLE\nOther text"
100-
protected, count = self.encryptor.obfuscate(text)
100+
protected, matched_secrets = self.encryptor.obfuscate(text)
101101

102-
assert count == 1
102+
assert len(matched_secrets) == 1
103103
assert "REDACTED<$" in protected
104104
assert "AKIAIOSFODNN7EXAMPLE" not in protected
105105
assert "Other text" in protected
@@ -128,9 +128,9 @@ def test_hide_secret(self):
128128
def test_obfuscate(self):
129129
# Test text with multiple secrets
130130
text = "API_KEY=AKIAIOSFODNN7EXAMPLE\nPASSWORD=AKIAIOSFODNN7EXAMPLE"
131-
protected, count = self.obfuscator.obfuscate(text)
131+
protected, matched_secrets = self.obfuscator.obfuscate(text)
132132

133-
assert count == 2
133+
assert len(matched_secrets) == 2
134134
assert "AKIAIOSFODNN7EXAMPLE" not in protected
135135
assert "*" * 32 in protected
136136

@@ -140,9 +140,9 @@ def test_obfuscate(self):
140140

141141
def test_obfuscate_no_secrets(self):
142142
text = "Regular text without secrets"
143-
protected, count = self.obfuscator.obfuscate(text)
143+
protected, matched_secrets = self.obfuscator.obfuscate(text)
144144

145-
assert count == 0
145+
assert len(matched_secrets) == 0
146146
assert protected == text
147147

148148

0 commit comments

Comments
 (0)