Skip to content

Commit c94289e

Browse files
committed
guardrails added and bugs fixed
Signed-off-by: RAWx18 <[email protected]>
1 parent 599f558 commit c94289e

File tree

5 files changed

+117
-59
lines changed

5 files changed

+117
-59
lines changed

chromadb/chroma.sqlite3

-160 KB
Binary file not shown.

src/core/chromadb/chroma.sqlite3

0 Bytes
Binary file not shown.

src/core/config/guardrails.yaml

Lines changed: 26 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Enhanced Guardrails configuration for AIFAQ
22

3-
# Topics to avoid in responses (can be customized by administrators)
3+
# Topics to avoid in responses
44
blocked_topics:
55
- "new_topic"
66
- "cryptocurrency"
@@ -17,37 +17,39 @@ topic_related_terms:
1717
- "ethereum"
1818
- "blockchain investment"
1919
- "token sale"
20-
- "crypto trading"
20+
- "crypto"
2121
- "ICO"
22-
- "mining profitability"
22+
- "mining"
2323
- "altcoin"
24-
- "trading strategy"
25-
- "coin market"
26-
- "crypto exchange"
24+
- "trading"
25+
- "exchange"
26+
- "coin"
27+
- "token"
28+
- "wallet"
29+
- "profit"
2730
"hacking":
2831
- "exploit"
2932
- "vulnerability"
3033
- "unauthorized access"
3134
- "bypass security"
3235
- "crack password"
3336
- "penetration testing"
34-
- "exploit code"
3537
- "backdoor"
3638
- "security breach"
3739
- "break into"
38-
- "infiltrate system"
39-
- "circumvent protection"
40+
- "infiltrate"
41+
- "circumvent"
4042
"illegal activities":
4143
- "black market"
4244
- "smuggling"
4345
- "theft"
44-
- "fraud scheme"
46+
- "fraud"
4547
- "money laundering"
4648
- "illegal download"
4749
- "counterfeit"
4850
- "unauthorized copying"
4951
- "piracy"
50-
- "unlawful access"
52+
- "unlawful"
5153
- "stealing"
5254
- "illicit"
5355
"personal data extraction":
@@ -74,22 +76,23 @@ topic_related_terms:
7476
- "hate speech"
7577
"harmful content":
7678
- "self-harm"
77-
- "suicide method"
79+
- "suicide"
7880
- "dangerous chemical"
79-
- "weapon making"
81+
- "weapon"
8082
- "explosives"
8183
- "harmful instructions"
8284
- "dangerous experiment"
8385

8486
# Regular expressions to filter out from responses
8587
filtered_patterns:
86-
- "(?i)password\\s+(?:is|should\\s+be|could\\s+be)\\s+[\\w\\d\\s\\W]{3,}"
87-
- "(?i)(?:your|the|a|my)\\s+password\\s+(?:is|should|could|might|will|would)\\s+[\\w\\d\\s\\W]{3,}"
88-
- "(?i)(?:credit\\s+card|card\\s+number|cvv|security\\s+code)\\s*:?\\s*\\d{3,}"
89-
- "(?i)(?:private|secret)\\s+key\\s*:?\\s*[\\w\\d/+]{5,}"
90-
- "(?i)access\\s+(?:token|key|credential)\\s*:?\\s*[\\w\\d]{5,}"
91-
- "(?i)api\\s+(?:key|token|secret)\\s*:?\\s*[\\w\\d]{5,}"
92-
- "(?i)(?:login|account)\\s+credential\\s*:?\\s*[\\w\\d]{3,}"
88+
- "password\\s+(?:is|should\\s+be|could\\s+be|would\\s+be)\\s*\\S+"
89+
- "my\\s+password\\s+is\\s*\\S+"
90+
- "your\\s+password\\s+(?:is|should|could|might|will|would)\\s*\\S+"
91+
- "(?:credit\\s+card|card\\s+number|cvv|security\\s+code)\\s*:?\\s*\\d{3,}"
92+
- "(?:private|secret)\\s+key\\s*:?\\s*[\\w\\d/+]{5,}"
93+
- "access\\s+(?:token|key|credential)\\s*:?\\s*[\\w\\d]{5,}"
94+
- "api\\s+(?:key|token|secret)\\s*:?\\s*[\\w\\d]{5,}"
95+
- "(?:login|account)\\s+credential\\s*:?\\s*[\\w\\d]{3,}"
9396

9497
# Maximum response length in characters
9598
max_response_length: 300
@@ -120,10 +123,11 @@ disclaimers:
120123
"technical": "\n\nNote: Implementation details may vary based on your specific environment, software versions, and organizational requirements. Always test in a non-production environment first."
121124

122125
# High-risk term combinations that should trigger blocking
123-
# If multiple terms from a combination appear in a query, it will be blocked
124126
high_risk_combinations:
125127
- ["hack", "tutorial", "step"]
126128
- ["password", "crack", "tool"]
127129
- ["bypass", "security", "how"]
128130
- ["steal", "data", "method"]
129-
- ["exploit", "vulnerability", "code"]
131+
- ["exploit", "vulnerability", "code"]
132+
- ["mine", "bitcoin", "profit"]
133+
- ["trade", "exchange", "crypto"]

src/core/conversation.py

Lines changed: 45 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from session_history import get_session_history
1010
from guardrails import GuardrailProcessor, GuardrailConfig
1111
import logging
12+
import os
1213

1314
# Configure logging
1415
logging.basicConfig(level=logging.INFO)
@@ -22,14 +23,15 @@ def initialize_models():
2223
tokenizer = AutoTokenizer.from_pretrained(config_data["model_name"])
2324
embeddings = HuggingFaceEmbeddings(model_name=config_data["embedding_model_name"])
2425

25-
persist_directory = config_data.get("persist_directory", "chromadb")
26+
# Resolve the ChromaDB directory relative to this file - hardcoded, so a persist_directory set in config_data is no longer used
27+
chroma_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "chromadb")
28+
logger.info(f"Using ChromaDB path: {chroma_path}")
2629

2730
# Ensure directory exists
28-
import os
29-
os.makedirs(persist_directory, exist_ok=True)
31+
os.makedirs(chroma_path, exist_ok=True)
3032

3133
vectordb = Chroma(
32-
persist_directory=persist_directory,
34+
persist_directory=chroma_path,
3335
embedding_function=embeddings
3436
)
3537

@@ -39,15 +41,37 @@ def initialize_models():
3941

4042
return model, tokenizer, vectordb, initialize_models._guardrails
4143

42-
def initialize_guardrails(config_path="config/guardrails.yaml"):
43-
"""Initialize the guardrails processor."""
44+
def initialize_guardrails(config_path=None):
45+
"""Initialize the guardrails processor with optional custom config path."""
4446
config = GuardrailConfig()
45-
try:
46-
config.load_from_file(config_path)
47-
logger.info(f"Loaded guardrails configuration from {config_path}")
48-
except FileNotFoundError:
49-
logger.warning(f"Guardrails config file not found at {config_path}. Using default configuration.")
50-
config._set_default_config()
47+
if config_path:
48+
try:
49+
config.load_from_file(config_path)
50+
logger.info(f"Loaded guardrails configuration from {config_path}")
51+
except FileNotFoundError:
52+
logger.warning(f"Guardrails config file not found at {config_path}. Using default configuration.")
53+
config._set_default_config()
54+
else:
55+
# Try to load from default locations
56+
default_paths = [
57+
os.path.join(os.path.dirname(os.path.abspath(__file__)), "config/guardrails.yaml"),
58+
"config/guardrails.yaml", # Relative path
59+
"src/core/config/guardrails.yaml" # Another common path
60+
]
61+
62+
config_loaded = False
63+
for path in default_paths:
64+
try:
65+
config.load_from_file(path)
66+
logger.info(f"Loaded guardrails configuration from {path}")
67+
config_loaded = True
68+
break
69+
except FileNotFoundError:
70+
continue
71+
72+
if not config_loaded:
73+
logger.warning("No guardrails config file found. Using default configuration.")
74+
config._set_default_config()
5175

5276
logger.info(f"Guardrails initialized with blocked topics: {config.blocked_topics}")
5377
logger.info(f"Max response length: {config.max_response_length}")
@@ -61,6 +85,7 @@ def retrieve_relevant_context(query, vectordb, top_k=3):
6185
logger.info(f"Retrieved {len(results)} documents for query: {query}")
6286

6387
if not results:
88+
logger.warning(f"No documents found for query: {query}")
6489
return "No relevant documents were found in the knowledge base."
6590

6691
context = "\n\n".join([doc.page_content for doc in results])
@@ -71,19 +96,23 @@ def retrieve_relevant_context(query, vectordb, top_k=3):
7196

7297
def generate_response(session_id, model, tokenizer, query, vectordb):
7398
"""Generate a response with guardrails applied."""
74-
# Use the singleton guardrails instance
99+
# Always use the singleton guardrails instance
100+
if not hasattr(initialize_models, '_guardrails'):
101+
initialize_models._guardrails = initialize_guardrails()
75102
guardrails_processor = initialize_models._guardrails
76103

77-
# Apply guardrails to the query
104+
# Apply guardrails to the query first
78105
should_process, custom_response = guardrails_processor.check_query(query)
79106
if not should_process:
107+
logger.info(f"Query blocked by guardrails: {query}")
80108
conversation_history = get_session_history(session_id)
81109
conversation_history.add_user_message(query)
82110
conversation_history.add_ai_message(custom_response)
83111
return custom_response
84112

85113
conversation_history = get_session_history(session_id)
86114
context = retrieve_relevant_context(query, vectordb)
115+
logger.info(f"Retrieved context length: {len(context)}")
87116

88117
qa_system_prompt = """You are a concise assistant for question-answering tasks. \
89118
Use the following pieces of retrieved context to answer the question. \
@@ -107,7 +136,8 @@ def generate_response(session_id, model, tokenizer, query, vectordb):
107136
response = ""
108137
for token in streamer:
109138
response += token
110-
print(token)
139+
print(token, end="", flush=True)
140+
print()
111141

112142
# Apply guardrails to the generated response
113143
processed_response = guardrails_processor.process_response(query, response)

src/core/guardrails.py

Lines changed: 46 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ def load_from_file(self, filepath: str) -> None:
6161
self.disclaimers = config['disclaimers']
6262
if 'high_risk_combinations' in config:
6363
self.high_risk_combinations = config['high_risk_combinations']
64+
65+
logger.info(f"Loaded guardrails config with {len(self.blocked_topics)} blocked topics")
66+
logger.info(f"Loaded {len(self.topic_related_terms)} topic term relations")
6467
except Exception as e:
6568
logger.error(f"Error loading guardrails config: {e}")
6669
self._set_default_config()
@@ -81,9 +84,9 @@ def _set_default_config(self):
8184
# Related terms for semantic understanding
8285
self.topic_related_terms = {
8386
"cryptocurrency": [
84-
"bitcoin", "ethereum", "blockchain investment", "token sale",
85-
"crypto trading", "ICO", "mining profitability", "altcoin",
86-
"trading strategy", "coin market", "crypto exchange"
87+
"bitcoin", "ethereum", "crypto", "token", "coin",
88+
"mining", "exchange", "trading", "wallet", "profit",
89+
"altcoin", "blockchain investment", "ICO"
8790
],
8891
"hacking": [
8992
"exploit", "vulnerability", "unauthorized access", "bypass security",
@@ -110,10 +113,11 @@ def _set_default_config(self):
110113
]
111114
}
112115

113-
# More robust pattern filtering with better regex
116+
# More robust pattern filtering with better regex for passwords
114117
self.filtered_patterns = [
115-
r"(?i)password\s+(?:is|should\s+be|could\s+be)\s+[\w\d\s\W]{3,}",
116-
r"(?i)(?:your|the|a|my)\s+password\s+(?:is|should|could|might|will|would)\s+[\w\d\s\W]{3,}",
118+
r"(?i)my\s+password\s+(?:is|should\s+be|could\s+be)\s+\S+", # Catches "My password should be password123"
119+
r"(?i)password\s+(?:is|should\s+be|could\s+be)\s+\S+", # More general password pattern
120+
r"(?i)(?:your|the|a)\s+password\s+(?:is|should|could|might|will|would)\s+\S+",
117121
r"(?i)(?:credit\s+card|card\s+number|cvv|security\s+code)\s*:?\s*\d{3,}",
118122
r"(?i)(?:private|secret)\s+key\s*:?\s*[\w\d/+]{5,}",
119123
r"(?i)access\s+(?:token|key|credential)\s*:?\s*[\w\d]{5,}",
@@ -155,7 +159,9 @@ def _set_default_config(self):
155159
["password", "crack", "tool"],
156160
["bypass", "security", "how"],
157161
["steal", "data", "method"],
158-
["exploit", "vulnerability", "code"]
162+
["exploit", "vulnerability", "code"],
163+
["mine", "bitcoin", "profit"],
164+
["trade", "exchange", "crypto"]
159165
]
160166

161167

@@ -193,25 +199,32 @@ def check_query(self, query: str) -> Tuple[bool, Optional[str]]:
193199
logger.info(f"Query blocked due to topic: {topic}")
194200
return False, f"I'm sorry, but I cannot provide information about {topic}."
195201

196-
# Check for related terms to blocked topics - semantic understanding
202+
# Check for cryptocurrency related terms - special case since this was failing
203+
if "bitcoin" in query_lower or "crypto" in query_lower or "mining" in query_lower:
204+
if any(term in query_lower for term in ["profit", "trading", "exchange", "invest"]):
205+
logger.info(f"Query blocked due to cryptocurrency terms")
206+
return False, f"I'm sorry, but I cannot provide information about cryptocurrency trading or investments."
207+
208+
# Check for semantic matches using related terms
197209
for topic, related_terms in self.config.topic_related_terms.items():
198210
matched_terms = []
199211
for term in related_terms:
200-
if term.lower() in query_lower:
212+
# Check if the term appears as a word or phrase
213+
if re.search(r'\b' + re.escape(term.lower()) + r'\b', query_lower):
201214
matched_terms.append(term)
202215

203-
# If 2 or more related terms are found, consider it as discussing the blocked topic
216+
# If we have 2 or more matched terms, consider it a match
204217
if len(matched_terms) >= 2:
205-
logger.info(f"Query blocked due to multiple related terms for topic {topic}: {matched_terms}")
206-
return False, f"I'm sorry, but I cannot provide information that appears to be related to {topic}."
218+
logger.info(f"Query blocked due to semantic match ({len(matched_terms)} terms) for topic {topic}: {matched_terms}")
219+
return False, f"I'm sorry, but I cannot provide information about topics related to {topic}."
207220

208221
# Check for high-risk term combinations
209222
for combination in self.config.high_risk_combinations:
210-
if all(term in query_words for term in combination):
211-
logger.info(f"Query blocked due to high-risk term combination: {combination}")
212-
return False, "I'm sorry, but I cannot provide information on this topic as it appears to be requesting potentially harmful or unethical guidance."
223+
matching_terms = [term for term in combination if term in query_lower]
224+
if len(matching_terms) >= len(combination) - 1: # Match if all but one term is present
225+
logger.info(f"Query blocked due to high-risk combination: {matching_terms}")
226+
return False, "I cannot provide information on this topic as it appears to be requesting potentially harmful guidance."
213227

214-
# If we've made it this far, the query is allowed
215228
return True, None
216229

217230
def process_response(self, query: str, response: str) -> str:
@@ -227,35 +240,46 @@ def process_response(self, query: str, response: str) -> str:
227240
"""
228241
processed = response
229242
query_lower = query.lower()
243+
combined_text = query_lower + " " + processed.lower()
230244

231245
# Apply length limit
232246
if len(processed) > self.config.max_response_length:
233-
processed = processed[:self.config.max_response_length] + "... [Response truncated for brevity]"
247+
processed = processed[:self.config.max_response_length] + "... [Response truncated]"
234248
logger.info(f"Response truncated to {self.config.max_response_length} characters")
235249

236-
# Apply pattern filters with word boundary checks for better matching
250+
# Apply pattern filters
237251
for pattern in self.config.filtered_patterns:
238252
original_length = len(processed)
239253
processed = re.sub(pattern, "[FILTERED]", processed, flags=re.IGNORECASE)
240254
if len(processed) != original_length:
241-
logger.info(f"Pattern '{pattern}' filtered from response")
255+
logger.info(f"Pattern filter applied: {pattern}")
242256

243257
# Add security disclaimer for security-related content
244-
if any(term in query_lower for term in ["security", "secure", "protection", "safety", "privacy", "firewall", "encrypt"]):
258+
security_terms = ["security", "secure", "protection", "safety", "privacy", "firewall", "encrypt",
259+
"authentication", "password", "credential", "access control"]
260+
261+
if any(term in combined_text for term in security_terms):
245262
if not processed.endswith('\n'):
246263
processed += '\n'
247264
processed += self.config.disclaimers.get("security", "")
248265
logger.info("Added security disclaimer to response")
266+
return processed
249267

250268
# Add blockchain disclaimer for blockchain-related content
251-
elif any(term in query_lower for term in ["blockchain", "hyperledger", "distributed ledger", "smart contract"]):
269+
blockchain_terms = ["blockchain", "hyperledger", "distributed ledger", "smart contract",
270+
"consensus", "chaincode", "fabric"]
271+
272+
if any(term in combined_text for term in blockchain_terms):
252273
if not processed.endswith('\n'):
253274
processed += '\n'
254275
processed += self.config.disclaimers.get("blockchain", "")
255276
logger.info("Added blockchain disclaimer to response")
277+
return processed
256278

257279
# Add technical disclaimer for implementation-related content
258-
elif any(term in query_lower for term in ["implement", "deploy", "install", "configure", "setup"]):
280+
technical_terms = ["implement", "deploy", "install", "configure", "setup", "integration", "docker"]
281+
282+
if any(term in combined_text for term in technical_terms):
259283
if not processed.endswith('\n'):
260284
processed += '\n'
261285
processed += self.config.disclaimers.get("technical", "")

0 commit comments

Comments
 (0)