hyperledger-labs
diff --git a/‎chromadb/chroma.sqlite3
-160 KB b/‎chromadb/chroma.sqlite3
-160 KB
diff --git a/‎src/core/chromadb/chroma.sqlite3
0 Bytes b/‎src/core/chromadb/chroma.sqlite3
0 Bytes
diff --git a/‎src/core/config/guardrails.yaml
Lines changed: 26 additions & 22 deletions b/‎src/core/config/guardrails.yaml
Lines changed: 26 additions & 22 deletions
diff --git a/‎src/core/conversation.py
Lines changed: 45 additions & 15 deletions b/‎src/core/conversation.py
Lines changed: 45 additions & 15 deletions
diff --git a/‎src/core/guardrails.py
Lines changed: 46 additions & 22 deletions b/‎src/core/guardrails.py
Lines changed: 46 additions & 22 deletions
@@ -1,6 +1,6 @@
 # Enhanced Guardrails configuration for AIFAQ
 
-# Topics to avoid in responses (can be customized by administrators)
+# Topics to avoid in responses
 blocked_topics:
   - "new_topic"
   - "cryptocurrency"
@@ -17,37 +17,39 @@ topic_related_terms:
     - "ethereum"
     - "blockchain investment"
     - "token sale"
-    - "crypto trading"
+    - "crypto"
     - "ICO"
-    - "mining profitability"
+    - "mining"
     - "altcoin"
-    - "trading strategy"
-    - "coin market"
-    - "crypto exchange"
+    - "trading"
+    - "exchange"
+    - "coin"
+    - "token"
+    - "wallet"
+    - "profit"
   "hacking":
     - "exploit"
     - "vulnerability"
     - "unauthorized access"
     - "bypass security"
     - "crack password"
     - "penetration testing"
-    - "exploit code"
     - "backdoor"
     - "security breach"
     - "break into"
-    - "infiltrate system"
-    - "circumvent protection"
+    - "infiltrate"
+    - "circumvent"
   "illegal activities":
     - "black market"
     - "smuggling"
     - "theft"
-    - "fraud scheme"
+    - "fraud"
     - "money laundering"
     - "illegal download"
     - "counterfeit"
     - "unauthorized copying"
     - "piracy"
-    - "unlawful access"
+    - "unlawful"
     - "stealing"
     - "illicit"
   "personal data extraction":
@@ -74,22 +76,23 @@ topic_related_terms:
     - "hate speech"
   "harmful content":
     - "self-harm"
-    - "suicide method"
+    - "suicide"
     - "dangerous chemical"
-    - "weapon making"
+    - "weapon"
     - "explosives"
     - "harmful instructions"
     - "dangerous experiment"
 
 # Regular expressions to filter out from responses
 filtered_patterns:
-  - "(?i)password\\s+(?:is|should\\s+be|could\\s+be)\\s+[\\w\\d\\s\\W]{3,}"
-  - "(?i)(?:your|the|a|my)\\s+password\\s+(?:is|should|could|might|will|would)\\s+[\\w\\d\\s\\W]{3,}"
-  - "(?i)(?:credit\\s+card|card\\s+number|cvv|security\\s+code)\\s*:?\\s*\\d{3,}"
-  - "(?i)(?:private|secret)\\s+key\\s*:?\\s*[\\w\\d/+]{5,}"
-  - "(?i)access\\s+(?:token|key|credential)\\s*:?\\s*[\\w\\d]{5,}"
-  - "(?i)api\\s+(?:key|token|secret)\\s*:?\\s*[\\w\\d]{5,}"
-  - "(?i)(?:login|account)\\s+credential\\s*:?\\s*[\\w\\d]{3,}"
+  - "password\\s+(?:is|should\\s+be|could\\s+be|would\\s+be)\\s*\\S+"
+  - "my\\s+password\\s+is\\s*\\S+"
+  - "your\\s+password\\s+(?:is|should|could|might|will|would)\\s*\\S+"
+  - "(?:credit\\s+card|card\\s+number|cvv|security\\s+code)\\s*:?\\s*\\d{3,}"
+  - "(?:private|secret)\\s+key\\s*:?\\s*[\\w\\d/+]{5,}"
+  - "access\\s+(?:token|key|credential)\\s*:?\\s*[\\w\\d]{5,}"
+  - "api\\s+(?:key|token|secret)\\s*:?\\s*[\\w\\d]{5,}"
+  - "(?:login|account)\\s+credential\\s*:?\\s*[\\w\\d]{3,}"
 
 # Maximum response length in characters
 max_response_length: 300
@@ -120,10 +123,11 @@ disclaimers:
   "technical": "\n\nNote: Implementation details may vary based on your specific environment, software versions, and organizational requirements. Always test in a non-production environment first."
 
 # High-risk term combinations that should trigger blocking
-# If multiple terms from a combination appear in a query, it will be blocked
 high_risk_combinations:
   - ["hack", "tutorial", "step"]
   - ["password", "crack", "tool"]
   - ["bypass", "security", "how"]
   - ["steal", "data", "method"]
-  - ["exploit", "vulnerability", "code"]
+  - ["exploit", "vulnerability", "code"]
+  - ["mine", "bitcoin", "profit"]
+  - ["trade", "exchange", "crypto"]
@@ -9,6 +9,7 @@
 from session_history import get_session_history
 from guardrails import GuardrailProcessor, GuardrailConfig
 import logging
+import os
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -22,14 +23,15 @@ def initialize_models():
     tokenizer = AutoTokenizer.from_pretrained(config_data["model_name"])
     embeddings = HuggingFaceEmbeddings(model_name=config_data["embedding_model_name"])
 
-    persist_directory = config_data.get("persist_directory", "chromadb")
+    # Resolve the ChromaDB directory next to this file so the path does not depend on the current working directory
+    chroma_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "chromadb")
+    logger.info(f"Using ChromaDB path: {chroma_path}")
 
     # Ensure directory exists
-    import os
-    os.makedirs(persist_directory, exist_ok=True)
+    os.makedirs(chroma_path, exist_ok=True)
 
     vectordb = Chroma(
-        persist_directory=persist_directory,
+        persist_directory=chroma_path,
         embedding_function=embeddings
     )
 
@@ -39,15 +41,37 @@ def initialize_models():
 
     return model, tokenizer, vectordb, initialize_models._guardrails
 
-def initialize_guardrails(config_path="config/guardrails.yaml"):
-    """Initialize the guardrails processor."""
+def initialize_guardrails(config_path=None):
+    """Initialize the guardrails processor with optional custom config path."""
     config = GuardrailConfig()
-    try:
-        config.load_from_file(config_path)
-        logger.info(f"Loaded guardrails configuration from {config_path}")
-    except FileNotFoundError:
-        logger.warning(f"Guardrails config file not found at {config_path}. Using default configuration.")
-        config._set_default_config()
+    if config_path:
+        try:
+            config.load_from_file(config_path)
+            logger.info(f"Loaded guardrails configuration from {config_path}")
+        except FileNotFoundError:
+            logger.warning(f"Guardrails config file not found at {config_path}. Using default configuration.")
+            config._set_default_config()
+    else:
+        # Try to load from default locations
+        default_paths = [
+            os.path.join(os.path.dirname(os.path.abspath(__file__)), "config/guardrails.yaml"),
+            "config/guardrails.yaml",  # Relative path
+            "src/core/config/guardrails.yaml"  # Another common path
+        ]
+        
+        config_loaded = False
+        for path in default_paths:
+            try:
+                config.load_from_file(path)
+                logger.info(f"Loaded guardrails configuration from {path}")
+                config_loaded = True
+                break
+            except FileNotFoundError:
+                continue
+        
+        if not config_loaded:
+            logger.warning("No guardrails config file found. Using default configuration.")
+            config._set_default_config()
 
     logger.info(f"Guardrails initialized with blocked topics: {config.blocked_topics}")
     logger.info(f"Max response length: {config.max_response_length}")
@@ -61,6 +85,7 @@ def retrieve_relevant_context(query, vectordb, top_k=3):
         logger.info(f"Retrieved {len(results)} documents for query: {query}")
 
         if not results:
+            logger.warning(f"No documents found for query: {query}")
             return "No relevant documents were found in the knowledge base."
 
         context = "\n\n".join([doc.page_content for doc in results])
@@ -71,19 +96,23 @@ def retrieve_relevant_context(query, vectordb, top_k=3):
 
 def generate_response(session_id, model, tokenizer, query, vectordb):
     """Generate a response with guardrails applied."""
-    # Use the singleton guardrails instance
+    # Use the shared guardrails instance, creating it on first use if it has not been set yet
+    if not hasattr(initialize_models, '_guardrails'):
+        initialize_models._guardrails = initialize_guardrails()
     guardrails_processor = initialize_models._guardrails
 
-    # Apply guardrails to the query
+    # Apply guardrails to the query first
     should_process, custom_response = guardrails_processor.check_query(query)
     if not should_process:
+        logger.info(f"Query blocked by guardrails: {query}")
         conversation_history = get_session_history(session_id)
         conversation_history.add_user_message(query)
         conversation_history.add_ai_message(custom_response)
         return custom_response
 
     conversation_history = get_session_history(session_id)
     context = retrieve_relevant_context(query, vectordb)
+    logger.info(f"Retrieved context length: {len(context)}")
 
     qa_system_prompt = """You are a concise assistant for question-answering tasks. \
     Use the following pieces of retrieved context to answer the question. \
@@ -107,7 +136,8 @@ def generate_response(session_id, model, tokenizer, query, vectordb):
     response = ""
     for token in streamer:
         response += token
-        print(token)
+        print(token, end="", flush=True)
+    print()
 
     # Apply guardrails to the generated response
     processed_response = guardrails_processor.process_response(query, response)
 
@@ -61,6 +61,9 @@ def load_from_file(self, filepath: str) -> None:
                 self.disclaimers = config['disclaimers']
             if 'high_risk_combinations' in config:
                 self.high_risk_combinations = config['high_risk_combinations']
+            
+            logger.info(f"Loaded guardrails config with {len(self.blocked_topics)} blocked topics")
+            logger.info(f"Loaded {len(self.topic_related_terms)} topic term relations")
         except Exception as e:
             logger.error(f"Error loading guardrails config: {e}")
             self._set_default_config()
@@ -81,9 +84,9 @@ def _set_default_config(self):
         # Related terms for semantic understanding
         self.topic_related_terms = {
             "cryptocurrency": [
-                "bitcoin", "ethereum", "blockchain investment", "token sale", 
-                "crypto trading", "ICO", "mining profitability", "altcoin", 
-                "trading strategy", "coin market", "crypto exchange"
+                "bitcoin", "ethereum", "crypto", "token", "coin", 
+                "mining", "exchange", "trading", "wallet", "profit", 
+                "altcoin", "blockchain investment", "ICO"
             ],
             "hacking": [
                 "exploit", "vulnerability", "unauthorized access", "bypass security",
@@ -110,10 +113,11 @@ def _set_default_config(self):
             ]
         }
 
-        # More robust pattern filtering with better regex
+        # Patterns redacted from responses; the password patterns match through the next run of non-whitespace characters
         self.filtered_patterns = [
-            r"(?i)password\s+(?:is|should\s+be|could\s+be)\s+[\w\d\s\W]{3,}",
-            r"(?i)(?:your|the|a|my)\s+password\s+(?:is|should|could|might|will|would)\s+[\w\d\s\W]{3,}",
+            r"(?i)my\s+password\s+(?:is|should\s+be|could\s+be)\s+\S+",  # Catches "My password should be password123"
+            r"(?i)password\s+(?:is|should\s+be|could\s+be)\s+\S+",       # More general password pattern
+            r"(?i)(?:your|the|a)\s+password\s+(?:is|should|could|might|will|would)\s+\S+",
             r"(?i)(?:credit\s+card|card\s+number|cvv|security\s+code)\s*:?\s*\d{3,}",
             r"(?i)(?:private|secret)\s+key\s*:?\s*[\w\d/+]{5,}",
             r"(?i)access\s+(?:token|key|credential)\s*:?\s*[\w\d]{5,}",
@@ -155,7 +159,9 @@ def _set_default_config(self):
             ["password", "crack", "tool"],
             ["bypass", "security", "how"],
             ["steal", "data", "method"],
-            ["exploit", "vulnerability", "code"]
+            ["exploit", "vulnerability", "code"],
+            ["mine", "bitcoin", "profit"],
+            ["trade", "exchange", "crypto"]
         ]
 
 
@@ -193,25 +199,32 @@ def check_query(self, query: str) -> Tuple[bool, Optional[str]]:
                 logger.info(f"Query blocked due to topic: {topic}")
                 return False, f"I'm sorry, but I cannot provide information about {topic}."
 
-        # Check for related terms to blocked topics - semantic understanding
+        # Special case (previously failing): block queries that pair a cryptocurrency term with a trading or investment term
+        if "bitcoin" in query_lower or "crypto" in query_lower or "mining" in query_lower:
+            if any(term in query_lower for term in ["profit", "trading", "exchange", "invest"]):
+                logger.info(f"Query blocked due to cryptocurrency terms")
+                return False, f"I'm sorry, but I cannot provide information about cryptocurrency trading or investments."
+        
+        # Check for semantic matches using related terms
         for topic, related_terms in self.config.topic_related_terms.items():
             matched_terms = []
             for term in related_terms:
-                if term.lower() in query_lower:
+                # Match the term on word boundaries so it is not counted when it only appears inside a longer word
+                if re.search(r'\b' + re.escape(term.lower()) + r'\b', query_lower):
                     matched_terms.append(term)
 
-            # If 2 or more related terms are found, consider it as discussing the blocked topic
+            # If we have 2 or more matched terms, consider it a match
             if len(matched_terms) >= 2:
-                logger.info(f"Query blocked due to multiple related terms for topic {topic}: {matched_terms}")
-                return False, f"I'm sorry, but I cannot provide information that appears to be related to {topic}."
+                logger.info(f"Query blocked due to semantic match ({len(matched_terms)} terms) for topic {topic}: {matched_terms}")
+                return False, f"I'm sorry, but I cannot provide information about topics related to {topic}."
 
         # Check for high-risk term combinations
         for combination in self.config.high_risk_combinations:
-            if all(term in query_words for term in combination):
-                logger.info(f"Query blocked due to high-risk term combination: {combination}")
-                return False, "I'm sorry, but I cannot provide information on this topic as it appears to be requesting potentially harmful or unethical guidance."
+            matching_terms = [term for term in combination if term in query_lower]
+            if len(matching_terms) >= len(combination) - 1:  # Match if all but one term is present
+                logger.info(f"Query blocked due to high-risk combination: {matching_terms}")
+                return False, "I cannot provide information on this topic as it appears to be requesting potentially harmful guidance."
 
-        # If we've made it this far, the query is allowed
         return True, None
 
     def process_response(self, query: str, response: str) -> str:
@@ -227,35 +240,46 @@ def process_response(self, query: str, response: str) -> str:
         """
         processed = response
         query_lower = query.lower()
+        combined_text = query_lower + " " + processed.lower()
 
         # Apply length limit
         if len(processed) > self.config.max_response_length:
-            processed = processed[:self.config.max_response_length] + "... [Response truncated for brevity]"
+            processed = processed[:self.config.max_response_length] + "... [Response truncated]"
             logger.info(f"Response truncated to {self.config.max_response_length} characters")
 
-        # Apply pattern filters with word boundary checks for better matching
+        # Apply pattern filters
         for pattern in self.config.filtered_patterns:
             original_length = len(processed)
             processed = re.sub(pattern, "[FILTERED]", processed, flags=re.IGNORECASE)
             if len(processed) != original_length:
-                logger.info(f"Pattern '{pattern}' filtered from response")
+                logger.info(f"Pattern filter applied: {pattern}")
 
         # Add security disclaimer for security-related content
-        if any(term in query_lower for term in ["security", "secure", "protection", "safety", "privacy", "firewall", "encrypt"]):
+        security_terms = ["security", "secure", "protection", "safety", "privacy", "firewall", "encrypt", 
+                         "authentication", "password", "credential", "access control"]
+        
+        if any(term in combined_text for term in security_terms):
             if not processed.endswith('\n'):
                 processed += '\n'
             processed += self.config.disclaimers.get("security", "")
             logger.info("Added security disclaimer to response")
+            return processed
 
         # Add blockchain disclaimer for blockchain-related content
-        elif any(term in query_lower for term in ["blockchain", "hyperledger", "distributed ledger", "smart contract"]):
+        blockchain_terms = ["blockchain", "hyperledger", "distributed ledger", "smart contract", 
+                           "consensus", "chaincode", "fabric"]
+        
+        if any(term in combined_text for term in blockchain_terms):
             if not processed.endswith('\n'):
                 processed += '\n'
             processed += self.config.disclaimers.get("blockchain", "")
             logger.info("Added blockchain disclaimer to response")
+            return processed
 
         # Add technical disclaimer for implementation-related content
-        elif any(term in query_lower for term in ["implement", "deploy", "install", "configure", "setup"]):
+        technical_terms = ["implement", "deploy", "install", "configure", "setup", "integration", "docker"]
+        
+        if any(term in combined_text for term in technical_terms):
             if not processed.endswith('\n'):
                 processed += '\n'
             processed += self.config.disclaimers.get("technical", "")