112 changes: 109 additions & 3 deletions backend/ks_search_tool.py
@@ -4,10 +4,10 @@
import requests
import asyncio
import aiohttp
from typing import Dict, Optional, Set, Union, List, Any, Iterable
import re
from urllib.parse import urlparse
from typing import List
from difflib import SequenceMatcher
from urllib.parse import urlparse, urlunparse
import re


def tool(args_schema):
@@ -320,6 +320,9 @@ async def general_search_async(query: str, top_k: int = 10, enrich_details: bool
or item.get("dc", {}).get("identifier")
or "https://knowledge-space.org"
)



normalized_results.append(
{
"_id": f"general_{i}",
@@ -443,8 +446,111 @@ def _perform_search(data_source_id: str, query: str, filters: dict, all_configs:
except requests.RequestException as e:
print(f" -> Error searching {data_source_id}: {e}")
return []



# Deduplication feature (updated version)
def normalize_url(url: str) -> str:
"""Normalize URLs by stripping query params and fragments."""
if not url:
return ""
parsed = urlparse(url)
normalized = urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))
return normalized.lower().rstrip("/")
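# Example (hypothetical input): normalize_url("https://Example.org/data/42?ref=x#frag")
# returns "https://example.org/data/42". The query string and fragment are dropped,
# the result is lowercased, and any trailing slash is removed.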


def normalize_title(title: str) -> str:
"""Normalize title: lowercase, strip punctuation, extra spaces."""
if not title:
return ""
title = title.lower()
title = re.sub(r"[^\w\s]", "", title)
title = re.sub(r"\s+", " ", title)
return title.strip()
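# Example (hypothetical input): normalize_title("  The Allen Mouse-Brain Atlas! ")
# returns "the allen mousebrain atlas". Punctuation is removed outright, so
# hyphenated words collapse into a single token before comparison.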


def titles_reordered_match(t1: str, t2: str) -> bool:
"""Detect titles with same words but different order."""
tokens1 = set(t1.split())
tokens2 = set(t2.split())
return tokens1 == tokens2
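# Example: titles_reordered_match("mouse brain atlas", "atlas brain mouse") is True.
# Token sets are compared, so repeated words are ignored as well:
# "mouse mouse brain" also matches "mouse brain".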


def deduplicate_datasets(all_datasets: List[dict]) -> List[dict]:
"""Deduplicate datasets using canonical ID, normalized URL, fuzzy title, and reordered title detection."""

if not all_datasets:
return []

cleaned = []
seen_canonical = set()
seen_urls = set()

for dataset in all_datasets:
metadata = dataset.get("metadata", {}) or dataset.get("_source", {})

# Canonical ID
dataset_id = metadata.get("id") or metadata.get("dataset_id") or dataset.get("_id")
dataset_id = str(dataset_id).lower() if dataset_id else ""

datasource_id = str(dataset.get("datasource_id") or "default_source").lower()
canonical_key = f"{datasource_id}:{dataset_id}"

if dataset_id and canonical_key in seen_canonical:
continue

if dataset_id:
seen_canonical.add(canonical_key)

# URL deduplication
raw_url = dataset.get("primary_link", "")
normalized_url = normalize_url(raw_url)

if normalized_url and normalized_url in seen_urls:
continue

if normalized_url:
seen_urls.add(normalized_url)

# Title normalization
title = normalize_title(
dataset.get("title")
or dataset.get("title_guess")
or metadata.get("title")
or ""
)

duplicate_found = False

if title:
for existing in cleaned:
existing_title = normalize_title(
existing.get("title")
or existing.get("title_guess")
or ""
)

if not existing_title:
continue

# Fuzzy match
similarity = SequenceMatcher(None, title, existing_title).ratio()

if similarity > 0.93:
duplicate_found = True
break

# Reordered title match
if titles_reordered_match(title, existing_title):
duplicate_found = True
break

if not duplicate_found:
cleaned.append(dataset)

return cleaned
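# Minimal usage sketch with hypothetical records (not taken from this module's call sites):
#   raw = [
#       {"_id": "a", "title": "Mouse Brain Atlas",
#        "primary_link": "https://example.org/ds/1?ref=x"},
#       {"_id": "b", "title": "Brain Atlas, Mouse",
#        "primary_link": "https://mirror.example.org/ds/1"},
#   ]
#   deduplicate_datasets(raw)
#   # -> keeps only the first record; the second has a different URL but its
#   #    normalized title is a reordered match of the first.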


@tool(args_schema=BaseModel)
def smart_knowledge_search(
query: Optional[str] = None,