Skip to content

Commit c1408e1

Browse files
authored
Merge pull request #25 from pamelafox/rrffix
Fix RRF algorithm to match AI Search algo
2 parents b598a30 + c455cc9 commit c1408e1

File tree

2 files changed

+42
-42
lines changed

2 files changed

+42
-42
lines changed

rag_documents_hybrid.py

+21-21
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,3 @@
1-
# pip install sentence-transformers
21
import json
32
import os
43

@@ -72,24 +71,25 @@ def cosine_similarity(a, b):
7271
return retrieved_documents
7372

7473

75-
def reciprocal_rank_fusion(text_results, vector_results, alpha=0.5):
74+
def reciprocal_rank_fusion(text_results, vector_results, k=60):
7675
"""
77-
Perform Reciprocal Rank Fusion on the results from text and vector searches.
76+
Perform Reciprocal Rank Fusion (RRF) on the results from text and vector searches,
77+
based on algorithm described here:
78+
https://learn.microsoft.com/azure/search/hybrid-search-ranking#how-rrf-ranking-works
7879
"""
79-
text_ids = {doc["id"] for doc in text_results}
80-
vector_ids = {doc["id"] for doc in vector_results}
81-
82-
combined_results = []
83-
for doc in text_results:
84-
if doc["id"] in vector_ids:
85-
combined_results.append((doc, alpha))
86-
else:
87-
combined_results.append((doc, 1 - alpha))
88-
for doc in vector_results:
89-
if doc["id"] not in text_ids:
90-
combined_results.append((doc, alpha))
91-
combined_results.sort(key=lambda x: x[1], reverse=True)
92-
return [doc for doc, _ in combined_results]
80+
scores = {}
81+
82+
for i, doc in enumerate(text_results):
83+
if doc["id"] not in scores:
84+
scores[doc["id"]] = 0
85+
scores[doc["id"]] += 1 / (i + k)
86+
for i, doc in enumerate(vector_results):
87+
if doc["id"] not in scores:
88+
scores[doc["id"]] = 0
89+
scores[doc["id"]] += 1 / (i + k)
90+
scored_documents = sorted(scores.items(), key=lambda x: x[1], reverse=True)
91+
retrieved_documents = [documents_by_id[doc_id] for doc_id, _ in scored_documents]
92+
return retrieved_documents
9393

9494

9595
def rerank(query, retrieved_documents):
@@ -108,13 +108,13 @@ def hybrid_search(query, limit):
108108
"""
109109
text_results = full_text_search(query, limit * 2)
110110
vector_results = vector_search(query, limit * 2)
111-
combined_results = reciprocal_rank_fusion(text_results, vector_results)
112-
combined_results = rerank(query, combined_results)
113-
return combined_results[:limit]
111+
fused_results = reciprocal_rank_fusion(text_results, vector_results)
112+
reranked_results = rerank(query, fused_results)
113+
return reranked_results[:limit]
114114

115115

116116
# Get the user question
117-
user_question = "cute gray fuzzsters"
117+
user_question = "cute gray fuzzy bee"
118118

119119
# Search the index for the user question
120120
retrieved_documents = hybrid_search(user_question, limit=5)

spanish/rag_documents_hybrid.py

+21-21
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,3 @@
1-
# pip install sentence-transformers
21
import json
32
import os
43

@@ -72,24 +71,25 @@ def cosine_similarity(a, b):
7271
return retrieved_documents
7372

7473

75-
def reciprocal_rank_fusion(text_results, vector_results, alpha=0.5):
74+
def reciprocal_rank_fusion(text_results, vector_results, k=60):
7675
"""
77-
Realizar la Fusión de Rango Recíproco en los resultados de búsquedas de texto y vectoriales.
76+
Realizar la Fusión de Rango Recíproco (RRF) en los resultados de búsquedas de texto y vectoriales,
77+
basado en el algoritmo descrito aquí:
78+
https://learn.microsoft.com/azure/search/hybrid-search-ranking#how-rrf-ranking-works
7879
"""
79-
text_ids = {doc["id"] for doc in text_results}
80-
vector_ids = {doc["id"] for doc in vector_results}
81-
82-
combined_results = []
83-
for doc in text_results:
84-
if doc["id"] in vector_ids:
85-
combined_results.append((doc, alpha))
86-
else:
87-
combined_results.append((doc, 1 - alpha))
88-
for doc in vector_results:
89-
if doc["id"] not in text_ids:
90-
combined_results.append((doc, alpha))
91-
combined_results.sort(key=lambda x: x[1], reverse=True)
92-
return [doc for doc, _ in combined_results]
80+
scores = {}
81+
82+
for i, doc in enumerate(text_results):
83+
if doc["id"] not in scores:
84+
scores[doc["id"]] = 0
85+
scores[doc["id"]] += 1 / (i + k)
86+
for i, doc in enumerate(vector_results):
87+
if doc["id"] not in scores:
88+
scores[doc["id"]] = 0
89+
scores[doc["id"]] += 1 / (i + k)
90+
scored_documents = sorted(scores.items(), key=lambda x: x[1], reverse=True)
91+
retrieved_documents = [documents_by_id[doc_id] for doc_id, _ in scored_documents]
92+
return retrieved_documents
9393

9494

9595
def rerank(query, retrieved_documents):
@@ -108,13 +108,13 @@ def hybrid_search(query, limit):
108108
"""
109109
text_results = full_text_search(query, limit * 2)
110110
vector_results = vector_search(query, limit * 2)
111-
combined_results = reciprocal_rank_fusion(text_results, vector_results)
112-
combined_results = rerank(query, combined_results)
113-
return combined_results[:limit]
111+
fused_results = reciprocal_rank_fusion(text_results, vector_results)
112+
reranked_results = rerank(query, fused_results)
113+
return reranked_results[:limit]
114114

115115

116116
# Obtener la pregunta del usuario
117-
user_question = "gris y solitario"
117+
user_question = "cual insecta es gris y velloso?"
118118

119119
# Buscar la pregunta del usuario en el índice
120120
retrieved_documents = hybrid_search(user_question, limit=5)

0 commit comments

Comments
 (0)