1
- # pip install sentence-transformers
2
1
import json
3
2
import os
4
3
@@ -72,24 +71,25 @@ def cosine_similarity(a, b):
72
71
return retrieved_documents
73
72
74
73
75
def reciprocal_rank_fusion(text_results, vector_results, k=60):
    """
    Fuse full-text and vector search results with Reciprocal Rank Fusion (RRF).

    Each document receives ``1 / (rank + k)`` for every result list it
    appears in, where ``rank`` is its 1-based position in that list. The
    per-list contributions are summed, so a document found by both searches
    outranks one found at the same position by only one of them.

    Algorithm reference:
    https://learn.microsoft.com/azure/search/hybrid-search-ranking#how-rrf-ranking-works

    Args:
        text_results: Ranked documents (best first) from the full-text
            search; each item must expose an ``"id"`` key.
        vector_results: Ranked documents (best first) from the vector
            search; each item must expose an ``"id"`` key.
        k: Smoothing constant added to the rank. Larger values flatten the
            gap between top-ranked and lower-ranked documents.

    Returns:
        The fused documents ordered by descending RRF score. Documents are
        looked up by id in ``documents_by_id``, which is expected to be
        defined at module level outside this block.
    """
    scores = {}
    for results in (text_results, vector_results):
        # Ranks are 1-based, as in the RRF formula. A 0-based index would
        # shift every score by one position and divide by zero when k == 0.
        for rank, doc in enumerate(results, start=1):
            doc_id = doc["id"]
            scores[doc_id] = scores.get(doc_id, 0) + 1 / (rank + k)

    # sorted() is stable, so documents with equal scores keep first-seen order.
    scored_documents = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return [documents_by_id[doc_id] for doc_id, _ in scored_documents]
93
93
94
94
95
95
def rerank (query , retrieved_documents ):
@@ -108,13 +108,13 @@ def hybrid_search(query, limit):
108
108
"""
109
109
text_results = full_text_search (query , limit * 2 )
110
110
vector_results = vector_search (query , limit * 2 )
111
- combined_results = reciprocal_rank_fusion (text_results , vector_results )
112
- combined_results = rerank (query , combined_results )
113
- return combined_results [:limit ]
111
+ fused_results = reciprocal_rank_fusion (text_results , vector_results )
112
+ reranked_results = rerank (query , fused_results )
113
+ return reranked_results [:limit ]
114
114
115
115
116
116
# Obtener la pregunta del usuario
117
- user_question = "gris y solitario "
117
+ user_question = "cual insecta es gris y velloso? "
118
118
119
119
# Buscar la pregunta del usuario en el índice
120
120
retrieved_documents = hybrid_search (user_question , limit = 5 )
0 commit comments