Commit 9094bca

fix: t2_ranking example
1 parent aceef3a commit 9094bca

23 files changed: +291 −56 lines

.gitignore (+2 −7)

@@ -82,7 +82,7 @@ coverage.xml
 .hypothesis/
 .pytest_cache/

-# yuetan
+# specific
 **/nohup.out
 /reference/*
 /examples/data/*
@@ -92,12 +92,7 @@ coverage.xml
 /data/*.zip
 /data/raw/*
 /data/web/*
-/weights/scaler.pkl
-/weights/saved_model.pb
-/weights/variables/*
-/weights/checkpoint
-/weights/checkpoint.data-00000-of-00001
-/weights/checkpoint.index
+/weights/*
 /conda/*
 **/.pdf
 /encode.py

README_zh-CN.md (+1 −1)

@@ -37,7 +37,7 @@
 **Open-Retrievals** unifies embedding, retrieval, and reranking, helping developers conveniently optimize information retrieval, LLM RAG, and related applications
 - Supports full embedding fine-tuning: contrastive learning, LLM-based, point-wise, pairwise, listwise
 - Supports full reranking fine-tuning: cross encoder, ColBERT, LLM
-- Supports customized RAG, with easy use of the fine-tuned models in Transformers, Langchain, LlamaIndex
+- Supports customized, modular RAG, with easy use of the fine-tuned models in Transformers, Langchain, LlamaIndex

 | Experiment | Model | Size | Original score | Fine-tuned score | Demo code |
 |------------|-------|------|----------------|------------------|-----------|

codecov.yml (+1 −1)

@@ -5,7 +5,7 @@ coverage:
   status:
     project:
       default:
-        threshold: 2%
+        threshold: 3%

     patch:
       default:

docs/source/embed.rst (+9 −2)

@@ -124,5 +124,12 @@ offline hard mining
 online hard mining


-Ensemble embedding
-~~~~~~~~~~~~~~~~~~~~~~
+Matryoshka Representation Learning
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+Contrastive loss
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+cosent loss
+- similar to circle loss, but with cosine
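
The `cosent loss` stub added above gets only a one-line description. For reference, a minimal standalone PyTorch sketch of the CoSENT objective (this is not open-retrievals' own implementation; `scale` plays the usual temperature-like role):

```python
import torch

def cosent_loss(cos_scores: torch.Tensor, labels: torch.Tensor, scale: float = 20.0) -> torch.Tensor:
    """CoSENT: log(1 + sum of exp(scale * (s_i - s_j))) over all pairs (i, j)
    with label_i < label_j, i.e. a less-similar pair out-scored a more-similar one."""
    scores = cos_scores * scale
    diff = scores[:, None] - scores[None, :]           # diff[i, j] = s_i - s_j
    keep = (labels[:, None] < labels[None, :]).float()
    diff = diff - (1.0 - keep) * 1e12                  # mask out non-qualifying (i, j)
    diff = torch.cat([torch.zeros(1, device=diff.device), diff.flatten()])
    return torch.logsumexp(diff, dim=0)                # the prepended 0 supplies the "+1"
```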

docs/source/rag.rst (+3 −0)

@@ -105,6 +105,9 @@ Enhance RAG Performance
 * Meta data of documents


+Graph RAG
+-------------------
+

 pdf parse
 --------------

docs/source/retrieval.rst (+17 −0)

@@ -6,6 +6,23 @@ Offline indexing
 ----------------------------


+Ensemble retrieval
+---------------------
+
+We can use `RRF_fusion` to ensemble multiple retrievers and improve retrieval performance.
+

 Query retrieval
 ----------------------------
+
+
+Faiss retrieval
+-----------------------
+
+
+BM25 retrieval
+-----------------------
+
+
+Elastic search retrieval
+---------------------------
examples/README.md (+1 −1)

@@ -7,7 +7,7 @@
 - [rerank-llm finetune](rerank_llm_finetune.py)
 - [RAG with Langchain](./rag_langchain_demo.py)

-Check the whole pipeline
+Check the whole pipeline examples
 - [t2-ranking dataset](./t2_ranking/README.md)
 - [scifact dataset](./scifact/README.md)
examples/eval/README.md (+3 −32)

@@ -6,38 +6,9 @@ pip install datasets mteb[beir]
 pip install open-retrievals[eval]
 ```

-
-```python
-from typing import List, Union, Dict
-import numpy as np
-from retrievals import AutoModelForEmbedding
-
-
-class AutoModelForEmbeddingEval(AutoModelForEmbedding):
-    def __init__(self, **kwargs):
-        super(AutoModelForEmbeddingEval, self).__init__(**kwargs)
-
-    def encode_queries(self, queries: List[str], **kwargs) -> np.ndarray:
-        """For MTEB eval
-        This function will be used for retrieval task
-        if there is an instruction for queries, we will add it to the query text
-        """
-        if self.query_instruction is not None:
-            input_texts = ['{}{}'.format(self.query_instruction, q) for q in queries]
-        else:
-            input_texts = queries
-        return self.encode_from_text(input_texts, batch_size=4)
-
-    def encode_corpus(self, corpus: List[Union[Dict[str, str], str]], **kwargs) -> np.ndarray:
-        """For MTEB eval
-        This function will be used for retrieval task
-        encode corpus for retrieval task
-        """
-        if isinstance(corpus[0], dict):
-            input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
-        else:
-            input_texts = corpus
-        return self.encode_from_text(input_texts, batch_size=4)
+**Eval**
+```shell
+python run_eval.py --model_name stella-base-zh --output_dir ./zh_results/stella-base
 ```
examples/eval/run_eval.py (+81)

@@ -0,0 +1,81 @@
+"""Evaluation of embedding model"""
+
+import argparse
+import functools
+import random
+from typing import Dict, List
+
+import numpy as np
+import torch
+
+# from C_MTEB.tasks import *
+from mteb import MTEB, DRESModel
+
+from retrievals import AutoModelForEmbedding
+
+TASKS_WITH_PROMPTS = [
+    "T2Retrieval",
+    "MMarcoRetrieval",
+    "DuRetrieval",
+    "CovidRetrieval",
+    "CmedqaRetrieval",
+    "EcomRetrieval",
+    "MedicalRetrieval",
+    "VideoRetrieval",
+]
+
+parser = argparse.ArgumentParser(description='evaluation for CMTEB')
+parser.add_argument('--model_name', default='bert-base-uncased', type=str, help='which model to use')
+parser.add_argument('--output_dir', default='zh_results/', type=str, help='output directory')
+parser.add_argument('--max_len', default=512, type=int, help='max length')
+
+args = parser.parse_args()
+
+
+class RetrievalModel(DRESModel):
+    def __init__(self, encoder, query_instruction='', document_instruction='', **kwargs):
+        self.encoder = encoder
+        self.query_instruction = query_instruction
+        self.document_instruction = document_instruction
+
+    def encode_queries(self, queries: List[str], **kwargs) -> np.ndarray:
+        """For MTEB eval
+        This function will be used for retrieval task
+        if there is an instruction for queries, we will add it to the query text
+        """
+        input_texts = [self.query_instruction + q for q in queries]
+        return self._do_encode(input_texts)
+
+    def encode_corpus(self, corpus: List[Dict[str, str]], **kwargs) -> np.ndarray:
+        """For MTEB eval
+        This function will be used for retrieval task
+        encode corpus for retrieval task
+        """
+        if isinstance(corpus[0], dict):
+            input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
+        else:
+            input_texts = corpus
+
+        input_texts = [self.document_instruction + t for t in input_texts]
+        return self._do_encode(input_texts)
+
+    @torch.no_grad()
+    def _do_encode(self, input_texts: List[str]) -> np.ndarray:
+        return self.encoder.encode(
+            sentences=input_texts, batch_size=256, normalize_embeddings=True, convert_to_numpy=True
+        )
+
+
+if __name__ == '__main__':
+    encoder = AutoModelForEmbedding.from_pretrained(args.model_name)
+    encoder.encode = functools.partial(encoder.encode, normalize_embeddings=True)
+
+    task_names = [t.description["name"] for t in MTEB(task_langs=['zh', 'zh-CN']).tasks]
+    random.shuffle(task_names)
+
+    for task in task_names:
+        evaluation = MTEB(tasks=[task], task_langs=['zh', 'zh-CN'])
+        if task in TASKS_WITH_PROMPTS:
+            evaluation.run(RetrievalModel(encoder), output_folder=args.output_dir, overwrite_results=False)
+        else:
+            evaluation.run(encoder, output_folder=args.output_dir, overwrite_results=False)
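
For retrieval tasks with prompts, the script wraps the raw encoder in `RetrievalModel` so instructions can be prepended; a hypothetical instantiation with explicit instruction strings (the values below are illustrative, not from the commit):

```python
# RetrievalModel is defined in run_eval.py above; instruction values are illustrative
model = RetrievalModel(
    encoder,
    query_instruction='Query: ',       # prepended to every query
    document_instruction='Passage: ',  # prepended to every corpus text
)
```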

examples/msmacro/README.md (+16)

@@ -0,0 +1,16 @@
+# msmacro
+
+## Download the data
+- [msmacro data](https://microsoft.github.io/msmarco/Datasets.html)
+
+```shell
+sh download_data.sh
+```
+
+## Prepare data
+```shell
+python prepare_data.py
+```
+
+
+## Evaluation
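
The `## Evaluation` section is still empty in this commit. For reference, MS MARCO passage ranking is conventionally reported as MRR@10; a minimal generic scorer, not the repo's script (`results` maps query id to a ranked passage-id list, `qrels` to the set of relevant ids):

```python
from typing import Dict, List, Set

def mrr_at_10(results: Dict[str, List[str]], qrels: Dict[str, Set[str]]) -> float:
    """Mean reciprocal rank of the first relevant passage within the top 10."""
    total = 0.0
    for qid, ranked in results.items():
        for rank, pid in enumerate(ranked[:10], start=1):
            if pid in qrels.get(qid, set()):
                total += 1.0 / rank
                break
    return total / max(len(results), 1)
```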

examples/msmacro/download_data.sh (+9)

@@ -0,0 +1,9 @@
+
+wget https://msmarco.z22.web.core.windows.net/msmarcoranking/collectionandqueries.tar.gz
+wget https://msmarco.z22.web.core.windows.net/msmarcoranking/triples.train.small.tar.gz
+wget https://msmarco.z22.web.core.windows.net/msmarcoranking/top1000.eval.tar.gz
+
+tar -xzvf top1000.eval.tar.gz
+tar -xzvf triples.train.small.tar.gz
+tar -xzvf collectionandqueries.tar.gz
+rm *.gz

examples/t2_ranking/README.md (+7 −10)

@@ -18,28 +18,25 @@ python prepare_t2ranking_data.py
 ## 2. Finetune embedding

 ```shell
-sh pairwise_embed_train.sh
+sh embed_pairwise_train.sh
 ```

-## Indexing
-Encode corpus
 ```shell
-sh encode_corpus.sh
+sh embed_llm_train.sh
 ```

-Encode Query
+
+## Rerank
 ```shell
-sh encode_query.sh
+sh rerank_cross_encoder.sh
 ```

-## Retrieve
 ```shell
-sh retrieve.sh
+sh rerank_colbert.sh
 ```

-## Rerank
 ```shell
-sh rerank.sh
+sh rerank_llm.sh
 ```

 ## Evaluate
examples/t2_ranking/embed_llm_train.sh (+31)

@@ -0,0 +1,31 @@
+MODEL_NAME="Qwen/Qwen2-1.5B-Instruct"
+TRAIN_DATA="/t2_ranking.jsonl"
+OUTPUT_DIR="/t2_output"
+
+torchrun --nproc_per_node 1 \
+  -m retrievals.pipelines.embed \
+  --output_dir $OUTPUT_DIR \
+  --overwrite_output_dir \
+  --model_name_or_path $MODEL_NAME \
+  --pooling_method last \
+  --do_train \
+  --data_name_or_path $TRAIN_DATA \
+  --positive_key positive \
+  --negative_key negative \
+  --use_lora True \
+  --query_instruction "Retrieve the possible answer for query.\nQuery: " \
+  --document_instruction 'Document: ' \
+  --learning_rate 1e-4 \
+  --bf16 \
+  --num_train_epochs 3 \
+  --per_device_train_batch_size 4 \
+  --gradient_accumulation_steps 16 \
+  --dataloader_drop_last True \
+  --query_max_length 64 \
+  --document_max_length 256 \
+  --train_group_size 4 \
+  --logging_strategy steps \
+  --logging_steps 100 \
+  --temperature 0.02 \
+  --use_inbatch_negative false \
+  --save_total_limit 1
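
After training, the fine-tuned LLM embedder can be loaded for encoding. A sketch under the assumption that `AutoModelForEmbedding.from_pretrained` accepts the same pooling and instruction settings as the training flags above:

```python
from retrievals import AutoModelForEmbedding

# assumption: from_pretrained mirrors the training flags (pooling_method, query_instruction)
model = AutoModelForEmbedding.from_pretrained(
    "/t2_output",  # OUTPUT_DIR from the script above
    pooling_method="last",
    query_instruction="Retrieve the possible answer for query.\nQuery: ",
)
embeddings = model.encode(["如何缓解干眼症?"], normalize_embeddings=True)
```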

examples/t2_ranking/rerank.sh (whitespace-only changes)

examples/t2_ranking/rerank_colbert.sh (+26)

@@ -0,0 +1,26 @@
+MODEL_NAME='hfl/chinese-roberta-wwm-ext'
+TRAIN_DATA="t2_ranking.jsonl"
+OUTPUT_DIR="t2_output"
+
+torchrun --nproc_per_node 1 \
+  --module retrievals.pipelines.rerank \
+  --output_dir $OUTPUT_DIR \
+  --overwrite_output_dir \
+  --model_name_or_path $MODEL_NAME \
+  --tokenizer_name $MODEL_NAME \
+  --model_type colbert \
+  --do_train \
+  --data_name_or_path $TRAIN_DATA \
+  --positive_key positive \
+  --negative_key negative \
+  --learning_rate 5e-5 \
+  --bf16 \
+  --num_train_epochs 5 \
+  --per_device_train_batch_size 32 \
+  --dataloader_drop_last True \
+  --max_length 256 \
+  --train_group_size 4 \
+  --unfold_each_positive false \
+  --save_total_limit 1 \
+  --logging_steps 100 \
+  --use_inbatch_negative False
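
ColBERT reranks via late interaction: each query token is matched against its best document token and the maxima are summed. A minimal MaxSim sketch (generic, not the library's internals):

```python
import torch

def colbert_score(q: torch.Tensor, d: torch.Tensor) -> torch.Tensor:
    """MaxSim late interaction. q: (num_query_tokens, dim), d: (num_doc_tokens, dim),
    both rows L2-normalized token embeddings."""
    sim = q @ d.T                       # pairwise token cosine similarities
    return sim.max(dim=1).values.sum()  # best doc token per query token, then sum
```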
examples/t2_ranking/rerank_cross_encoder.sh (+22)

@@ -0,0 +1,22 @@
+MODEL_NAME="BAAI/bge-reranker-base"
+TRAIN_DATA="t2_ranking.jsonl"
+OUTPUT_DIR="t2_rank_output"
+
+torchrun --nproc_per_node 1 \
+  -m retrievals.pipelines.rerank \
+  --output_dir $OUTPUT_DIR \
+  --overwrite_output_dir \
+  --model_name_or_path $MODEL_NAME \
+  --model_type cross-encoder \
+  --do_train \
+  --data_name_or_path $TRAIN_DATA \
+  --positive_key positive \
+  --negative_key negative \
+  --learning_rate 2e-5 \
+  --fp16 \
+  --num_train_epochs 3 \
+  --per_device_train_batch_size 64 \
+  --dataloader_drop_last True \
+  --max_length 512 \
+  --save_total_limit 1 \
+  --logging_steps 100
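
A cross-encoder scores each (query, passage) pair jointly in one forward pass. A generic scoring sketch with plain transformers on the trained checkpoint (the loading path is illustrative, not necessarily the library's own helper):

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t2_rank_output")  # OUTPUT_DIR above
model = AutoModelForSequenceClassification.from_pretrained("t2_rank_output")

pairs = [("什么是土壤污染?", "土壤污染是指污染物进入土壤并不断累积的现象。")]
inputs = tokenizer(
    [q for q, _ in pairs], [p for _, p in pairs],
    padding=True, truncation=True, max_length=512, return_tensors="pt",
)
with torch.no_grad():
    scores = model(**inputs).logits.squeeze(-1)  # higher score = more relevant
```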

examples/t2_ranking/rerank_llm.sh (+29)

@@ -0,0 +1,29 @@
+MODEL_NAME="Qwen/Qwen2-1.5B-Instruct"
+TRAIN_DATA="t2_ranking.jsonl"
+OUTPUT_DIR="t2_output"
+
+torchrun --nproc_per_node 1 \
+  -m retrievals.pipelines.rerank \
+  --output_dir ${OUTPUT_DIR} \
+  --overwrite_output_dir \
+  --model_name_or_path $MODEL_NAME \
+  --model_type llm \
+  --causal_lm True \
+  --use_lora True \
+  --data_name_or_path $TRAIN_DATA \
+  --task_prompt "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'." \
+  --query_instruction "A: " \
+  --document_instruction 'B: ' \
+  --positive_key positive \
+  --negative_key negative \
+  --learning_rate 2e-4 \
+  --num_train_epochs 3 \
+  --per_device_train_batch_size 4 \
+  --gradient_accumulation_steps 16 \
+  --dataloader_drop_last True \
+  --max_len 256 \
+  --train_group_size 4 \
+  --logging_steps 10 \
+  --save_steps 20000 \
+  --save_total_limit 1 \
+  --bf16
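
With a yes/no task prompt like the one above, the usual scoring scheme for a causal-LM reranker takes the logit of the "Yes" token at the last position. A generic sketch of that idea (not the library's exact code; it assumes "Yes" is a single token in the tokenizer's vocabulary):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-1.5B-Instruct")

prompt = ("Given a query A and a passage B, determine whether the passage contains "
          "an answer to the query by providing a prediction of either 'Yes' or 'No'.\n"
          "A: 什么是深度学习?\nB: 深度学习是机器学习的一个分支。\n")
inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits[:, -1, :]        # next-token logits at the last position
yes_id = tokenizer.convert_tokens_to_ids("Yes")      # assumes "Yes" is a single vocab token
score = logits[0, yes_id].item()                     # higher = more relevant
```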

examples/t2_ranking/retrieve.sh (whitespace-only changes)
