
Commit 5bd0dd5

fix: support colbert fine-tune from pretrained model
1 parent a55316a commit 5bd0dd5

File tree

15 files changed: +271 -88 lines changed


Diff for: README.md

+2 -2

@@ -9,7 +9,7 @@
 [lint-image]: https://github.com/LongxingTan/open-retrievals/actions/workflows/lint.yml/badge.svg?branch=master
 [lint-url]: https://github.com/LongxingTan/open-retrievals/actions/workflows/lint.yml?query=branch%3Amaster
 [docs-image]: https://readthedocs.org/projects/open-retrievals/badge/?version=latest
-[docs-url]: https://open-retrievals.readthedocs.io/en/latest/?version=latest
+[docs-url]: https://open-retrievals.readthedocs.io/en/master/
 [coverage-image]: https://codecov.io/gh/longxingtan/open-retrievals/branch/master/graph/badge.svg
 [coverage-url]: https://codecov.io/github/longxingtan/open-retrievals?branch=master
 [contributing-image]: https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat
@@ -29,7 +29,7 @@
 [![Code Coverage][coverage-image]][coverage-url]
 [![Contributing][contributing-image]][contributing-url]

-**[Documentation](https://open-retrievals.readthedocs.io)** | **[中文](https://github.com/LongxingTan/open-retrievals/blob/master/README_zh-CN.md)** | **[日本語](https://github.com/LongxingTan/open-retrievals/blob/master/README_ja-JP.md)**
+**[Documentation](https://open-retrievals.readthedocs.io/en/master/)** | **[中文](https://github.com/LongxingTan/open-retrievals/blob/master/README_zh-CN.md)** | **[日本語](https://github.com/LongxingTan/open-retrievals/blob/master/README_ja-JP.md)**

 </div>

Diff for: README_ja-JP.md

+2 -2

@@ -9,7 +9,7 @@
 [lint-image]: https://github.com/LongxingTan/open-retrievals/actions/workflows/lint.yml/badge.svg?branch=master
 [lint-url]: https://github.com/LongxingTan/open-retrievals/actions/workflows/lint.yml?query=branch%3Amaster
 [docs-image]: https://readthedocs.org/projects/open-retrievals/badge/?version=latest
-[docs-url]: https://open-retrievals.readthedocs.io/en/latest/?version=latest
+[docs-url]: https://open-retrievals.readthedocs.io/en/master/
 [coverage-image]: https://codecov.io/gh/longxingtan/open-retrievals/branch/master/graph/badge.svg
 [coverage-url]: https://codecov.io/github/longxingtan/open-retrievals?branch=master
 [contributing-image]: https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat
@@ -29,7 +29,7 @@
 [![Code Coverage][coverage-image]][coverage-url]
 [![Contributing][contributing-image]][contributing-url]

-**[ドキュメント](https://open-retrievals.readthedocs.io)** | **[英語](https://github.com/LongxingTan/open-retrievals/blob/master/README.md)** | **[中文](https://github.com/LongxingTan/open-retrievals/blob/master/README_zh-CN.md)**
+**[ドキュメント](https://open-retrievals.readthedocs.io/en/master/)** | **[英語](https://github.com/LongxingTan/open-retrievals/blob/master/README.md)** | **[中文](https://github.com/LongxingTan/open-retrievals/blob/master/README_zh-CN.md)**
 </div>

 ![structure](./docs/source/_static/structure.png)

Diff for: README_zh-CN.md

+5 -5

@@ -9,7 +9,7 @@
 [lint-image]: https://github.com/LongxingTan/open-retrievals/actions/workflows/lint.yml/badge.svg?branch=master
 [lint-url]: https://github.com/LongxingTan/open-retrievals/actions/workflows/lint.yml?query=branch%3Amaster
 [docs-image]: https://readthedocs.org/projects/open-retrievals/badge/?version=latest
-[docs-url]: https://open-retrievals.readthedocs.io/en/latest/?version=latest
+[docs-url]: https://open-retrievals.readthedocs.io/en/master/
 [coverage-image]: https://codecov.io/gh/longxingtan/open-retrievals/branch/master/graph/badge.svg
 [coverage-url]: https://codecov.io/github/longxingtan/open-retrievals?branch=master
 [contributing-image]: https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat
@@ -29,7 +29,7 @@
 [![Code Coverage][coverage-image]][coverage-url]
 [![Contributing][contributing-image]][contributing-url]

-**[中文wiki](https://github.com/LongxingTan/open-retrievals/wiki)** | **[英文文档](https://open-retrievals.readthedocs.io)**
+**[中文wiki](https://github.com/LongxingTan/open-retrievals/wiki)** | **[英文文档](https://open-retrievals.readthedocs.io/en/master/)**
 </div>

 ![structure](./docs/source/_static/structure.png)
@@ -80,8 +80,8 @@ from retrievals import AutoModelForEmbedding

 sentences = [
     "在1974年,第一次在东南亚打自由搏击就得了冠军",
-    "1982年打赢了日本重炮手雷龙,接着连续三年打败所有日本空手道高手,赢得全日本自由搏击冠军",
     "中国古拳法唯一传人鬼王达,被喻为空手道的克星,绰号魔鬼筋肉人",
+    "1982年打赢了日本重炮手雷龙,接着连续三年打败所有日本空手道高手,赢得全日本自由搏击冠军",
     "古人有云,有功夫,无懦夫"
 ]

@@ -97,12 +97,12 @@ print(scores.tolist())
 from retrievals import AutoModelForEmbedding, AutoModelForRetrieval

 index_path = './database/faiss/faiss.index'
-sentences = ['A dog is chasing car.', 'A man is playing a guitar.']
+sentences = ['在中国是中国人', '在美国是美国人', '2000人民币大于3000美元']
 model_name_or_path = "sentence-transformers/all-MiniLM-L6-v2"
 model = AutoModelForEmbedding.from_pretrained(model_name_or_path)
 model.build_index(sentences, index_path=index_path)

-query_embed = model.encode("He plays guitar.")
+query_embed = model.encode("在加拿大是加拿大人")
 matcher = AutoModelForRetrieval()
 dists, indices = matcher.search(query_embed, index_path=index_path)
 print(indices)

Diff for: docs/source/embed.rst

+17 -22

@@ -3,50 +3,45 @@ Embedding

 .. _embed:

-Use embedding from open-retrievals
+1. Use embedding from open-retrievals
 ---------------------------------------

 we can use `AutoModelForEmbedding` to get the sentence embedding from pretrained transformer or large language model.

 The Transformer model could get the representation vector from a sentence.

-
-.. epigraph::
-   :align: left
-
 Choose the right `pooling_method` when use the pretrained embedding, check in `huggingface <https://huggingface.co/models>`_


-Fine-tune
+2. Fine-tune
 ------------------

-point-wise
+- point-wise

-- `{(query, label), (document, label)}`
+`{(query, label), (document, label), ...}`


-pairwise
+- pairwise

-- `{(query, positive, label), (query, negative, label)}`
+`{(query, positive, negative), {query, positive, negative}, ...}`

-- `{(query, positive, negative), {query, positive, negative}}`
+`{(query, positive, negative1, negative2, negative3), (query, positive, negative1, negative2, negative3), ...}`

-- `{(query, positive, negative1, negative2, negative3...)}`
+`{(query, positive, label), (query, negative, label), ...}`

-listwise
-
-- `{(query+positive)}`
+- listwise


 Loss function
 ~~~~~~~~~~~~~~~~~~~~~~

-- binary classification:
-- similarity(query, positive) > similarity(query, negative)
-- hinge loss: max(0, similarity(query, positive) - similarity(query, negative) + margin)
-- logistic loss: logistic(similarity(query, positive) - similarity(query, negative))
-- multi-label classification:
-- similarity(query, positive), similarity(query, negative1), similarity(query, negative2)
+binary classification:
+- similarity(query, positive) > similarity(query, negative)
+- hinge loss: max(0, similarity(query, positive) - similarity(query, negative) + margin)
+- logistic loss: logistic(similarity(query, positive) - similarity(query, negative))
+
+multi-label classification:
+- similarity(query, positive), similarity(query, negative1), similarity(query, negative2)


 Pair wise
@@ -112,7 +107,7 @@ arcface
 List wise
 ~~~~~~~~~~~~~~

-Training skills
+3. Training skills
 -----------------------------------

 multiple gpus
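
A note on the loss functions listed in the hunk above: the pairwise hinge and logistic losses can be sketched in a few lines of plain PyTorch. The snippet below is illustrative only; the function names, the margin value, and the toy tensors are assumptions made for this note rather than loss classes shipped by open-retrievals, and the hinge form follows the usual convention of requiring the positive to outscore the negative by a margin.

import torch
import torch.nn.functional as F

def hinge_pairwise_loss(pos_scores: torch.Tensor, neg_scores: torch.Tensor, margin: float = 1.0) -> torch.Tensor:
    # penalize pairs where the positive does not beat the negative by at least `margin`
    return torch.clamp(margin - pos_scores + neg_scores, min=0).mean()

def logistic_pairwise_loss(pos_scores: torch.Tensor, neg_scores: torch.Tensor) -> torch.Tensor:
    # -log sigmoid(s_pos - s_neg), written as softplus(s_neg - s_pos)
    return F.softplus(neg_scores - pos_scores).mean()

# toy usage with cosine similarities between query/positive/negative embeddings
query = F.normalize(torch.randn(8, 768), dim=-1)
positive = F.normalize(torch.randn(8, 768), dim=-1)
negative = F.normalize(torch.randn(8, 768), dim=-1)
pos_scores = (query * positive).sum(-1)
neg_scores = (query * negative).sum(-1)
print(hinge_pairwise_loss(pos_scores, neg_scores).item(), logistic_pairwise_loss(pos_scores, neg_scores).item())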

Diff for: docs/source/index.rst

+8 -2

@@ -38,9 +38,15 @@ Run a simple example

 .. code-block:: python

-    import retrievals
+    from retrievals import AutoModelForEmbedding

+    sentences = ["Hello NLP", "Open-retrievals is designed for retrieval, rerank and RAG"]
+    model_name_or_path = "sentence-transformers/all-MiniLM-L6-v2"
+    model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="mean")
+    sentence_embeddings = model.encode(sentences, normalize_embeddings=True, convert_to_tensor=True)
+    print(sentence_embeddings)

+Open-retrievals support to fine-tune the embedding model, reranking model, llm easily for custom usage.

 * `Pairwise embedding fine-tuning <https://github.com/LongxingTan/open-retrievals/blob/master/examples/embedding_pairwise_finetune.py>`_
 * `Pairwise LLM embedding fine-tuning <https://github.com/LongxingTan/open-retrievals/blob/master/examples/embedding_llm_finetune.py>`_
@@ -49,7 +55,7 @@ Run a simple example
 * `LLM reranking fine-tuning <https://github.com/LongxingTan/open-retrievals/blob/master/examples/rerank_llm_finetune.py>`_


-More datasets
+More datasets examples

 * `T2 ranking dataset <https://github.com/LongxingTan/open-retrievals/tree/master/examples/t2_ranking>`_
 * `scifact dataset <https://github.com/LongxingTan/open-retrievals/tree/master/examples/scifact>`_

Diff for: docs/source/quick-start.rst

+64 -8

@@ -5,6 +5,10 @@ Quick start

 We can easily use Open-retrievals to fine-tune the model easily for information retrieval and RAG application.

+.. image:: https://colab.research.google.com/assets/colab-badge.svg
+   :target: https://colab.research.google.com/drive/1-WBMisdWLeHUKlzJ2DrREXY_kSV8vjP3?usp=sharing
+   :alt: Open In Colab
+

 1. Embedding
 -----------------------------
@@ -15,18 +19,26 @@ We can use the pretrained embedding easily from transformers or sentence-transfo

 from retrievals import AutoModelForEmbedding

-sentences = ["Hello NLP", "Open-retrievals is designed for retrieval, rerank and RAG"]
-model_name_or_path = "sentence-transformers/all-MiniLM-L6-v2"
+sentences = [
+    'query: how much protein should a female eat',
+    'query: summit define',
+    "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. ",
+    "passage: Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level."
+]
+model_name_or_path = 'intfloat/e5-base-v2'
+# sentence embedding mode
 model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="mean")
-sentence_embeddings = model.encode(sentences, normalize_embeddings=True, convert_to_tensor=True)
-print(sentence_embeddings)
+# encode the sentence to embedding vector
+embeddings = model.encode(sentences, normalize_embeddings=True, convert_to_tensor=True)
+scores = (embeddings[:2] @ embeddings[2:].T) * 100
+print(scores.tolist())

 .. code::

-output
+    [[89.92379760742188, 68.0742416381836], [68.93356323242188, 91.32250213623047]]


-Embedding fine-tuned
+Fine-tune embedding
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 If we want to further improve the retrieval performance, an optional method is to fine tune the embedding model weights. It will project the vector of query and answer to similar representation space.
@@ -99,15 +111,59 @@ If we have multiple retrieval source or a better sequence, we can add the rerank

 from retrievals import AutoModelForRanking

+sentences = [
+    ["In 1974, I won the championship in Southeast Asia in my first kickboxing match", "In 1982, I defeated the heavy hitter Ryu Long."],
+    ['A dog is chasing car.', 'A man is playing a guitar.'],
+]
+
 model_name_or_path: str = "BAAI/bge-reranker-base"
 rerank_model = AutoModelForRanking.from_pretrained(model_name_or_path)
-scores_list = rerank_model.compute_score(["In 1974, I won the championship in Southeast Asia in my first kickboxing match", "In 1982, I defeated the heavy hitter Ryu Long."])
+scores_list = rerank_model.compute_score(sentences)
 print(scores_list)

+.. code::

-Rerank fine-tuned
+    [-5.075257778167725, -10.194067001342773]
+
+
+Fine-tune reranking
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+.. code-block:: python
+
+    from transformers import AutoTokenizer, TrainingArguments, get_cosine_schedule_with_warmup, AdamW
+    from retrievals import RerankCollator, AutoModelForRanking, RerankTrainer, RerankTrainDataset
+
+    model_name_or_path: str = "microsoft/deberta-v3-base"
+    max_length: int = 128
+    learning_rate: float = 3e-5
+    batch_size: int = 4
+    epochs: int = 3
+
+    train_dataset = RerankTrainDataset('./t2rank.json', positive_key='pos', negative_key='neg')
+    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)
+    model = AutoModelForRanking.from_pretrained(model_name_or_path)
+    optimizer = AdamW(model.parameters(), lr=learning_rate)
+    num_train_steps = int(len(train_dataset) / batch_size * epochs)
+    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_steps, num_training_steps=num_train_steps)
+
+    training_args = TrainingArguments(
+        learning_rate=learning_rate,
+        per_device_train_batch_size=batch_size,
+        num_train_epochs=epochs,
+        output_dir='./checkpoints',
+        remove_unused_columns=False,
+    )
+    trainer = RerankTrainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        data_collator=RerankCollator(tokenizer, max_length=max_length),
+    )
+    trainer.optimizer = optimizer
+    trainer.scheduler = scheduler
+    trainer.train()
+

 4. RAG
 -----------------------------

Diff for: docs/source/rerank.rst

+26 -9

@@ -3,22 +3,30 @@ Rerank

 .. _rerank:

-Use Rerank from open-retrievals
-------------------------------------
+1. Use reranking from open-retrievals
+-------------------------------------------

 .. code-block:: python

     from retrievals import AutoModelForRanking

+    sentences = [
+        ["In 1974, I won the championship in Southeast Asia in my first kickboxing match", "In 1982, I defeated the heavy hitter Ryu Long."],
+        ['A dog is chasing car.', 'A man is playing a guitar.'],
+    ]
+
     model_name_or_path: str = "BAAI/bge-reranker-base"
     rerank_model = AutoModelForRanking.from_pretrained(model_name_or_path)
-    scores_list = rerank_model.compute_score(["In 1974, I won the championship in Southeast Asia in my first kickboxing match", "In 1982, I defeated the heavy hitter Ryu Long."])
+    scores_list = rerank_model.compute_score(sentences)
     print(scores_list)

+.. code::
+
+    [-5.075257778167725, -10.194067001342773]


-Fine tuning Cross-encoder
-----------------------------
+2. Fine-tune cross-encoder reranking model
+-----------------------------------------------

 .. image:: https://colab.research.google.com/assets/colab-badge.svg
    :target: https://colab.research.google.com/drive/1QvbUkZtG56SXomGYidwI4RQzwODQrWNm?usp=sharing
@@ -61,12 +69,21 @@ Fine tuning Cross-encoder
     trainer.train()


-Fine tuning ColBERT
-----------------------------
+3. Fine-tune ColBERT reranking model
+----------------------------------------
+
+.. image:: https://colab.research.google.com/assets/colab-badge.svg
+   :target: https://colab.research.google.com/drive/1QVtqhQ080ZMltXoJyODMmvEQYI6oo5kO?usp=sharing
+   :alt: Open In Colab


-Fine tuning LLM ranker
-----------------------------
+4. Fine-tune LLM reranker
+-------------------------------------
+
+.. image:: https://colab.research.google.com/assets/colab-badge.svg
+   :target: https://colab.research.google.com/drive/1fzq1iV7-f8hNKFnjMmpVhVxadqPb9IXk?usp=sharing
+   :alt: Open In Colab
+

 - Point-wise style prompt:

Diff for: examples/README.md

+12

@@ -16,11 +16,20 @@ Check the whole pipeline examples
 ## Embedding

 **Data Format**
+
+- In-batch negative fine-tuning
+```
+{'query': TEXT_TYPE, 'positive': List[TEXT_TYPE]}
+...
+```
+
+- Hard negative (+ In-batch negative) fine-tuning
 ```
 {'query': TEXT_TYPE, 'positive': List[TEXT_TYPE], 'negative': List[TEXT_TYPE]}
 ...
 ```

+
 **Pairwise embedding finetune**
 ```shell
 MODEL_NAME="BAAI/bge-base-zh-v1.5"
@@ -216,3 +225,6 @@ The grad_norm during training is always zero?
 The fine-tuned embedding performance during inference is worse than original?
 - check whether the pooling_method is correct
 - check whether the prompt is the same as training for LLM model
+
+How can we fine-tune the `BAAI/bge-m3` ColBERT model?
+- download the weights first using `snapshot_download` from huggingface_hub to model_dir, then use ColBERT.from_pretrained(model_dir)
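
A rough sketch of that FAQ answer, which is the point of this commit (fine-tuning ColBERT from a pretrained checkpoint): the snippet below is illustrative only; the `retrievals` import path for `ColBERT` and the local directory name are assumptions rather than a verified recipe from this repository.

# hypothetical sketch: download the BAAI/bge-m3 weights to a local directory,
# then load them with ColBERT.from_pretrained as the FAQ above suggests
from huggingface_hub import snapshot_download
from retrievals import ColBERT  # assumed import path

model_dir = snapshot_download(repo_id="BAAI/bge-m3", local_dir="./bge-m3")
colbert = ColBERT.from_pretrained(model_dir)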

Diff for: src/retrievals/models/embedding_auto.py

+1 -2

@@ -9,8 +9,7 @@
 import torch
 import torch.nn as nn
 from torch.utils.data import DataLoader
-from tqdm.auto import tqdm
-from tqdm.autonotebook import trange
+from tqdm.auto import tqdm, trange
 from transformers import (
     AutoConfig,
     AutoModel,
