Skip to content

Commit 42c7ce1

Browse files
committed
add colbert
add colbert embedding
1 parent 695419d commit 42c7ce1

23 files changed

+184
-227
lines changed

src/embeddings/bert/README.md

-1
This file was deleted.
+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Use an official Python runtime as a parent image
FROM python:3.9-slim

# All relative paths below (downloads, clone, COPY targets) resolve under /app.
WORKDIR /app

# Install every system package in a single layer and drop the apt lists in the
# same layer so they do not end up in the image. git is needed for the clone
# below; wget is kept because the original image installed it.
RUN apt-get update && apt-get install -y \
    build-essential \
    git \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Install requirements
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt

# Download necessary files (gdown comes from requirements.txt).
# With WORKDIR /app the file is written to /app/Testing1.csv.
RUN gdown "https://drive.google.com/uc?id=1VlLcGWmDKAoK3aUthVXOFxzOdgzf-SNo" -O Testing1.csv

# Clone necessary repositories (lands in /app/colbert_agri_embeddings).
# NOTE(review): Hugging Face repositories commonly store large files through
# Git LFS; confirm the index files are fully fetched without git-lfs installed.
RUN git clone https://huggingface.co/GautamR/colbert_agri_embeddings

# Copy the rest of the application code to the working directory
COPY . /app/
EXPOSE 8000
# Set the entrypoint for the container
CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]
+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Re-export the package's public names.
from .request import ModelRequest
# Model is defined in model.py; the request module only defines ModelRequest,
# so importing Model from .request would raise ImportError.
from .model import Model

src/embeddings/colbert/local/api.py

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from model import Model
2+
from request import ModelRequest
3+
from quart import Quart, request
4+
import aiohttp
5+
import pandas as pd
6+
import gdown
7+
8+
app = Quart(__name__)

# Shared model wrapper; assigned once by the before_serving hook below.
model = None


@app.before_serving
async def startup():
    """Open the shared HTTP session and build the model before serving starts."""
    global model
    app.client = aiohttp.ClientSession()
    model = Model(app)


@app.after_serving
async def shutdown():
    """Close the HTTP session opened in ``startup`` so it is not leaked."""
    client = getattr(app, 'client', None)
    if client is not None:
        await client.close()


@app.route('/', methods=['POST'])
async def embed():
    """Run a search for the JSON request body and return the model's result.

    The body must be a JSON object whose keys match the ``ModelRequest``
    constructor arguments. A missing/non-object body or wrong keys yield a
    400 response with an ``error`` message instead of an unhandled TypeError.
    """
    data = await request.get_json()
    if not isinstance(data, dict):
        return {"error": "Request body must be a JSON object"}, 400
    try:
        req = ModelRequest(**data)
    except TypeError as exc:
        # Raised for missing or unexpected constructor keywords.
        return {"error": f"Invalid request fields: {exc}"}, 400
    return await model.inference(req)


if __name__ == "__main__":
    app.run()

src/embeddings/colbert/local/model.py

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import pandas as pd
2+
from ragatouille import RAGPretrainedModel
3+
from request import ModelRequest
4+
from colbert import Indexer, Searcher
5+
from colbert.infra import Run, RunConfig, ColBERTConfig
6+
from colbert.data import Queries, Collection
7+
8+
9+
10+
class Model():
    """Singleton wrapper around a ColBERT ``Searcher`` and its passage table.

    The first successful construction reads the passage CSV, opens the
    pre-built index and caches both on the class (``df``, ``searcher``);
    later constructions return the same instance.
    """

    @staticmethod
    def _first_existing(*candidates):
        """Return the first candidate path that exists, else the first candidate.

        Relative candidates are returned as absolute paths. When nothing
        exists the first candidate is returned unchanged, so the consumer of
        the path raises its usual "not found" error.
        """
        import os  # function-scope import; this module does not import os

        for path in candidates:
            if os.path.exists(path):
                return path if os.path.isabs(path) else os.path.abspath(path)
        return candidates[0]

    def __new__(cls, context):
        """Return the shared instance, loading the data and index on first use.

        Args:
            context: Object supplied by the caller; stored on the class as
                ``context`` on every call.
        """
        cls.context = context
        if not hasattr(cls, 'instance'):
            instance = super(Model, cls).__new__(cls)
            # Prefer the original filesystem-root locations; fall back to the
            # current working directory (where a relative download/clone lands).
            csv_path = cls._first_existing('/Testing1.csv', 'Testing1.csv')
            index_path = cls._first_existing('/colbert_agri_embeddings/',
                                             'colbert_agri_embeddings')
            # Initialize Colbert
            cls.df = pd.read_csv(csv_path)
            cls.df['PID'] = cls.df.index.astype(str)
            with Run().context(RunConfig(experiment='notebook')):
                cls.searcher = Searcher(index=index_path,
                                        collection=cls.df['content'].to_list())
            print(cls.df.columns)
            # Publish the singleton only after loading succeeded, so a failed
            # first attempt is retried instead of returning a broken instance.
            cls.instance = instance
        return cls.instance

    async def inference(self, request: "ModelRequest"):
        """Search the index for ``request.text`` and return the matching rows.

        Args:
            request: Object exposing ``text`` (the query) and ``k`` (number of
                results; coerced with ``int`` so numeric strings are accepted).

        Returns:
            A dict with ``ids`` and ``content`` (the ``id`` and ``content``
            columns of the rows labelled by the first element of the search
            result) and ``scores`` (the third element of the search result).
        """
        query = request.text
        k = int(request.k)
        results = self.searcher.search(query, k)
        hits = self.df.loc[results[0], ['id', 'content']]
        return {
            "ids": hits['id'].to_list(),
            "content": hits['content'].to_list(),
            "scores": results[2],
        }
+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import requests
2+
import json
3+
4+
5+
class ModelRequest():
    """Payload of a search request: the query text and the result count."""

    def __init__(self, text, k):
        """Store the request fields unchanged.

        Args:
            text: Query text.
            k: Number of results requested.
        """
        self.text = text
        self.k = k

    def to_json(self):
        """Return this request's attributes as a sorted, 4-space-indented JSON string."""
        return json.dumps(self, default=lambda o: o.__dict__,
                          sort_keys=True, indent=4)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
torch
2+
scikit-learn
3+
quart
4+
aiohttp
5+
pandas
6+
faiss-gpu
7+
datasets
8+
gdown
9+
ragatouille
10+
langchain-openai
11+
colbert-ai
12+
gdown

src/embeddings/instructor/local/api.py

+9-24
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import aiohttp
55
import pandas as pd
66
import io
7-
from quart import jsonify
87

98
app = Quart(__name__)
109

@@ -16,37 +15,23 @@ async def startup():
1615
global model
1716
model = Model(app)
1817

19-
2018
@app.route('/', methods=['POST'])
2119
async def embed():
2220
global model
2321
data = await request.get_json()
24-
files = await request.files
25-
uploaded_file = files.get('file')
22+
files = await request.files # await the coroutine
23+
uploaded_file = files.get('file') # now you can use .get()
2624

2725
if uploaded_file:
2826
df = pd.read_csv(uploaded_file.stream)
29-
if df.empty or df['content'].isnull().any():
30-
return jsonify({'error': 'There are nonzero null rows'}), 400 # Return a 400 Bad Request response with the error message
31-
32-
req = ModelRequest(df=df)
27+
req = ModelRequest(df=df) # Pass the DataFrame to ModelRequest
3328
response = await model.inference(req)
34-
35-
# If the response from the model is an error message, return it with a 400 status
36-
if response == 'There are nonzero null rows':
37-
return jsonify({'error': response}), 400
38-
39-
# Otherwise, assume response is a CSV string
40-
df = pd.read_csv(io.StringIO(response))
29+
df = pd.read_csv(io.StringIO(response)) # Convert the CSV string back to a DataFrame
30+
# Save the DataFrame to a CSV file
4131
df.to_csv('output.csv', index=False)
32+
4233
return await send_file('output.csv', mimetype='text/csv', as_attachment=True, attachment_filename='output.csv')
43-
else:
34+
35+
else:
4436
req = ModelRequest(**data)
45-
response = await model.inference(req)
46-
47-
# Handle potential error from model inference in a similar way
48-
if response == 'There are nonzero null rows':
49-
return jsonify({'error': response}), 400
50-
51-
# Otherwise, send back the model's response
52-
return response
37+
return await model.inference(req)

src/embeddings/instructor/local/model.py

+13-36
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@
44
import wget
55
import pandas as pd
66
import os
7-
from quart import jsonify # Import jsonify to send JSON responses
8-
97

108
class Model():
119
def __new__(cls, context):
@@ -18,23 +16,13 @@ def __new__(cls, context):
1816

1917
async def inference(self, request: ModelRequest):
2018
# Modify this function according to model requirements such that inputs and output remains the same
21-
corpus_instruction = "Represent the document for retrieval:"
22-
query_instruction = 'Represent the question for retrieving supporting documents: '
19+
corpus_instruction = "Represent the Wikipedia document for retrieval:"
20+
query_instruction = 'Represent the Wikipedia question for retrieving supporting documents: '
2321
query = request.query
24-
query_type = request.query_type
2522

2623
if(query != None):
2724
# print('Query Encoding Process :-')
28-
if query_type == 'retrieval':
29-
query_embeddings = self.model.encode(
30-
[[corpus_instruction, query]],
31-
show_progress_bar=False,
32-
batch_size=32,
33-
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
34-
)
35-
36-
else :
37-
query_embeddings = self.model.encode(
25+
query_embeddings = self.model.encode(
3826
[[query_instruction, query]],
3927
show_progress_bar=False,
4028
batch_size=32,
@@ -45,26 +33,15 @@ async def inference(self, request: ModelRequest):
4533
if not request.df.empty:
4634
# print('Text corpus Encoding Process :-')
4735
data = request.df
48-
data = data.loc[~pd.isnull(data['content']),:]
49-
data['content'] = data['content'].astype(str)
50-
51-
if data.empty or data['content'].isnull().any():
52-
return 'There are nonzero null rows'
53-
54-
else :
55-
text_corpus = data.loc[:,'content'].to_list()
56-
57-
if not text_corpus:
58-
corpus_embeddings = self.model.encode(
59-
[[corpus_instruction, text] for text in text_corpus],
60-
show_progress_bar=False,
61-
batch_size=32,
62-
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
63-
)
64-
data['embeddings'] = corpus_embeddings.tolist()
65-
csv_string = data.to_csv(index=False)
66-
else:
67-
return 'There are nonzero null rows'
68-
36+
37+
text_corpus = data.loc[:,'content'].to_list()
38+
corpus_embeddings = self.model.encode(
39+
[[corpus_instruction, text] for text in text_corpus],
40+
show_progress_bar=False,
41+
batch_size=32,
42+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
43+
)
44+
data['embeddings'] = corpus_embeddings.tolist()
45+
csv_string = data.to_csv(index=False)
6946

7047
return str(csv_string)

src/embeddings/instructor/local/request.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,9 @@
33

44

55
class ModelRequest():
6-
def __init__(self, query=None, df = pd.DataFrame(), query_type = None):
6+
def __init__(self, query=None, df = pd.DataFrame()):
77
# Url to download csv file
88
self.query = query # String
9-
self.query_type = query_type
109
self.df = df
1110

1211
def to_json(self):
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
torch==2.0.1 --index-url https://download.pytorch.org/whl/cpu
1+
torch
22
quart
33
aiohttp
44
InstructorEmbedding
55
wget
66
pandas
77
tqdm
8-
sentence_transformers
8+
sentence-transformers==2.2.2

src/embeddings/instructor_gpu/README.md

-1
This file was deleted.

src/embeddings/instructor_gpu/local/Dockerfile

-15
This file was deleted.

src/embeddings/instructor_gpu/local/README.md

-18
This file was deleted.

src/embeddings/instructor_gpu/local/__init__.py

-2
This file was deleted.

src/embeddings/instructor_gpu/local/api.py

-37
This file was deleted.

0 commit comments

Comments
 (0)