Skip to content

Commit 4610ab0

Browse files
authored
feat: arXiv search (#107)
1 parent c43672f commit 4610ab0

23 files changed

+1322
-37
lines changed

.env.template

+2-1
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@ ESMFOLD_WEIGHTS_LOCATION=${NOLABS_HOME}/esmfold-weights
44
RFDIFFUSION_WEIGHTS_LOCATION=${NOLABS_HOME}/rfdiffusion-weights
55
ROSETTAFOLD_BFD_PATH='/media/jt/Local Disk/RoseTTAFold/bfd'
66
ROSETTAFOLD_PDB_PATH='/media/jt/Local Disk/RoseTTAFold/pdb100_2021Mar03'
7-
ROSETTAFOLD_UNIREF_PATH='/media/jt/Local Disk/RoseTTAFold/UniRef30_2020_06'
7+
ROSETTAFOLD_UNIREF_PATH='/media/jt/Local Disk/RoseTTAFold/UniRef30_2020_06'
8+
ARXIV_ABSTRACTS_DB=${NOLABS_HOME}/arxiv_abstracts/chroma_db
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
name: arxiv-abstracts-master
2+
run-name: arxiv-abstracts-master
3+
4+
# Build and push the arxiv-ai-abstractions-search image using GitHub Cache API
5+
# Triggered manually (workflow_dispatch)
6+
7+
on: workflow_dispatch
8+
9+
jobs:
10+
build:
11+
permissions:
12+
contents: read
13+
packages: write
14+
15+
uses: ./.github/workflows/build-docker.yaml
16+
with:
17+
microservice_name: "arxiv-ai-abstractions-search"
18+
19+
push:
20+
if: github.repository == 'BasedLabs/NoLabs'
21+
needs: build
22+
23+
permissions:
24+
contents: read
25+
packages: write
26+
27+
uses: ./.github/workflows/push-docker.yaml
28+
with:
29+
microservice_name: "arxiv-ai-abstractions-search"
+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
name: arxiv-abstracts-pr
2+
run-name: arxiv-abstracts-pr
3+
4+
# Build the arxiv-ai-abstractions-search image using GitHub Cache API
5+
# Only if files under microservices/arxiv_abstracts/ were changed
6+
7+
on:
8+
pull_request:
9+
branches:
10+
- master
11+
paths:
12+
- 'microservices/arxiv_abstracts/**'
13+
14+
jobs:
15+
build:
16+
permissions:
17+
contents: read
18+
packages: write
19+
20+
uses: ./.github/workflows/build-docker.yaml
21+
with:
22+
microservice_name: "arxiv-ai-abstractions-search"

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -182,4 +182,5 @@ nolabs/experiments/**/*
182182

183183
conformations/client/**
184184

185-
celerybeat-schedule
185+
checkpoint.db
186+
history.json

Makefile

+8-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ install-mock-server:
1515
start-mock-server:
1616
prism mock http://127.0.0.1:${UVICORN_HOST}/openapi.json
1717
gen-envs:
18-
scripts/gen-envs.sh
18+
python3 scripts/gen_envs.py
1919
download-diffdock-weights:
2020
@echo "Downloading diffdock model weights..."
2121
mkdir -p ${DIFFDOCK_WEIGHTS_LOCATION}
@@ -31,4 +31,11 @@ download-rfdiffusion-weights:
3131
@echo "Downloading rfdiffusion model weights..."
3232
mkdir -p ${RFDIFFUSION_WEIGHTS_LOCATION}
3333
python3 microservices/rfdiffusion/scripts/download_weights.py
34+
@echo "Download complete!"
35+
download-arxiv-abstracts-db:
36+
@echo "Downloading arxiv abstracts"
37+
# Abort with install instructions when unzip is missing. Uses POSIX redirection
# (make runs recipes with /bin/sh, where `&>` is not portable) and `||` so the
# line succeeds when unzip IS installed. `$$` passes a literal `$` to the shell.
@command -v unzip > /dev/null 2>&1 || { echo 'You have to install unzip: $$ sudo apt-get install unzip OR $$ brew install unzip (for macos)'; exit 1; }
38+
# Quote every use of the target directory so paths containing spaces work, and
# let curl fail (-f) on HTTP errors instead of saving an error page as the zip.
mkdir -p "${ARXIV_ABSTRACTS_DB}"
39+
curl -fL -o "${ARXIV_ABSTRACTS_DB}/chroma_db.zip" https://www.kaggle.com/api/v1/datasets/download/timurishmuratov/nolabs-arxiv-abstract-vector-db
40+
unzip "${ARXIV_ABSTRACTS_DB}/chroma_db.zip" -d "${ARXIV_ABSTRACTS_DB}" && rm "${ARXIV_ABSTRACTS_DB}/chroma_db.zip"
3441
@echo "Download complete!"

Makefile.bat

+99
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
@echo off
2+
REM Load variables from .env if it exists
3+
REM Skip "#" comment lines and quote the assignment so values containing
REM special characters (e.g. "&") or trailing spaces are stored verbatim.
if exist .env (
4+
for /f "usebackq eol=# tokens=1,* delims==" %%i in (".env") do (
5+
set "%%i=%%j"
6+
)
7+
)
8+
9+
set TARGET=%1
10+
11+
if "%TARGET%"=="" (
12+
echo Usage: Makefile.bat [target]
13+
echo Available targets:
14+
echo flower
15+
echo install-openapi-generator
16+
echo generate-client
17+
echo install-mock-server
18+
echo start-mock-server
19+
echo gen-envs
20+
echo download-diffdock-weights
21+
echo download-esmfold-weights
22+
echo download-rfdiffusion-weights
23+
echo download-arxiv-abstracts-db
24+
exit /b 1
25+
)
26+
27+
if "%TARGET%"=="flower" (
28+
celery --broker=%REDIS_URL% flower --port=5555
29+
goto :EOF
30+
)
31+
32+
if "%TARGET%"=="install-openapi-generator" (
33+
npm install -g openapi-typescript-codegen
34+
goto :EOF
35+
)
36+
37+
if "%TARGET%"=="generate-client" (
38+
openapi --input "http://127.0.0.1:8000/openapi.json" --output frontend/src/api/client --client axios
39+
goto :EOF
40+
)
41+
42+
if "%TARGET%"=="install-mock-server" (
43+
npm install -g @stoplight/prism-cli
44+
goto :EOF
45+
)
46+
47+
if "%TARGET%"=="start-mock-server" (
48+
prism mock http://127.0.0.1:%UVICORN_HOST%/openapi.json
49+
goto :EOF
50+
)
51+
52+
if "%TARGET%"=="gen-envs" (
53+
python3 scripts/gen_envs.py
54+
goto :EOF
55+
)
56+
57+
if "%TARGET%"=="download-diffdock-weights" (
58+
echo Downloading diffdock model weights...
59+
if not exist "%DIFFDOCK_WEIGHTS_LOCATION%" mkdir "%DIFFDOCK_WEIGHTS_LOCATION%"
60+
python3 microservices/diffdock/scripts/download_weights.py
61+
echo Download complete!
62+
goto :EOF
63+
)
64+
65+
if "%TARGET%"=="download-esmfold-weights" (
66+
echo Downloading esmfold model weights...
67+
if not exist "%ESMFOLD_WEIGHTS_LOCATION%" mkdir "%ESMFOLD_WEIGHTS_LOCATION%"
68+
pip3 install transformers[torch] --verbose
69+
python3 microservices/esmfold/scripts/download_weights.py
70+
echo Download complete!
71+
goto :EOF
72+
)
73+
74+
if "%TARGET%"=="download-rfdiffusion-weights" (
75+
echo Downloading rfdiffusion model weights...
76+
if not exist "%RFDIFFUSION_WEIGHTS_LOCATION%" mkdir "%RFDIFFUSION_WEIGHTS_LOCATION%"
77+
python3 microservices/rfdiffusion/scripts/download_weights.py
78+
echo Download complete!
79+
goto :EOF
80+
)
81+
82+
if "%TARGET%"=="download-arxiv-abstracts-db" (
83+
echo Downloading arxiv abstracts
84+
where unzip >nul 2>nul
85+
if errorlevel 1 (
86+
echo You must install unzip. For example:
87+
echo choco install unzip ^(if using Chocolatey^)
88+
goto :EOF
89+
)
90+
if not exist "%ARXIV_ABSTRACTS_DB%" mkdir "%ARXIV_ABSTRACTS_DB%"
91+
curl -L -o "%ARXIV_ABSTRACTS_DB%\chroma_db.zip" https://www.kaggle.com/api/v1/datasets/download/timurishmuratov/nolabs-arxiv-abstract-vector-db
92+
powershell -Command "Expand-Archive '%ARXIV_ABSTRACTS_DB%\chroma_db.zip' '%ARXIV_ABSTRACTS_DB%'"
93+
del "%ARXIV_ABSTRACTS_DB%\chroma_db.zip"
94+
echo Download complete!
95+
goto :EOF
96+
)
97+
98+
echo Unknown target "%TARGET%"
99+
exit /b 1

README.md

+36-2
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,19 @@ $ chmod +x scripts/gen-envs.sh
6666
$ make gen-envs
6767
```
6868

69+
Or, if you use Windows (untested!):
70+
71+
```
72+
# Clone this project
73+
$ git clone https://github.com/BasedLabs/nolabs
74+
$ cd nolabs
75+
# Create .env files (you will be able to adjust them)
76+
$ Makefile.bat gen-envs
77+
```
78+
6979
Generate a new token for docker registry
70-
https://github.com/settings/tokens/new
71-
Select 'read:packages'
80+
https://github.com/settings/tokens/new?scopes=read:packages
81+
Select 'read:packages' (it should be preselected when you open the link above)
7282

7383
```bash
7484
$ docker login ghcr.io -u username -p ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
@@ -259,6 +269,30 @@ Nolabs is running on GPT4 for the best performance. You can adjust the model you
259269
docker compose up blast-query
260270
```
261271

272+
### 10) Arxiv abstracts AI search (Standalone)
273+
274+
<img src="media/arxiv-abstracts-search.png" width="100%">
275+
276+
This microservice provides LLM-based RAG search over arXiv abstracts (up to 01/12/2024).
277+
How to use this docker image:
278+
279+
1) Generate a new token for docker registry
280+
https://github.com/settings/tokens/new?scopes=read:packages
281+
Select 'read:packages' (it should be preselected when you open the link above).
282+
2) Download the ChromaDB database used for vector search
283+
```bash
284+
$ make gen-envs
285+
$ make download-arxiv-abstracts-db
286+
```
287+
3) Set your OpenAI API key either in `microservices/arxiv_abstracts/service/.env` or in the UI
288+
4) Start docker
289+
```bash
290+
$ docker login ghcr.io -u username -p ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
291+
$ docker compose -f docker-compose.api.yaml up arxiv-ai-abstractions-search-api
292+
```
293+
5) Wait until the FastAPI startup messages appear
294+
6) You can access the UI in a browser at `http://localhost:8001/chat`
295+
262296
## Requirements ##
263297

264298
**[Recommended for laptops]**

docker-compose.api.yaml

+6
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,9 @@ services:
2222
service: proteinmpnn
2323
file: docker-compose.yaml
2424
command: python api.py
25+
26+
arxiv-ai-abstractions-search-api:
27+
extends:
28+
service: arxiv-ai-abstractions-search
29+
file: docker-compose.yaml
30+
command: python -u api.py

docker-compose.yaml

+12
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,18 @@ services:
2626
count: 1
2727
capabilities: [ gpu ]
2828

29+
arxiv-ai-abstractions-search:
30+
image: 'ghcr.io/basedlabs/arxiv-ai-abstractions-search:1.0.0'
31+
network_mode: host
32+
command: python worker.py
33+
build:
34+
context: microservices/arxiv_abstracts
35+
dockerfile: build/Dockerfile
36+
env_file:
37+
- microservices/arxiv_abstracts/service/.env
38+
volumes:
39+
- ${ARXIV_ABSTRACTS_DB}:/app/chroma_db
40+
2941
esmfold-light:
3042
image: 'ghcr.io/basedlabs/esmfold-light:2.0.0'
3143
network_mode: host

media/arxiv-abstracts-search.png

282 KB
Loading
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
# Git
2+
.git
3+
.gitignore
4+
.gitattributes
5+
6+
7+
# CI
8+
.codeclimate.yml
9+
.travis.yml
10+
.taskcluster.yml
11+
12+
# Docker
13+
docker-compose.yml
14+
Dockerfile
15+
.docker
16+
.dockerignore
17+
18+
# Byte-compiled / optimized / DLL files
19+
**/__pycache__/
20+
**/*.py[cod]
21+
22+
# C extensions
23+
*.so
24+
25+
# Distribution / packaging
26+
.Python
27+
env/
28+
build/
29+
develop-eggs/
30+
dist/
31+
downloads/
32+
eggs/
33+
lib/
34+
lib64/
35+
parts/
36+
sdist/
37+
var/
38+
*.egg-info/
39+
.installed.cfg
40+
*.egg
41+
42+
# PyInstaller
43+
# Usually these files are written by a python script from a template
44+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
45+
*.manifest
46+
*.spec
47+
48+
# Installer logs
49+
pip-log.txt
50+
pip-delete-this-directory.txt
51+
52+
# Unit test / coverage reports
53+
htmlcov/
54+
.tox/
55+
.coverage
56+
.cache
57+
nosetests.xml
58+
coverage.xml
59+
60+
# Translations
61+
*.mo
62+
*.pot
63+
64+
# Django stuff:
65+
*.log
66+
67+
# Sphinx documentation
68+
docs/_build/
69+
70+
# PyBuilder
71+
target/
72+
73+
# Virtual environment
74+
.env
75+
.venv/
76+
venv/
77+
78+
# PyCharm
79+
.idea
80+
81+
# Python mode for VIM
82+
.ropeproject
83+
**/.ropeproject
84+
85+
# Vim swap files
86+
**/*.swp
87+
88+
# VS Code
89+
.vscode/
90+
91+
build/Dockerfile
92+
chroma_db
93+
checkpoint.db
94+
history.json
+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Arxiv AI abstracts search
2+
3+
This microservice provides LLM-based RAG search over arXiv abstracts.
4+
5+
To get started, see the README on the main page of the repository.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
FROM python:3.12
2+
WORKDIR /app
3+
COPY ./requirements.txt /app/
4+
RUN pip install --upgrade pip
5+
RUN pip install --default-timeout=100 -r requirements.txt
6+
COPY service /app

0 commit comments

Comments
 (0)