initial commit
gabriielcmps committed Oct 8, 2023
1 parent 15109ef commit e52c2fd
Showing 9 changed files with 25 additions and 9 deletions.
5 changes: 5 additions & 0 deletions README.md
@@ -319,3 +319,8 @@ This is a test project to validate the feasibility of a fully local solution for
 conda uninstall tokenizers, transformers
 pip install transformers
 ```
+- [ERROR: "If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation..."](https://pytorch.org/docs/stable/notes/cuda.html#memory-management)
+```shell
+export PYTORCH_NO_CUDA_MEMORY_CACHING=1
+```
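The linked PyTorch note also describes `max_split_size_mb`, which reduces fragmentation while keeping the caching allocator enabled; disabling caching with `PYTORCH_NO_CUDA_MEMORY_CACHING=1` is mainly a debugging aid and tends to slow allocation down. A minimal sketch of that alternative, assuming it runs before any tensor touches the GPU; the value 128 is only an illustrative starting point, not a recommendation:

```python
import os

# Assumption: this executes before the first CUDA allocation; the caching
# allocator reads PYTORCH_CUDA_ALLOC_CONF when it is first initialized.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import torch

if torch.cuda.is_available():
    x = torch.zeros(1, device="cuda")  # first allocation now uses the setting
    print(torch.cuda.memory_allocated())
```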

1 change: 1 addition & 0 deletions SOURCE_DOCUMENTS/data_training1.txt

Large diffs are not rendered by default.

11 changes: 6 additions & 5 deletions constants.py
@@ -28,13 +28,13 @@
 )
 
 # Context Window and Max New Tokens
-CONTEXT_WINDOW_SIZE = 4096
+CONTEXT_WINDOW_SIZE = 2048
 MAX_NEW_TOKENS = CONTEXT_WINDOW_SIZE # int(CONTEXT_WINDOW_SIZE/4)
 
 #### If you get a "not enough space in the buffer" error, you should reduce the values below, start with half of the original values and keep halving the value until the error stops appearing
 
-N_GPU_LAYERS = 100 # Llama-2-70B has 83 layers
-N_BATCH = 512
+N_GPU_LAYERS = 1 # Llama-2-70B has 83 layers
+N_BATCH = 1
 
 ### From experimenting with the Llama-2-7B-Chat-GGML model on 8GB VRAM, these values work:
 # N_GPU_LAYERS = 20
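For context on what these knobs control: constants like these are typically forwarded to llama-cpp-python through LangChain's `LlamaCpp` wrapper, so halving them directly shrinks the context window and the per-batch VRAM use. A hedged sketch of that mapping (assumed usage with a hypothetical model path, not code from this commit):

```python
# Hedged sketch, not code from this commit: how CONTEXT_WINDOW_SIZE,
# MAX_NEW_TOKENS, N_GPU_LAYERS and N_BATCH typically map onto llama-cpp-python
# via LangChain's LlamaCpp wrapper.
from langchain.llms import LlamaCpp

llm = LlamaCpp(
    model_path="models/llama-2-7b-chat.Q4_K_M.gguf",  # hypothetical local file
    n_ctx=2048,       # CONTEXT_WINDOW_SIZE: prompt + answer must fit in this window
    max_tokens=2048,  # MAX_NEW_TOKENS: upper bound on generated tokens
    n_gpu_layers=1,   # N_GPU_LAYERS: transformer layers offloaded to VRAM
    n_batch=1,        # N_BATCH: prompt tokens processed per batch (VRAM vs. speed)
    verbose=False,
)
```

With `n_gpu_layers=1` and `n_batch=1` almost everything runs on the CPU, so this change trades a large drop in speed for the smallest possible VRAM footprint.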
@@ -46,8 +46,8 @@
     ".txt": TextLoader,
     ".md": TextLoader,
     ".py": TextLoader,
-    # ".pdf": PDFMinerLoader,
-    ".pdf": UnstructuredFileLoader,
+    ".pdf": PDFMinerLoader,
+    # ".pdf": UnstructuredFileLoader,
     ".csv": CSVLoader,
     ".xls": UnstructuredExcelLoader,
     ".xlsx": UnstructuredExcelLoader,
@@ -176,3 +176,4 @@
 # MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q2_K.bin"
 # MODEL_ID = "TheBloke/orca_mini_3B-GGML"
 # MODEL_BASENAME = "orca-mini-3b.ggmlv3.q4_0.bin"
+

3 changes: 3 additions & 0 deletions ingest.py
@@ -9,6 +9,9 @@
 from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
 
+torch.cuda.empty_cache()
+torch.cuda.memory_summary(device=None, abbreviated=False)
+
 from constants import (
     CHROMA_SETTINGS,
     DOCUMENT_MAP,
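One caveat about the added lines: `torch.cuda.memory_summary()` returns a string rather than printing anything, so as committed the report is computed and discarded, and the module must already import `torch` for either call to work. A small sketch that actually surfaces the report (assuming a CUDA device may or may not be present):

```python
# Sketch: log the allocator report instead of discarding it; skip cleanly when
# no GPU is available (e.g. ingesting on a CPU-only machine).
import logging

import torch

if torch.cuda.is_available():
    torch.cuda.empty_cache()  # return cached, unused blocks to the driver
    logging.info(torch.cuda.memory_summary(device=None, abbreviated=False))
```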
2 changes: 1 addition & 1 deletion load_models.py
@@ -145,7 +145,7 @@ def load_full_model(model_id, model_basename, device_type, logging):
             # load_in_4bit=True,
             # bnb_4bit_quant_type="nf4",
             # bnb_4bit_compute_dtype=torch.float16,
-            # max_memory={0: "15GB"} # Uncomment this line with you encounter CUDA out of memory errors
+            max_memory={0: "15GB"} # Uncomment this line if you encounter CUDA out of memory errors
         )
         model.tie_weights()
     return model, tokenizer
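Note that `max_memory` is honored when `from_pretrained` is also given a `device_map` (for example `"auto"`), which is how accelerate decides what stays on GPU 0 and what gets offloaded. A hedged sketch of the combination (assumed arguments, not this repository's exact call):

```python
# Hedged sketch, not this repository's exact call: cap GPU 0 at 15 GB and let
# accelerate offload whatever does not fit to CPU RAM.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",        # hypothetical model id
    device_map="auto",                      # needed so max_memory guides placement
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    max_memory={0: "15GB", "cpu": "30GB"},  # per-device budgets; values illustrative
)
```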
File renamed without changes.
Binary file not shown.
5 changes: 3 additions & 2 deletions requirements.txt
@@ -7,13 +7,14 @@ sentence-transformers
 faiss-cpu
 huggingface_hub
 transformers
-protobuf==3.20.0; sys_platform != 'darwin'
-protobuf==3.20.0; sys_platform == 'darwin' and platform_machine != 'arm64'
+protobuf>=3.20.0; sys_platform != 'darwin'
+protobuf>=3.20.0; sys_platform == 'darwin' and platform_machine != 'arm64'
 protobuf==3.20.3; sys_platform == 'darwin' and platform_machine == 'arm64'
 auto-gptq==0.2.2
 docx2txt
 unstructured
 unstructured[pdf]
+llama-cpp-python
 
 # Utilities
 urllib3==1.26.6
7 changes: 6 additions & 1 deletion run_localGPT.py
@@ -8,6 +8,11 @@
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler # for streaming response
 from langchain.callbacks.manager import CallbackManager
 
+torch.cuda.empty_cache()
+torch.cuda.memory_summary(device=None, abbreviated=False)
+
+torch.set_grad_enabled(False)
+
 callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
 
 from prompt_template_utils import get_prompt_template
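`torch.set_grad_enabled(False)` at import time switches autograd off globally, which is a reasonable choice for an inference-only script; a more scoped alternative is `torch.inference_mode()`, sketched below with a hypothetical helper:

```python
# Sketch of a scoped alternative to the global switch; `qa` stands in for the
# retrieval QA chain this script builds and is hypothetical here.
import torch

@torch.inference_mode()  # no autograd state is recorded inside this function
def answer(qa, query: str):
    return qa(query)
```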
@@ -78,7 +83,7 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
         "text-generation",
         model=model,
         tokenizer=tokenizer,
-        max_length=MAX_NEW_TOKENS,
+        max_length=50,
         temperature=0.2,
         # top_p=0.95,
         repetition_penalty=1.15,
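One behavioral note on the `max_length=50` change: in the Transformers `text-generation` pipeline, `max_length` counts prompt plus generated tokens, so a long retrieved context can leave almost no room for the answer. Capping only the generated part is normally done with `max_new_tokens`; a small self-contained sketch (`gpt2` is used purely as a stand-in model so the example runs anywhere):

```python
# Self-contained sketch: max_new_tokens limits only the generated tokens,
# whereas max_length also counts the prompt. "gpt2" is just a small stand-in.
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="gpt2",
    max_new_tokens=64,       # illustrative cap on generated tokens only
    do_sample=True,          # sampling so temperature actually applies
    temperature=0.2,
    repetition_penalty=1.15,
)
print(pipe("Question: what does a local RAG pipeline do?\nAnswer:")[0]["generated_text"])
```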
