initial commit
gabriielcmps committed Oct 8, 2023
1 parent 15109ef commit e52c2fd
Showing 9 changed files with 25 additions and 9 deletions.
5 changes: 5 additions & 0 deletions README.md
@@ -319,3 +319,8 @@ This is a test project to validate the feasibility of a fully local solution for
 conda uninstall tokenizers, transformers
 pip install transformers
 ```
+- [ERROR: "If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation..."](https://pytorch.org/docs/stable/notes/cuda.html#memory-management)
+```shell
+export PYTORCH_NO_CUDA_MEMORY_CACHING=1
+```
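The linked PyTorch note also describes `max_split_size_mb`, which reduces fragmentation while keeping the caching allocator enabled; disabling caching with `PYTORCH_NO_CUDA_MEMORY_CACHING=1` is mainly a debugging aid and tends to slow allocation down. A minimal sketch of that alternative, assuming it runs before any tensor touches the GPU; the value 128 is only an illustrative starting point, not a recommendation:

```python
import os

# Assumption: this executes before the first CUDA allocation; the caching
# allocator reads PYTORCH_CUDA_ALLOC_CONF when it is first initialized.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import torch

if torch.cuda.is_available():
    x = torch.zeros(1, device="cuda")  # first allocation now uses the setting
    print(torch.cuda.memory_allocated())
```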

1 change: 1 addition & 0 deletions SOURCE_DOCUMENTS/data_training1.txt

Large diffs are not rendered by default.

11 changes: 6 additions & 5 deletions constants.py
@@ -28,13 +28,13 @@
 )
 
 # Context Window and Max New Tokens
-CONTEXT_WINDOW_SIZE = 4096
+CONTEXT_WINDOW_SIZE = 2048
 MAX_NEW_TOKENS = CONTEXT_WINDOW_SIZE # int(CONTEXT_WINDOW_SIZE/4)
 
 #### If you get a "not enough space in the buffer" error, you should reduce the values below, start with half of the original values and keep halving the value until the error stops appearing
 
-N_GPU_LAYERS = 100 # Llama-2-70B has 83 layers
-N_BATCH = 512
+N_GPU_LAYERS = 1 # Llama-2-70B has 83 layers
+N_BATCH = 1
 
 ### From experimenting with the Llama-2-7B-Chat-GGML model on 8GB VRAM, these values work:
 # N_GPU_LAYERS = 20
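For context on what these knobs control: constants like these are typically forwarded to llama-cpp-python through LangChain's `LlamaCpp` wrapper, so halving them directly shrinks the context window and the per-batch VRAM use. A hedged sketch of that mapping (assumed usage with a hypothetical model path, not code from this commit):

```python
# Hedged sketch, not code from this commit: how CONTEXT_WINDOW_SIZE,
# MAX_NEW_TOKENS, N_GPU_LAYERS and N_BATCH typically map onto llama-cpp-python
# via LangChain's LlamaCpp wrapper.
from langchain.llms import LlamaCpp

llm = LlamaCpp(
    model_path="models/llama-2-7b-chat.Q4_K_M.gguf",  # hypothetical local file
    n_ctx=2048,       # CONTEXT_WINDOW_SIZE: prompt + answer must fit in this window
    max_tokens=2048,  # MAX_NEW_TOKENS: upper bound on generated tokens
    n_gpu_layers=1,   # N_GPU_LAYERS: transformer layers offloaded to VRAM
    n_batch=1,        # N_BATCH: prompt tokens processed per batch (VRAM vs. speed)
    verbose=False,
)
```

With `n_gpu_layers=1` and `n_batch=1` almost everything runs on the CPU, so this change trades a large drop in speed for the smallest possible VRAM footprint.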
@@ -46,8 +46,8 @@
     ".txt": TextLoader,
     ".md": TextLoader,
     ".py": TextLoader,
-    # ".pdf": PDFMinerLoader,
-    ".pdf": UnstructuredFileLoader,
+    ".pdf": PDFMinerLoader,
+    # ".pdf": UnstructuredFileLoader,
     ".csv": CSVLoader,
     ".xls": UnstructuredExcelLoader,
     ".xlsx": UnstructuredExcelLoader,
@@ -176,3 +176,4 @@
 # MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q2_K.bin"
 # MODEL_ID = "TheBloke/orca_mini_3B-GGML"
 # MODEL_BASENAME = "orca-mini-3b.ggmlv3.q4_0.bin"
+

3 changes: 3 additions & 0 deletions ingest.py
@@ -9,6 +9,9 @@
 from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
 
+torch.cuda.empty_cache()
+torch.cuda.memory_summary(device=None, abbreviated=False)
+
 from constants import (
     CHROMA_SETTINGS,
     DOCUMENT_MAP,
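One caveat about the added lines: `torch.cuda.memory_summary()` returns a string rather than printing anything, so as committed the report is computed and discarded, and the module must already import `torch` for either call to work. A small sketch that actually surfaces the report (assuming a CUDA device may or may not be present):

```python
# Sketch: log the allocator report instead of discarding it; skip cleanly when
# no GPU is available (e.g. ingesting on a CPU-only machine).
import logging

import torch

if torch.cuda.is_available():
    torch.cuda.empty_cache()  # return cached, unused blocks to the driver
    logging.info(torch.cuda.memory_summary(device=None, abbreviated=False))
```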
2 changes: 1 addition & 1 deletion load_models.py
@@ -145,7 +145,7 @@ def load_full_model(model_id, model_basename, device_type, logging):
             # load_in_4bit=True,
             # bnb_4bit_quant_type="nf4",
             # bnb_4bit_compute_dtype=torch.float16,
-            # max_memory={0: "15GB"} # Uncomment this line with you encounter CUDA out of memory errors
+            max_memory={0: "15GB"} # Uncomment this line if you encounter CUDA out of memory errors
         )
         model.tie_weights()
     return model, tokenizer
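Note that `max_memory` is honored when `from_pretrained` is also given a `device_map` (for example `"auto"`), which is how accelerate decides what stays on GPU 0 and what gets offloaded. A hedged sketch of the combination (assumed arguments, not this repository's exact call):

```python
# Hedged sketch, not this repository's exact call: cap GPU 0 at 15 GB and let
# accelerate offload whatever does not fit to CPU RAM.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",        # hypothetical model id
    device_map="auto",                      # needed so max_memory guides placement
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    max_memory={0: "15GB", "cpu": "30GB"},  # per-device budgets; values illustrative
)
```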
File renamed without changes.
Binary file not shown.
5 changes: 3 additions & 2 deletions requirements.txt
@@ -7,13 +7,14 @@ sentence-transformers
 faiss-cpu
 huggingface_hub
 transformers
-protobuf==3.20.0; sys_platform != 'darwin'
-protobuf==3.20.0; sys_platform == 'darwin' and platform_machine != 'arm64'
+protobuf>=3.20.0; sys_platform != 'darwin'
+protobuf>=3.20.0; sys_platform == 'darwin' and platform_machine != 'arm64'
 protobuf==3.20.3; sys_platform == 'darwin' and platform_machine == 'arm64'
 auto-gptq==0.2.2
 docx2txt
 unstructured
 unstructured[pdf]
+llama-cpp-python
 
 # Utilities
 urllib3==1.26.6
7 changes: 6 additions & 1 deletion run_localGPT.py
@@ -8,6 +8,11 @@
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler # for streaming response
 from langchain.callbacks.manager import CallbackManager
 
+torch.cuda.empty_cache()
+torch.cuda.memory_summary(device=None, abbreviated=False)
+
+torch.set_grad_enabled(False)
+
 callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
 
 from prompt_template_utils import get_prompt_template
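`torch.set_grad_enabled(False)` at import time switches autograd off globally, which is a reasonable choice for an inference-only script; a more scoped alternative is `torch.inference_mode()`, sketched below with a hypothetical helper:

```python
# Sketch of a scoped alternative to the global switch; `qa` stands in for the
# retrieval QA chain this script builds and is hypothetical here.
import torch

@torch.inference_mode()  # no autograd state is recorded inside this function
def answer(qa, query: str):
    return qa(query)
```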
@@ -78,7 +83,7 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
         "text-generation",
         model=model,
         tokenizer=tokenizer,
-        max_length=MAX_NEW_TOKENS,
+        max_length=50,
         temperature=0.2,
         # top_p=0.95,
         repetition_penalty=1.15,
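One behavioral note on the `max_length=50` change: in the Transformers `text-generation` pipeline, `max_length` counts prompt plus generated tokens, so a long retrieved context can leave almost no room for the answer. Capping only the generated part is normally done with `max_new_tokens`; a small self-contained sketch (`gpt2` is used purely as a stand-in model so the example runs anywhere):

```python
# Self-contained sketch: max_new_tokens limits only the generated tokens,
# whereas max_length also counts the prompt. "gpt2" is just a small stand-in.
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="gpt2",
    max_new_tokens=64,       # illustrative cap on generated tokens only
    do_sample=True,          # sampling so temperature actually applies
    temperature=0.2,
    repetition_penalty=1.15,
)
print(pipe("Question: what does a local RAG pipeline do?\nAnswer:")[0]["generated_text"])
```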
