From 32daa49c4f0c35a92573c2026afe1d38bc8112bb Mon Sep 17 00:00:00 2001 From: Ankush Singal Date: Fri, 8 Dec 2023 20:45:27 +0530 Subject: [PATCH] Add files via upload --- StableLM/StableLM-Zephyr-3B.ipynb | 630 ++++++++++++++++++++++++++++++ 1 file changed, 630 insertions(+) create mode 100644 StableLM/StableLM-Zephyr-3B.ipynb diff --git a/StableLM/StableLM-Zephyr-3B.ipynb b/StableLM/StableLM-Zephyr-3B.ipynb new file mode 100644 index 0000000..536a519 --- /dev/null +++ b/StableLM/StableLM-Zephyr-3B.ipynb @@ -0,0 +1,630 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "7d0f6393-aea2-498d-af05-84f74cf5b641", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting llama-cpp-python\n", + " Downloading llama_cpp_python-0.2.20.tar.gz (8.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.7/8.7 MB\u001b[0m \u001b[31m36.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25h Installing build dependencies ... \u001b[?25ldone\n", + "\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n", + "\u001b[?25h Installing backend dependencies ... \u001b[?25ldone\n", + "\u001b[?25h Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n", + "\u001b[?25hCollecting typing-extensions>=4.5.0 (from llama-cpp-python)\n", + " Downloading typing_extensions-4.8.0-py3-none-any.whl.metadata (3.0 kB)\n", + "Requirement already satisfied: numpy>=1.20.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (1.24.1)\n", + "Collecting diskcache>=5.6.1 (from llama-cpp-python)\n", + " Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)\n", + "Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.5/45.5 kB\u001b[0m \u001b[31m1.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading typing_extensions-4.8.0-py3-none-any.whl (31 kB)\n", + "Building wheels for collected packages: llama-cpp-python\n", + " Building wheel for llama-cpp-python (pyproject.toml) ... \u001b[?25ldone\n", + "\u001b[?25h Created wheel for llama-cpp-python: filename=llama_cpp_python-0.2.20-cp310-cp310-manylinux_2_35_x86_64.whl size=2001190 sha256=e36ed0d07baeb2f21bf48eab6eb840de5b1ff6c502e816cf2577461366ac7c67\n", + " Stored in directory: /root/.cache/pip/wheels/ef/f2/d2/0becb03047a348d7bd9a5b91ec88f4654d6fa7d67ea4e84d43\n", + "Successfully built llama-cpp-python\n", + "Installing collected packages: typing-extensions, diskcache, llama-cpp-python\n", + " Attempting uninstall: typing-extensions\n", + " Found existing installation: typing_extensions 4.4.0\n", + " Uninstalling typing_extensions-4.4.0:\n", + " Successfully uninstalled typing_extensions-4.4.0\n", + "Successfully installed diskcache-5.6.3 llama-cpp-python-0.2.20 typing-extensions-4.8.0\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0mRequirement already satisfied: llama-cpp-python in /usr/local/lib/python3.10/dist-packages (0.2.20)\n", + "Requirement already satisfied: typing-extensions>=4.5.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (4.8.0)\n", + "Requirement already satisfied: numpy>=1.20.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (1.24.1)\n", + "Requirement already satisfied: diskcache>=5.6.1 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (5.6.3)\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0mRequirement already satisfied: llama-cpp-python in /usr/local/lib/python3.10/dist-packages (0.2.20)\n", + "Requirement already satisfied: typing-extensions>=4.5.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (4.8.0)\n", + "Requirement already satisfied: numpy>=1.20.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (1.24.1)\n", + "Requirement already satisfied: diskcache>=5.6.1 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (5.6.3)\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0mRequirement already satisfied: llama-cpp-python in /usr/local/lib/python3.10/dist-packages (0.2.20)\n", + "Requirement already satisfied: typing-extensions>=4.5.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (4.8.0)\n", + "Requirement already satisfied: numpy>=1.20.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (1.24.1)\n", + "Requirement already satisfied: diskcache>=5.6.1 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (5.6.3)\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0mRequirement already satisfied: llama-cpp-python in /usr/local/lib/python3.10/dist-packages (0.2.20)\n", + "Requirement already satisfied: typing-extensions>=4.5.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (4.8.0)\n", + "Requirement already satisfied: numpy>=1.20.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (1.24.1)\n", + "Requirement already satisfied: diskcache>=5.6.1 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (5.6.3)\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
+ "\u001b[0mRequirement already satisfied: llama-cpp-python in /usr/local/lib/python3.10/dist-packages (0.2.20)\n",
+ "Requirement already satisfied: typing-extensions>=4.5.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (4.8.0)\n",
+ "Requirement already satisfied: numpy>=1.20.0 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (1.24.1)\n",
+ "Requirement already satisfied: diskcache>=5.6.1 in /usr/local/lib/python3.10/dist-packages (from llama-cpp-python) (5.6.3)\n",
+ "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
+ "\u001b[0m"
+ ]
+ }
+ ],
+ "source": [
+ "# Base llama-cpp-python with no GPU acceleration\n",
+ "!pip install llama-cpp-python\n",
+ "# With NVidia CUDA acceleration\n",
+ "!CMAKE_ARGS=\"-DLLAMA_CUBLAS=on\" pip install llama-cpp-python\n",
+ "# Or with OpenBLAS acceleration\n",
+ "!CMAKE_ARGS=\"-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS\" pip install llama-cpp-python\n",
+ "# Or with CLBLast acceleration\n",
+ "!CMAKE_ARGS=\"-DLLAMA_CLBLAST=on\" pip install llama-cpp-python\n",
+ "# Or with AMD ROCm GPU acceleration (Linux only)\n",
+ "!CMAKE_ARGS=\"-DLLAMA_HIPBLAS=on\" pip install llama-cpp-python\n",
+ "# Or with Metal GPU acceleration for macOS systems only\n",
+ "!CMAKE_ARGS=\"-DLLAMA_METAL=on\" pip install llama-cpp-python\n",
+ "\n",
+ "# On Windows, set the CMAKE_ARGS variable in PowerShell before installing; e.g. for NVidia CUDA:\n",
+ "#$env:CMAKE_ARGS = \"-DLLAMA_CUBLAS=on\"\n",
+ "#pip install llama-cpp-python\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "375311eb-79fc-47bd-acc3-dc93f9aa0b1d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "--2023-12-08 14:55:10-- https://huggingface.co/TheBloke/stablelm-zephyr-3b-GGUF/resolve/main/stablelm-zephyr-3b.Q4_K_M.gguf\n",
+ "Resolving huggingface.co (huggingface.co)... 3.161.213.110, 3.161.213.25, 3.161.213.11, ...\n",
+ "Connecting to huggingface.co (huggingface.co)|3.161.213.110|:443... connected.\n",
+ "HTTP request sent, awaiting response... 
302 Found\n", + "Location: https://cdn-lfs-us-1.huggingface.co/repos/26/b5/26b5e8ac044170dde1558b79e36c57d2e5dfc5b15833f0afdcb1bc79190717a5/74b2613b6e89d904a2ea38d56d233e4d2ca2fe663844b2e7aa90e769d359061b?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27stablelm-zephyr-3b.Q4_K_M.gguf%3B+filename%3D%22stablelm-zephyr-3b.Q4_K_M.gguf%22%3B&Expires=1702306510&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwMjMwNjUxMH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzI2L2I1LzI2YjVlOGFjMDQ0MTcwZGRlMTU1OGI3OWUzNmM1N2QyZTVkZmM1YjE1ODMzZjBhZmRjYjFiYzc5MTkwNzE3YTUvNzRiMjYxM2I2ZTg5ZDkwNGEyZWEzOGQ1NmQyMzNlNGQyY2EyZmU2NjM4NDRiMmU3YWE5MGU3NjlkMzU5MDYxYj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=lBIbo3WXcoOpcOjPW1mDgXBo1zokX3kwg1JeUxuazVck6sxKYUPpiK31Qytdb4LV7K8vSNHLQrJmwrltOCifmuNju2VUo4eItgV-9vGlCG15q7gsnVNd0Hm9jJuROo69XyoZAztKt4imJFgnukoio46rDsiyQuEcJ1AcLbG9TyPFOGZdxCXzPgz9103VHpUSOoMcy1oDOr5CVpF29614DiPj2T5BT2Q4eMEB6ltIuabzMpBot%7EsAMqToBP0-nWtxvXZGXIL-EAxXNpDlMoAU9wyW5SPTwg7Zby7HK9H7hX1AengjtcZ4JOm4TLiaaOlycfmio01QrNBwJ%7ET4o5BF7A__&Key-Pair-Id=KCD77M1F0VK2B [following]\n", + "--2023-12-08 14:55:10-- https://cdn-lfs-us-1.huggingface.co/repos/26/b5/26b5e8ac044170dde1558b79e36c57d2e5dfc5b15833f0afdcb1bc79190717a5/74b2613b6e89d904a2ea38d56d233e4d2ca2fe663844b2e7aa90e769d359061b?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27stablelm-zephyr-3b.Q4_K_M.gguf%3B+filename%3D%22stablelm-zephyr-3b.Q4_K_M.gguf%22%3B&Expires=1702306510&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwMjMwNjUxMH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzI2L2I1LzI2YjVlOGFjMDQ0MTcwZGRlMTU1OGI3OWUzNmM1N2QyZTVkZmM1YjE1ODMzZjBhZmRjYjFiYzc5MTkwNzE3YTUvNzRiMjYxM2I2ZTg5ZDkwNGEyZWEzOGQ1NmQyMzNlNGQyY2EyZmU2NjM4NDRiMmU3YWE5MGU3NjlkMzU5MDYxYj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=lBIbo3WXcoOpcOjPW1mDgXBo1zokX3kwg1JeUxuazVck6sxKYUPpiK31Qytdb4LV7K8vSNHLQrJmwrltOCifmuNju2VUo4eItgV-9vGlCG15q7gsnVNd0Hm9jJuROo69XyoZAztKt4imJFgnukoio46rDsiyQuEcJ1AcLbG9TyPFOGZdxCXzPgz9103VHpUSOoMcy1oDOr5CVpF29614DiPj2T5BT2Q4eMEB6ltIuabzMpBot%7EsAMqToBP0-nWtxvXZGXIL-EAxXNpDlMoAU9wyW5SPTwg7Zby7HK9H7hX1AengjtcZ4JOm4TLiaaOlycfmio01QrNBwJ%7ET4o5BF7A__&Key-Pair-Id=KCD77M1F0VK2B\n", + "Resolving cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)... 3.162.3.62, 3.162.3.84, 3.162.3.52, ...\n", + "Connecting to cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)|3.162.3.62|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 1708595648 (1.6G) [binary/octet-stream]\n", + "Saving to: ‘stablelm-zephyr-3b.Q4_K_M.gguf’\n", + "\n", + "stablelm-zephyr-3b. 
100%[===================>] 1.59G 48.9MB/s in 35s \n", + "\n", + "2023-12-08 14:55:46 (46.5 MB/s) - ‘stablelm-zephyr-3b.Q4_K_M.gguf’ saved [1708595648/1708595648]\n", + "\n" + ] + } + ], + "source": [ + "!wget https://huggingface.co/TheBloke/stablelm-zephyr-3b-GGUF/resolve/main/stablelm-zephyr-3b.Q4_K_M.gguf" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c1808a48-68d8-4e04-ae35-f6a7eef7fd9b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_loader: loaded meta data with 21 key-value pairs and 356 tensors from /workspace/stablelm-zephyr-3b.Q4_K_M.gguf (version GGUF V3 (latest))\n", + "llama_model_loader: - tensor 0: output.weight q6_K [ 2560, 50304, 1, 1 ]\n", + "llama_model_loader: - tensor 1: token_embd.weight q4_K [ 2560, 50304, 1, 1 ]\n", + "llama_model_loader: - tensor 2: blk.0.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 3: blk.0.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 4: blk.0.ffn_down.weight q6_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 5: blk.0.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 6: blk.0.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 7: blk.0.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 8: blk.0.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 9: blk.0.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 10: blk.0.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 11: blk.0.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 12: blk.0.attn_v.weight q6_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 13: blk.1.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 14: blk.1.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 15: blk.1.ffn_down.weight q6_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 16: blk.1.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 17: blk.1.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 18: blk.1.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 19: blk.1.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 20: blk.1.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 21: blk.1.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 22: blk.1.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 23: blk.1.attn_v.weight q6_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 24: blk.10.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 25: blk.10.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 26: blk.10.ffn_down.weight q6_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 27: blk.10.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 28: blk.10.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 29: blk.10.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 30: blk.10.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 31: blk.10.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 32: blk.10.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 33: 
blk.10.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 34: blk.10.attn_v.weight q6_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 35: blk.11.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 36: blk.11.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 37: blk.11.ffn_down.weight q6_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 38: blk.11.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 39: blk.11.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 40: blk.11.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 41: blk.11.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 42: blk.11.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 43: blk.11.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 44: blk.11.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 45: blk.11.attn_v.weight q6_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 46: blk.12.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 47: blk.12.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 48: blk.12.ffn_down.weight q4_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 49: blk.12.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 50: blk.12.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 51: blk.12.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 52: blk.12.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 53: blk.12.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 54: blk.12.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 55: blk.12.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 56: blk.12.attn_v.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 57: blk.13.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 58: blk.13.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 59: blk.13.ffn_down.weight q4_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 60: blk.13.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 61: blk.13.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 62: blk.13.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 63: blk.13.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 64: blk.13.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 65: blk.13.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 66: blk.13.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 67: blk.13.attn_v.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 68: blk.14.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 69: blk.14.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 70: blk.14.ffn_down.weight q6_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 71: blk.14.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 72: blk.14.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 73: blk.14.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - 
tensor 74: blk.14.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 75: blk.14.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 76: blk.14.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 77: blk.14.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 78: blk.14.attn_v.weight q6_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 79: blk.15.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 80: blk.15.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 81: blk.15.ffn_down.weight q4_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 82: blk.15.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 83: blk.15.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 84: blk.15.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 85: blk.15.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 86: blk.15.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 87: blk.15.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 88: blk.15.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 89: blk.15.attn_v.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 90: blk.16.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 91: blk.16.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 92: blk.16.ffn_down.weight q4_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 93: blk.16.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 94: blk.16.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 95: blk.16.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 96: blk.16.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 97: blk.16.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 98: blk.16.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 99: blk.16.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 100: blk.16.attn_v.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 101: blk.17.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 102: blk.17.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 103: blk.17.ffn_down.weight q6_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 104: blk.17.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 105: blk.17.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 106: blk.17.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 107: blk.17.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 108: blk.17.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 109: blk.17.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 110: blk.17.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 111: blk.17.attn_v.weight q6_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 112: blk.18.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 113: blk.18.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 114: blk.18.ffn_down.weight q4_K [ 6912, 2560, 1, 1 ]\n", 
+ "llama_model_loader: - tensor 115: blk.18.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 116: blk.18.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 117: blk.18.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 118: blk.18.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 119: blk.18.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 120: blk.18.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 121: blk.18.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 122: blk.18.attn_v.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 123: blk.19.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 124: blk.19.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 125: blk.19.ffn_down.weight q4_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 126: blk.19.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 127: blk.19.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 128: blk.19.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 129: blk.19.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 130: blk.19.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 131: blk.19.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 132: blk.19.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 133: blk.19.attn_v.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 134: blk.2.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 135: blk.2.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 136: blk.2.ffn_down.weight q6_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 137: blk.2.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 138: blk.2.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 139: blk.2.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 140: blk.2.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 141: blk.2.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 142: blk.2.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 143: blk.2.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 144: blk.2.attn_v.weight q6_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 145: blk.20.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 146: blk.20.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 147: blk.20.ffn_down.weight q4_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 148: blk.20.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 149: blk.20.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 150: blk.20.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 151: blk.20.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 152: blk.20.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 153: blk.20.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 154: blk.20.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 155: 
blk.20.attn_v.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 156: blk.21.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 157: blk.21.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 158: blk.21.ffn_down.weight q4_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 159: blk.21.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 160: blk.21.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 161: blk.21.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 162: blk.21.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 163: blk.21.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 164: blk.21.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 165: blk.21.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 166: blk.21.attn_v.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 167: blk.22.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 168: blk.22.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 169: blk.22.ffn_down.weight q6_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 170: blk.22.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 171: blk.22.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 172: blk.22.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 173: blk.22.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 174: blk.22.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 175: blk.22.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 176: blk.22.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 177: blk.22.attn_v.weight q6_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 178: blk.23.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 179: blk.23.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 180: blk.23.ffn_down.weight q4_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 181: blk.23.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 182: blk.23.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 183: blk.23.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 184: blk.23.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 185: blk.23.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 186: blk.23.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 187: blk.23.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 188: blk.23.attn_v.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 189: blk.24.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 190: blk.24.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 191: blk.24.ffn_down.weight q4_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 192: blk.24.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 193: blk.24.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 194: blk.24.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 195: blk.24.ffn_norm.weight f32 [ 2560, 1, 1, 
1 ]\n", + "llama_model_loader: - tensor 196: blk.24.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 197: blk.24.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 198: blk.24.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 199: blk.24.attn_v.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 200: blk.25.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 201: blk.25.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 202: blk.25.ffn_down.weight q6_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 203: blk.25.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 204: blk.25.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 205: blk.25.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 206: blk.25.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 207: blk.25.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 208: blk.25.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 209: blk.25.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 210: blk.25.attn_v.weight q6_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 211: blk.26.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 212: blk.26.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 213: blk.26.ffn_down.weight q4_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 214: blk.26.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 215: blk.26.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 216: blk.26.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 217: blk.26.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 218: blk.26.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 219: blk.26.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 220: blk.26.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 221: blk.26.attn_v.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 222: blk.27.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 223: blk.27.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 224: blk.27.ffn_down.weight q4_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 225: blk.27.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 226: blk.27.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 227: blk.27.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 228: blk.27.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 229: blk.27.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 230: blk.27.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 231: blk.27.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 232: blk.27.attn_v.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 233: blk.28.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 234: blk.28.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 235: blk.28.ffn_down.weight q6_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - 
tensor 236: blk.28.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 237: blk.28.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 238: blk.28.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 239: blk.28.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 240: blk.28.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 241: blk.28.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 242: blk.28.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 243: blk.28.attn_v.weight q6_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 244: blk.29.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 245: blk.29.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 246: blk.29.ffn_down.weight q4_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 247: blk.29.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 248: blk.29.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 249: blk.29.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 250: blk.29.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 251: blk.29.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 252: blk.29.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 253: blk.29.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 254: blk.29.attn_v.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 255: blk.3.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 256: blk.3.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 257: blk.3.ffn_down.weight q4_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 258: blk.3.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 259: blk.3.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 260: blk.3.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 261: blk.3.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 262: blk.3.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 263: blk.3.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 264: blk.3.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 265: blk.3.attn_v.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 266: blk.30.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 267: blk.30.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 268: blk.30.ffn_down.weight q6_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 269: blk.30.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 270: blk.30.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 271: blk.30.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 272: blk.30.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 273: blk.30.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 274: blk.30.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 275: blk.30.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 276: blk.30.attn_v.weight q6_K [ 2560, 
2560, 1, 1 ]\n", + "llama_model_loader: - tensor 277: blk.31.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 278: blk.31.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 279: blk.31.ffn_down.weight q4_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 280: blk.31.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 281: blk.31.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 282: blk.31.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 283: blk.31.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 284: blk.31.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 285: blk.31.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 286: blk.31.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 287: blk.31.attn_v.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 288: blk.4.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 289: blk.4.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 290: blk.4.ffn_down.weight q4_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 291: blk.4.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 292: blk.4.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 293: blk.4.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 294: blk.4.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 295: blk.4.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 296: blk.4.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 297: blk.4.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 298: blk.4.attn_v.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 299: blk.5.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 300: blk.5.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 301: blk.5.ffn_down.weight q6_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 302: blk.5.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 303: blk.5.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 304: blk.5.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 305: blk.5.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 306: blk.5.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 307: blk.5.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 308: blk.5.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 309: blk.5.attn_v.weight q6_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 310: blk.6.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 311: blk.6.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 312: blk.6.ffn_down.weight q6_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 313: blk.6.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 314: blk.6.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 315: blk.6.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 316: blk.6.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 317: 
blk.6.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 318: blk.6.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 319: blk.6.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 320: blk.6.attn_v.weight q6_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 321: blk.7.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 322: blk.7.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 323: blk.7.ffn_down.weight q6_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 324: blk.7.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 325: blk.7.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 326: blk.7.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 327: blk.7.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 328: blk.7.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 329: blk.7.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 330: blk.7.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 331: blk.7.attn_v.weight q6_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 332: blk.8.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 333: blk.8.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 334: blk.8.ffn_down.weight q6_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 335: blk.8.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 336: blk.8.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 337: blk.8.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 338: blk.8.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 339: blk.8.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 340: blk.8.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 341: blk.8.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 342: blk.8.attn_v.weight q6_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 343: blk.9.attn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 344: blk.9.attn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 345: blk.9.ffn_down.weight q6_K [ 6912, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 346: blk.9.ffn_gate.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 347: blk.9.ffn_up.weight q4_K [ 2560, 6912, 1, 1 ]\n", + "llama_model_loader: - tensor 348: blk.9.ffn_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 349: blk.9.ffn_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 350: blk.9.attn_k.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 351: blk.9.attn_output.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 352: blk.9.attn_q.weight q4_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 353: blk.9.attn_v.weight q6_K [ 2560, 2560, 1, 1 ]\n", + "llama_model_loader: - tensor 354: output_norm.bias f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 355: output_norm.weight f32 [ 2560, 1, 1, 1 ]\n", + "llama_model_loader: - kv 0: general.architecture str = stablelm\n", + "llama_model_loader: - kv 1: general.name str = source\n", + "llama_model_loader: - kv 2: stablelm.context_length u32 = 4096\n", + 
"llama_model_loader: - kv 3: stablelm.embedding_length u32 = 2560\n", + "llama_model_loader: - kv 4: stablelm.block_count u32 = 32\n", + "llama_model_loader: - kv 5: stablelm.feed_forward_length u32 = 6912\n", + "llama_model_loader: - kv 6: stablelm.rope.dimension_count u32 = 20\n", + "llama_model_loader: - kv 7: stablelm.attention.head_count u32 = 32\n", + "llama_model_loader: - kv 8: stablelm.use_parallel_residual bool = true\n", + "llama_model_loader: - kv 9: stablelm.attention.layer_norm_epsilon f32 = 0.000010\n", + "llama_model_loader: - kv 10: tokenizer.ggml.model str = gpt2\n", + "llama_model_loader: - kv 11: tokenizer.ggml.tokens arr[str,50304] = [\"<|endoftext|>\", \"<|padding|>\", \"!\",...\n", + "llama_model_loader: - kv 12: tokenizer.ggml.token_type arr[i32,50304] = [3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\n", + "llama_model_loader: - kv 13: tokenizer.ggml.merges arr[str,50009] = [\"Ġ Ġ\", \"Ġ t\", \"Ġ a\", \"h e\", \"i n...\n", + "llama_model_loader: - kv 14: tokenizer.ggml.bos_token_id u32 = 0\n", + "llama_model_loader: - kv 15: tokenizer.ggml.eos_token_id u32 = 0\n", + "llama_model_loader: - kv 16: tokenizer.ggml.unknown_token_id u32 = 0\n", + "llama_model_loader: - kv 17: tokenizer.ggml.padding_token_id u32 = 0\n", + "llama_model_loader: - kv 18: tokenizer.chat_template str = {% for message in messages %}\\n{% if m...\n", + "llama_model_loader: - kv 19: general.quantization_version u32 = 2\n", + "llama_model_loader: - kv 20: general.file_type u32 = 15\n", + "llama_model_loader: - type f32: 130 tensors\n", + "llama_model_loader: - type q4_K: 193 tensors\n", + "llama_model_loader: - type q6_K: 33 tensors\n", + "llm_load_vocab: mismatch in special tokens definition ( 31/50304 vs 52/50304 ).\n", + "llm_load_print_meta: format = GGUF V3 (latest)\n", + "llm_load_print_meta: arch = stablelm\n", + "llm_load_print_meta: vocab type = BPE\n", + "llm_load_print_meta: n_vocab = 50304\n", + "llm_load_print_meta: n_merges = 50009\n", + "llm_load_print_meta: n_ctx_train = 4096\n", + "llm_load_print_meta: n_embd = 2560\n", + "llm_load_print_meta: n_head = 32\n", + "llm_load_print_meta: n_head_kv = 32\n", + "llm_load_print_meta: n_layer = 32\n", + "llm_load_print_meta: n_rot = 20\n", + "llm_load_print_meta: n_gqa = 1\n", + "llm_load_print_meta: f_norm_eps = 1.0e-05\n", + "llm_load_print_meta: f_norm_rms_eps = 0.0e+00\n", + "llm_load_print_meta: f_clamp_kqv = 0.0e+00\n", + "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n", + "llm_load_print_meta: n_ff = 6912\n", + "llm_load_print_meta: rope scaling = linear\n", + "llm_load_print_meta: freq_base_train = 10000.0\n", + "llm_load_print_meta: freq_scale_train = 1\n", + "llm_load_print_meta: n_yarn_orig_ctx = 4096\n", + "llm_load_print_meta: rope_finetuned = unknown\n", + "llm_load_print_meta: model type = 3B\n", + "llm_load_print_meta: model ftype = mostly Q4_K - Medium\n", + "llm_load_print_meta: model params = 2.80 B\n", + "llm_load_print_meta: model size = 1.59 GiB (4.88 BPW) \n", + "llm_load_print_meta: general.name = source\n", + "llm_load_print_meta: BOS token = 0 '<|endoftext|>'\n", + "llm_load_print_meta: EOS token = 0 '<|endoftext|>'\n", + "llm_load_print_meta: UNK token = 0 '<|endoftext|>'\n", + "llm_load_print_meta: PAD token = 0 '<|endoftext|>'\n", + "llm_load_print_meta: LF token = 128 'Ä'\n", + "llm_load_tensors: ggml ctx size = 0.13 MiB\n", + "llm_load_tensors: mem required = 1627.87 MiB\n", + "............................................................................................\n", + "llama_new_context_with_model: n_ctx = 
512\n", + "llama_new_context_with_model: freq_base = 10000.0\n", + "llama_new_context_with_model: freq_scale = 1\n", + "llama_new_context_with_model: kv self size = 160.00 MiB\n", + "llama_build_graph: non-view tensors processed: 805/805\n", + "llama_new_context_with_model: compute buffer total size = 111.31 MiB\n", + "AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | \n", + "\n", + "llama_print_timings: load time = 2638.30 ms\n", + "llama_print_timings: sample time = 176.50 ms / 417 runs ( 0.42 ms per token, 2362.63 tokens per second)\n", + "llama_print_timings: prompt eval time = 2638.26 ms / 33 tokens ( 79.95 ms per token, 12.51 tokens per second)\n", + "llama_print_timings: eval time = 43936.16 ms / 416 runs ( 105.62 ms per token, 9.47 tokens per second)\n", + "llama_print_timings: total time = 48444.48 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "{'id': 'chatcmpl-a7b6bc93-45fb-4aa8-80d6-f6a416a49bd0',\n", + " 'object': 'chat.completion',\n", + " 'created': 1702047380,\n", + " 'model': '/workspace/stablelm-zephyr-3b.Q4_K_M.gguf',\n", + " 'choices': [{'index': 0,\n", + " 'message': {'role': 'assistant',\n", + " 'content': '\\nOnce upon a time, in the Andes Mountains of South America, there lived a group of fluffy, pink and white llamas. They were well-trained and loved to carry their human friends on their backs up into the mountains where they would explore new trails and valleys. The llamas had soft fur that kept them warm during the cold nights and protected them from the sun\\'s harsh rays during the day.\\n\\nOne day, a group of children went on an adventure with their llama friends. They trekked through fields of colorful flowers and climbed up steep hillsides to reach a hidden waterfall deep in the mountains. As they walked, the children marveled at how well the llamas could navigate through the rugged terrain and carry them safely over rocky paths and streams.\\n\\nAfter reaching the waterfall, the children took a refreshing swim in the cool waters and sat down to rest under the shade of a tree. Suddenly, they heard a loud rustling noise coming from the nearby bushes. They looked around nervously and saw a group of wild llamas running towards them, their fur standing on end as they prepared to attack.\\n\\nBut then, one of the children remembered something his mother had told him: \"Always be kind to animals.\" He quickly went over to the frightened llamas and spoke softly to them, petting and stroking their soft fur. The other children joined in, and soon the group of scared llamas stopped running and began to calm down.\\n\\nThe children learned that day that being kind and gentle can make a big difference in how animals perceive humans. They also realized that sometimes the best way to solve problems is by working together with animals who are already living in their environment. 
With this new understanding, they continued on their adventure, knowing that they had made friends with some of the most amazing creatures they could ever hope to meet.\\n\\nAnd so, the llamas and children returned home, each with a new appreciation for the wonders of nature and the importance of treating all animals with kindness and respect.'},\n", + " 'finish_reason': 'stop'}],\n", + " 'usage': {'prompt_tokens': 33, 'completion_tokens': 416, 'total_tokens': 449}}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from llama_cpp import Llama\n", + "\n", + "\n", + "# Chat Completion API\n", + "\n", + "llm = Llama(model_path=\"/workspace/stablelm-zephyr-3b.Q4_K_M.gguf\", chat_format=\"llama-2\") # Set chat_format according to the model you are using\n", + "llm.create_chat_completion(\n", + " messages = [\n", + " {\"role\": \"system\", \"content\": \"You are a story writing assistant.\"},\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"Write a story about llamas.\"\n", + " }\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbb4da7d-de5e-45d1-ad25-8e1b0b46ae5e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}
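
A note on the prompt format: the final cell passes chat_format="llama-2" to Llama(), while StableLM Zephyr 3B is published with the Zephyr prompt template (<|user|> ... <|endoftext|> <|assistant|>), which is also visible in the tokenizer.chat_template key printed by the loader above. The sketch below is a hedged alternative, not part of the recorded run: it reuses the notebook's model path and question, but builds the Zephyr-style prompt by hand and calls the raw completion API. Newer llama-cpp-python releases also appear to register a built-in "zephyr" chat format that could be passed instead of "llama-2" if your version provides it.

from llama_cpp import Llama

# Minimal sketch (assumption, not from the original notebook): prompt StableLM Zephyr 3B
# with its native Zephyr template through the raw completion API instead of create_chat_completion.
llm = Llama(model_path="/workspace/stablelm-zephyr-3b.Q4_K_M.gguf", n_ctx=4096)

# Zephyr template as documented on the GGUF model card:
# <|user|>\n{prompt}<|endoftext|>\n<|assistant|>\n
prompt = (
    "<|user|>\n"
    "Write a story about llamas.<|endoftext|>\n"
    "<|assistant|>\n"
)

out = llm(prompt, max_tokens=512, stop=["<|endoftext|>"], echo=False)
print(out["choices"][0]["text"])

Because <|endoftext|> doubles as the EOS and PAD token for this model (see the BOS/EOS/UNK/PAD lines in the loader output), passing it as a stop string keeps the completion from running past the assistant turn.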