{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div align=center><img src=\"./assets/rotary_embedding.png\"></div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from enum import Enum\n",
    "import numpy as np\n",
    "\n",
    "from mindspore.common.tensor import Tensor\n",
    "import mindspore.common.dtype as mstype\n",
    "from mindspore.ops import operations as P\n",
    "from mindspore.nn.cell import Cell\n",
    "\n",
    "\n",
    "# SeqExtendMethod and get_swap_mask are referenced by precompute_freqs_cis\n",
    "# below; the definitions follow the MindFormers Llama implementation.\n",
    "class SeqExtendMethod(Enum):\n",
    "    # String identifiers for the sequence-length extension method.\n",
    "    PI = \"PI\"\n",
    "    NTK = \"NTK\"\n",
    "    NONE = \"None\"\n",
    "\n",
    "\n",
    "def get_swap_mask(head_dim):\n",
    "    # Block matrix [[0, I], [-I, 0]] so that x @ mask = [-x2, x1] (rotate half).\n",
    "    zero_block = np.zeros((head_dim // 2, head_dim // 2), dtype=np.float32)\n",
    "    id_block = np.identity(head_dim // 2, dtype=np.float32)\n",
    "    return np.block([[zero_block, id_block], [-id_block, zero_block]])\n",
    "\n",
    "\n",
    "def precompute_freqs_cis(\n",
    "        dim: int,\n",
    "        end: int,\n",
    "        theta: float = 10000.0,\n",
    "        dtype=mstype.float32,\n",
    "        pretrain_seqlen=2048,\n",
    "        extend_method=SeqExtendMethod.NONE.value):\n",
    "    \"\"\"Precompute the rotation frequencies and the swap mask for rotary embedding.\"\"\"\n",
    "    ratio = 1.\n",
    "    if extend_method != SeqExtendMethod.NONE.value and end > pretrain_seqlen:\n",
    "        ratio = end / pretrain_seqlen\n",
    "    if extend_method == SeqExtendMethod.NTK.value:\n",
    "        theta *= ratio\n",
    "\n",
    "    # 2i/d, e.g. with dim = 64:\n",
    "    # 2i: np.arange(0, dim, 2) ==> [0, 2, 4, ..., 62], tot_num = 32\n",
    "    # the [: (dim // 2)] slice keeps exactly dim // 2 = 32 entries\n",
    "    freqs_base = np.arange(0, dim, 2)[: (dim // 2)].astype(np.float32)  # (head_dim // 2, )\n",
    "\n",
    "    # theta**(-2i/d) = 1 / theta**(2i/d)\n",
    "    # (dim // 2,) => (32,)\n",
    "    freqs = 1.0 / (theta ** (freqs_base / dim))  # (head_dim // 2, )\n",
    "\n",
    "    # t is the position index m: t = [0, 1, 2, ..., end - 1]\n",
    "    if extend_method == SeqExtendMethod.PI.value:\n",
    "        # Position interpolation: squeeze end positions into the pretrain range.\n",
    "        t = np.arange(0, end / ratio, 1 / ratio).astype(np.float32)\n",
    "    else:\n",
    "        t = np.arange(0, end, 1).astype(np.float32)  # (seq_len,)\n",
    "    # outer product of (seq_len,) and (head_dim // 2,) gives m * theta_i\n",
    "    freqs = np.outer(t, freqs)  # (seq_len, head_dim // 2)\n",
    "    # duplicate for the half-split layout: the same angle rotates both halves\n",
    "    emb = np.concatenate((freqs, freqs), axis=-1)  # (seq_len, head_dim)\n",
    "\n",
    "    freqs_cos = np.cos(emb)  # cos(m * theta_i), (seq_len, head_dim)\n",
    "    freqs_sin = np.sin(emb)  # sin(m * theta_i), (seq_len, head_dim)\n",
    "    freqs_cos = Tensor(freqs_cos, dtype=dtype)\n",
    "    freqs_sin = Tensor(freqs_sin, dtype=dtype)\n",
    "\n",
    "    swap_mask = get_swap_mask(dim)\n",
    "    swap_mask = Tensor(swap_mask, dtype=dtype)\n",
    "\n",
    "    return freqs_cos, freqs_sin, swap_mask"
   ]
  },
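  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick shape check (illustrative, not part of the original source): call `precompute_freqs_cis` with toy sizes and inspect the returned tensors."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Toy sizes: head_dim = 8, seq_len = 16 (assumed values for illustration).\n",
    "freqs_cos, freqs_sin, swap_mask = precompute_freqs_cis(dim=8, end=16)\n",
    "print(freqs_cos.shape)  # (16, 8): one cos(m * theta_i) per position and channel\n",
    "print(freqs_sin.shape)  # (16, 8)\n",
    "print(swap_mask.shape)  # (8, 8): block matrix implementing x -> [-x2, x1]"
   ]
  },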
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For a 2D vector, the rotation is expressed in matrix form as:\n",
    "\n",
    "<div align=center><img src=\"./assets/rotation-2d.png\"></div>\n",
    "\n",
    "Extending to the general form, i.e. when the model's hidden size is larger than 2:\n",
    "\n",
    "<div align=center><img src=\"./assets/rotation-general.png\"></div>\n",
    "\n",
    "After applying the rotation to q and k, their dot product reduces to the formula below, which depends only on the relative position between the two vectors:\n",
    "\n",
    "<div align=center><img src=\"./assets/formula.png\"></div>\n",
    "\n",
    "However, since the rotation matrix is sparse, implementing it as a full matrix multiplication wastes compute. In practice, RoPE is usually computed element-wise as follows:\n",
    "\n",
    "<div align=center><img src=\"./assets/rope-calculation.png\"></div>"
   ]
  },
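  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The cell below is a small numpy sketch (added for illustration, not part of the original code) that checks both claims: the element-wise \"rotate half\" formula matches the full rotation matrix, and the dot product of rotated q and k depends only on the relative offset between positions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "dim, theta = 8, 10000.0\n",
    "inv_freq = 1.0 / theta ** (np.arange(0, dim, 2) / dim)  # theta_i, (dim // 2,)\n",
    "\n",
    "def rotate(x, m):\n",
    "    # Element-wise RoPE at position m, half-split convention as above.\n",
    "    angles = np.concatenate((m * inv_freq, m * inv_freq))  # (dim,)\n",
    "    x1, x2 = x[: dim // 2], x[dim // 2:]\n",
    "    rotated_half = np.concatenate((-x2, x1))  # the 'rotate half' trick\n",
    "    return x * np.cos(angles) + rotated_half * np.sin(angles)\n",
    "\n",
    "def rotation_matrix(m):\n",
    "    # Full (sparse) block rotation matrix for the same convention.\n",
    "    c, s = np.diag(np.cos(m * inv_freq)), np.diag(np.sin(m * inv_freq))\n",
    "    return np.block([[c, -s], [s, c]])\n",
    "\n",
    "rng = np.random.default_rng(0)\n",
    "q, k = rng.standard_normal(dim), rng.standard_normal(dim)\n",
    "\n",
    "# 1) The element-wise formula equals the full matrix multiplication.\n",
    "print(np.allclose(rotation_matrix(5) @ q, rotate(q, 5)))  # True\n",
    "\n",
    "# 2) The dot product depends only on the relative offset (3 in both cases).\n",
    "print(np.allclose(rotate(q, 5) @ rotate(k, 2), rotate(q, 103) @ rotate(k, 100)))  # True"
   ]
  },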
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class LlamaRotaryEmbedding(Cell):\n",
    "    r\"\"\"\n",
    "    Rotary Position Embedding.\n",
    "\n",
    "    Args:\n",
    "        - **head_dim** (int): The dimension of each attention head. Default: 128.\n",
    "        - **compute_dtype** (mstype): The compute type. Default: mstype.float32.\n",
    "\n",
    "    Inputs:\n",
    "        - **xq** (Tensor) - Queries of shape :math:`(batch, n\\_head, seq\\_length, head\\_dim)`.\n",
    "        - **xk** (Tensor) - Keys of shape :math:`(batch, n\\_kv\\_head, seq\\_length, head\\_dim)`.\n",
    "        - **freqs_cis** (tuple) - (freqs_cos, freqs_sin, swap_mask) from precompute_freqs_cis.\n",
    "\n",
    "    Outputs:\n",
    "        Tuple (xq_out, xk_out) of tensors with the same shapes as the inputs.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, head_dim=128, compute_dtype=mstype.float32):\n",
    "        super().__init__(auto_prefix=False)\n",
    "        self.head_dim = head_dim\n",
    "        self.dtype = compute_dtype\n",
    "\n",
    "        self.add = P.Add()\n",
    "        self.bmm_swap = P.BatchMatMul()  # matmul with the swap mask\n",
    "        self.mul = P.Mul()\n",
    "        self.cast = P.Cast()\n",
    "\n",
    "    def rotate_half(self, x, swap_mask):\n",
    "        # x: [bs, n_head/n_kv_head, seq/1, head_dim], swap_mask: [head_dim, head_dim]\n",
    "        # x @ swap_mask swaps and negates the two halves: [x1, x2] -> [-x2, x1]\n",
    "        x = self.bmm_swap(x, swap_mask)\n",
    "        return x\n",
    "\n",
    "    def construct(self, xq: Tensor, xk: Tensor, freqs_cis):\n",
    "        \"\"\"Forward of rotary position embedding.\"\"\"\n",
    "        original_type = xq.dtype\n",
    "        xq = self.cast(xq, self.dtype)\n",
    "        xk = self.cast(xk, self.dtype)\n",
    "        # xq, xk: [bs, n_head/n_kv_head, seq/1, head_dim]\n",
    "        freqs_cos, freqs_sin, swap_mask = freqs_cis\n",
    "        # x_out = x * cos(m * theta_i) + rotate_half(x) * sin(m * theta_i)\n",
    "        xq_out = self.add(self.mul(xq, freqs_cos),\n",
    "                          self.mul(self.rotate_half(xq, swap_mask), freqs_sin))\n",
    "        xk_out = self.add(self.mul(xk, freqs_cos),\n",
    "                          self.mul(self.rotate_half(xk, swap_mask), freqs_sin))\n",
    "\n",
    "        xq_out = self.cast(xq_out, original_type)\n",
    "        xk_out = self.cast(xk_out, original_type)\n",
    "        return xq_out, xk_out\n",
    "\n",
    "    def shard(self, strategy_in):\n",
    "        # strategy_in: sharding strategy for the 4-D activation tensors.\n",
    "        self.add.shard((strategy_in, strategy_in))\n",
    "        self.bmm_swap.shard((strategy_in, (1, 1)))\n",
    "        self.mul.shard((strategy_in, (strategy_in[0], 1, 1, 1)))"
   ]
  },
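  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal end-to-end sketch (shapes are assumed for illustration): apply `LlamaRotaryEmbedding` to random query/key tensors using the precomputed frequencies."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative shapes: batch = 1, n_head = 2, seq_len = 16, head_dim = 8.\n",
    "bs, n_head, seq_len, head_dim = 1, 2, 16, 8\n",
    "freqs_cis = precompute_freqs_cis(dim=head_dim, end=seq_len)\n",
    "\n",
    "rope = LlamaRotaryEmbedding(head_dim=head_dim)\n",
    "xq = Tensor(np.random.randn(bs, n_head, seq_len, head_dim), mstype.float32)\n",
    "xk = Tensor(np.random.randn(bs, n_head, seq_len, head_dim), mstype.float32)\n",
    "\n",
    "xq_out, xk_out = rope(xq, xk, freqs_cis)\n",
    "print(xq_out.shape, xk_out.shape)  # (1, 2, 16, 8) (1, 2, 16, 8)"
   ]
  }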
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mindspore_2.2",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.7.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}