vllm-project · Akshat-Tripathi · Nov 20, 2024 · Nov 20, 2024 · Nov 21, 2024 · Nov 22, 2024
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -45,6 +45,8 @@ docker run --privileged --net host --shm-size=16G -it \
     && pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py \
     && echo TEST_10 \
     && pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" \
+    && echo TEST_10 \
+    && pytest -s -v /workspace/vllm/tests/tpu/test_lora.py" \
 
 
 # TODO: This test fails because it uses RANDOM_SEED sampling

diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
@@ -47,7 +47,7 @@ def dist_init():
     temp_file = tempfile.mkstemp()[1]
 
     backend = "nccl"
-    if current_platform.is_cpu():
+    if current_platform.is_cpu() or current_platform.is_tpu():
         backend = "gloo"
 
     init_distributed_environment(world_size=1,

diff --git a/tests/lora/tpu/__init__.py b/tests/lora/tpu/__init__.py
diff --git a/tests/lora/tpu/test_pallas_kernels.py b/tests/lora/tpu/test_pallas_kernels.py
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+import torch
+
+# Required to register the custom ops
+import vllm.lora.ops.xla_ops.pallas  # noqa # pylint: disable=unused-import
+
+# Parameter grids fed to the pytest.mark.parametrize decorators in this file.
+N_TOKENS = [16, 1024, 4096]  # values for T (total number of tokens)
+HIDDEN_SIZES = [1024, 2048, 4096]  # values for D (input dim)
+
+DTYPES = [torch.float16]  # values for dtype
+NUM_LORA = [1, 4, 16]  # values for N (number of LoRAs)
+RANKS = [32, 256, 512]  # values for L (LoRA dim)
+
+
+def generate_test_data(T, D, L, N, seed, dtype=torch.float32):
+    """
+    Build seeded random inputs for a BGMV test together with the reference
+    result.
+
+    Args: (integers unless noted)
+        T: Total number of tokens
+        D: Input dim
+        L: LoRA dim
+        N: Number of LoRAs
+        seed: Value handed to torch.manual_seed
+        dtype: torch dtype of the generated inputs and LoRA weights
+
+    Returns a 4-tuple:
+        inputs:     torch.Tensor - shape (T, D)
+        loras:      torch.Tensor - shape (N, 1, L, D)
+        idxs:       torch.Tensor - shape (T, ), int32, values in [0, N)
+        ref_output: torch.Tensor - shape (T, L),
+                    i.e. ref_bgmv(inputs, loras, idxs)
+    """
+    torch.manual_seed(seed)
+    device = "xla"
+
+    # The draw order (inputs, loras, idxs) is kept fixed so that a given seed
+    # always yields the same tensors.
+    token_acts = torch.randn((T, D), device=device, dtype=dtype)
+    lora_weights = torch.randn((N, 1, L, D), device=device, dtype=dtype)
+    lora_ids = torch.randint(0, N, (T, ), dtype=torch.int32, device=device)
+
+    expected = ref_bgmv(token_acts, lora_weights, lora_ids)
+    return token_acts, lora_weights, lora_ids, expected
+
+
+def ref_bgmv(inputs: torch.Tensor, loras: torch.Tensor, idxs: torch.Tensor):
+    """
+    Reference BGMV: multiply every token by the LoRA matrix its index selects.
+
+    loras[idxs] gathers one matrix per token; a 4-D gather result has its
+    axis 1 squeezed before the batched matrix-vector product.
+    Returns a tensor of shape (batch_size, output_size).
+    """
+    per_token = loras[idxs]
+    if per_token.dim() == 4:
+        per_token = per_token.squeeze(dim=1)
+
+    n_tokens, out_features, in_features = per_token.shape
+    # Treat each token as a column vector so matmul batches over tokens
+    columns = inputs.reshape((n_tokens, in_features, 1))
+    product = torch.matmul(per_token, columns)
+    return product.reshape((n_tokens, out_features))
+
+
+# Sweep token counts, hidden sizes, ranks, LoRA counts, dtypes and both ops
+@pytest.mark.parametrize("T", N_TOKENS)
+@pytest.mark.parametrize("D", HIDDEN_SIZES)
+@pytest.mark.parametrize("L", RANKS)
+@pytest.mark.parametrize("N", NUM_LORA)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("op_type", ["shrink", "expand"])
+@pytest.mark.parametrize("seed", [0])
+def test_bgmv_correctness(T, D, L, N, dtype, op_type, seed):
+    """Check bgmv_shrink / bgmv_expand against ref_bgmv on random data."""
+    expanding = op_type == "expand"
+    if expanding:
+        # Swap so the input dim takes the rank value and the output dim the
+        # hidden size
+        D, L = L, D
+
+    inputs, loras, idxs, ref_output = generate_test_data(
+        T, D, L, N, seed, dtype)
+
+    # Run bgmv
+    if op_type != "shrink":
+        # bgmv_expand is handed the LoRA weights with their last two axes
+        # swapped
+        output = torch.ops.xla.bgmv_expand(inputs, loras.transpose(2, 3), idxs)
+    else:
+        output = torch.ops.xla.bgmv_shrink(inputs, loras, idxs)
+
+    # Make sure we have no NaNs
+    has_nan = torch.isnan(output).any()
+    assert not has_nan
+
+    # Compare with reference output
+    assert torch.allclose(output, ref_output, rtol=1e-2, atol=1e-2)
+
+
+# Sweep token counts, hidden sizes, ranks, LoRA counts and dtypes
+@pytest.mark.parametrize("T", N_TOKENS)
+@pytest.mark.parametrize("D", HIDDEN_SIZES)
+@pytest.mark.parametrize("L", RANKS)
+@pytest.mark.parametrize("N", NUM_LORA)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", [0])
+def test_lora_laning_correctness(T, D, L, N, dtype, seed):
+    """Chain bgmv_shrink into bgmv_expand and compare with two ref_bgmv passes."""
+    x, lora_a, lora_ids, _ = generate_test_data(T, D, L, N, seed, dtype)
+    # Second call swaps D and L; only its LoRA weights are used
+    lora_b = generate_test_data(T, L, D, N, seed, dtype)[1]
+
+    # Reference: two chained ref_bgmv passes
+    expected_mid = ref_bgmv(x, lora_a, lora_ids)
+    expected = ref_bgmv(expected_mid, lora_b, lora_ids)
+
+    # Kernels under test: shrink followed by expand
+    actual_mid = torch.ops.xla.bgmv_shrink(x, lora_a, lora_ids)
+    actual = torch.ops.xla.bgmv_expand(actual_mid, lora_b.transpose(2, 3),
+                                       lora_ids)
+
+    # Compare with reference output
+    assert torch.allclose(actual, expected, rtol=1e-2, atol=1e-2)
diff --git a/tests/tpu/test_lora.py b/tests/tpu/test_lora.py
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+
+import vllm
+from vllm.lora.request import LoRARequest
+
+
+@pytest.fixture(scope="function", autouse=True)
+def use_v1_only(monkeypatch: pytest.MonkeyPatch):
+    """
+    Set VLLM_USE_V1=1 for every test in this file, since Multi-LoRA is only
+    supported on the v1 TPU backend.
+    """
+    with monkeypatch.context() as env_patch:
+        env_patch.setenv("VLLM_USE_V1", "1")
+        # Hand control to the test while the override is active
+        yield
+
+
+@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
+def test_lora_e2e(num_loras: int):
+    """
+    End-to-end check that LoRA adapters run on the TPU backend.
+
+    The test is meant to cover several capabilities:
+        1. A model can be compiled with LoRA adapters enabled
+        2. <num_loras> LoRA adapters can be run
+        3. Outputs are correct when running with multiple LoRA adapters
+        4. LoRA adapters can be swapped between host and device
+    """
+    adapter_repo = \
+        "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_{}_adapter"
+
+    # Four adapters with ids 1..4
+    lora_requests = []
+    for adapter_id in range(1, 5):
+        lora_requests.append(
+            LoRARequest(f"lora_adapter_{adapter_id}", adapter_id,
+                        adapter_repo.format(adapter_id)))
+
+    llm = vllm.LLM(model="Qwen/Qwen2.5-3B-Instruct",
+                   num_scheduler_steps=1,
+                   max_model_len=256,
+                   max_seq_len_to_capture=256,
+                   max_num_seqs=8,
+                   enable_lora=True,
+                   max_loras=num_loras,
+                   max_lora_rank=8)
+
+    prompt = "What is 1+1? \n"
+
+    # Go through every adapter twice
+    for _ in range(2):
+        for expected_answer, req in enumerate(lora_requests, start=1):
+            sampling = vllm.SamplingParams(max_tokens=256, temperature=0)
+            results = llm.generate(prompt,
+                                   sampling_params=sampling,
+                                   lora_request=req)
+            text = results[0].outputs[0].text
+            # The first character of the reply must be the adapter's number
+            assert int(text.strip()[0]) == expected_answer
 expected_lora_output = [ 
 expected_lora_output = [ 
diff --git a/vllm/config.py b/vllm/config.py
@@ -2603,8 +2603,8 @@ class LoRAConfig:
     max_cpu_loras: Optional[int] = None
     lora_dtype: Optional[Union[torch.dtype, str]] = None
     lora_extra_vocab_size: int = 256
-    # This is a constant.
-    lora_vocab_padding_size: ClassVar[int] = 256
+    lora_vocab_padding_size: ClassVar[int] = current_platform\
+        .get_lora_vocab_padding_size()
     long_lora_scaling_factors: Optional[tuple[float]] = None
     bias_enabled: bool = False
 
@@ -2626,6 +2626,7 @@ def compute_hash(self) -> str:
         factors.append(self.fully_sharded_loras)
         factors.append(self.lora_dtype)
         factors.append(self.lora_extra_vocab_size)
+        factors.append(self.lora_vocab_padding_size)
         factors.append(self.long_lora_scaling_factors)
         factors.append(self.bias_enabled)
         hash_str = hashlib.md5(str(factors).encode(),

diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py
@@ -16,6 +16,7 @@
                               MergedQKVParallelLinearWithLoRA,
                               QKVParallelLinearWithLoRA,
                               RowParallelLinearWithLoRA)
+from vllm.platforms import current_platform
 
 if TYPE_CHECKING:
     pass
@@ -57,15 +58,25 @@ def _mcp_apply(x, bias, layer: ColumnParallelLinearWithLoRA):
         device=x.device,
     )
 
-    layer.punica_wrapper.add_shrink(buffers, x, layer.lora_a_stacked, 1.0)
+    shrunk_buffers: Optional[torch.Tensor] = layer.punica_wrapper.add_shrink(
+        buffers, x, layer.lora_a_stacked, 1.0)
+
+    if not current_platform.can_update_inplace():
+        buffers = shrunk_buffers
+
     buffers = tensor_model_parallel_all_gather(buffers)
-    layer.punica_wrapper.add_expand(output,
-                                    buffers,
-                                    layer.lora_b_stacked,
-                                    layer.lora_bias_stacked,
-                                    layer.output_slices,
-                                    offset_start=0,
-                                    add_input=True)
+
+    lora_output: Optional[torch.Tensor] = layer.punica_wrapper.add_expand(
+        output,
+        buffers,
+        layer.lora_b_stacked,
+        layer.lora_bias_stacked,
+        layer.output_slices,
+        offset_start=0,
+        add_input=True)
+
+    if not current_platform.can_update_inplace():
+        output = lora_output
 
     output = output.view(*out_orig_shape)
     # now have column partitioned and packed output
@@ -292,7 +303,11 @@ def apply(self,
             device=x.device,
         )
 
-        self.punica_wrapper.add_shrink(buffer, x, self.lora_a_stacked, 1.0)
+        shrunk_buffer: Optional[torch.Tensor] = self.punica_wrapper.add_shrink(
+            buffer, x, self.lora_a_stacked, 1.0)
+        if not current_platform.can_update_inplace():
+            buffer = shrunk_buffer
+
         buffer = tensor_model_parallel_all_reduce(buffer)
 
         # following S-LoRA, allows the fusing of all_gather and all_reduce
@@ -304,7 +319,7 @@ def apply(self,
         # NOTE offset are based on the rank.
         shard_size = self.lora_b_stacked[0].shape[2]
         offset_start = self.tp_rank * shard_size
-        self.punica_wrapper.add_expand(
+        lora_output: Optional[torch.Tensor] = self.punica_wrapper.add_expand(
             output,
             buffer,
             self.lora_b_stacked,
@@ -313,6 +328,10 @@ def apply(self,
             offset_start=offset_start,
             add_input=True,
         )
+
+        if not current_platform.can_update_inplace():
+            output = lora_output
+
         output = output.view(*out_orig_shape)
         return output
 

diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
@@ -261,10 +261,17 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
                 full_lora_a_embeddings.shape[1],
                 -1,
             )
-        self.punica_wrapper.add_lora_embedding(full_output,
-                                               full_lora_a_embeddings,
-                                               self.lora_b_stacked,
-                                               add_input=True)
+
+        lora_output: Optional[
+            torch.Tensor] = self.punica_wrapper.add_lora_embedding(
+                full_output,
+                full_lora_a_embeddings,
+                self.lora_b_stacked,
+                add_input=True)
+
+        if not current_platform.can_update_inplace():
+            full_output = lora_output
+
         return full_output.view_as(full_output_org)
 
     @classmethod
@@ -410,10 +417,13 @@ def apply(self,
             output = output.flatten(0, 1)
             x = x.flatten(0, 1)
 
-        self.punica_wrapper.add_lora_linear(output, x, self.lora_a_stacked,
-                                            self.lora_b_stacked,
-                                            self.lora_bias_stacked, 1.0,
-                                            self.output_slices)
+        lora_output: Optional[
+            torch.Tensor] = self.punica_wrapper.add_lora_linear(
+                output, x, self.lora_a_stacked, self.lora_b_stacked,
+                self.lora_bias_stacked, 1.0, self.output_slices)
+        if not current_platform.can_update_inplace():
+            output = lora_output
+
         return output
 
     @property
@@ -1133,15 +1143,23 @@ def _get_logits(
         torch.matmul(self.embeddings_tensors,
                      hidden_states.T,
                      out=lora_logits[:-1])
-        lora_logits[-1] = float("-inf")
+
+        neg_inf, pos_inf = current_platform.get_infinity_values(
+            lora_logits.dtype)
+
+        lora_logits[-1] = neg_inf
         lora_logits = lora_logits.mT
         indices_padded = self.punica_wrapper.sampler_indices_padded
+
+        if current_platform.is_tpu():
+            indices_padded = indices_padded[:logits.size(0)]
+
         lora_logits = (lora_logits.reshape(
             lora_logits.shape[0] * lora_logits.shape[1],
             lora_logits.shape[2],
-        ).index_select(0, indices_padded).nan_to_num_(nan=float("-inf"),
-                                                      posinf=float("inf"),
-                                                      neginf=float("-inf")))
+        ).index_select(0, indices_padded).nan_to_num_(nan=neg_inf,
+                                                      posinf=pos_inf,
+                                                      neginf=neg_inf))
 
         # HPU needs special handling to prune out dummy samples.
         if current_platform.is_hpu():
@@ -1151,10 +1169,13 @@ def _get_logits(
                self.base_layer.org_vocab_size:self.base_layer.org_vocab_size +
                lora_logits.shape[1]] = lora_logits
 
-        # LogitsProcessorWithLoRA always using bgmv
-        self.punica_wrapper.add_lora_logits(logits, hidden_states,
-                                            self.lora_a_stacked,
-                                            self.lora_b_stacked, 1.0)
+        lora_output: Optional[
+            torch.Tensor] = self.punica_wrapper.add_lora_logits(
+                logits, hidden_states, self.lora_a_stacked,
+                self.lora_b_stacked, 1.0)
+
+        if not current_platform.can_update_inplace():
+            logits = lora_output
 
         # Remove paddings in vocab (if any).
         logits = logits[:, :self.base_layer.vocab_size]

diff --git a/vllm/lora/models.py b/vllm/lora/models.py
@@ -199,7 +199,7 @@ def from_local_checkpoint(
         weights_mapper: Optional[WeightsMapper] = None,
     ) -> "LoRAModel":
         """Create a LoRAModel from a local checkpoint.
-        
+
         Args:
             lora_dir: The local path that has lora data.
             expected_lora_modules: Name of modules that are expected to be
@@ -605,7 +605,7 @@ def _match_target_modules(self, module_name: str):
     def _filter_unsupported_mm_module(self, module_name: str) -> bool:
         """
         Regarding multimodal models, vLLM currently only supports adding LoRA to
-        language model. LoRA for other modules, such as the vision tower, will 
+        language model. LoRA for other modules, such as the vision tower, will
         be filtered out.
         """
         if self.supports_mm:

diff --git a/vllm/lora/ops/xla_ops/__init__.py b/vllm/lora/ops/xla_ops/__init__.py
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from vllm.lora.ops.xla_ops.lora_ops import (bgmv_expand, bgmv_expand_slice,
+                                            bgmv_shrink)
+from vllm.lora.ops.xla_ops.pallas import LORA_RANK_BLOCK_SIZE
+
+# Names re-exported by this package
+__all__ = [
+    "bgmv_expand",
+    "bgmv_expand_slice",
+    "bgmv_shrink",
+    "LORA_RANK_BLOCK_SIZE",
+]