
Commit 1073ba6

[LoRA] Optimize 3D MoE logic (#29222)
Signed-off-by: Jee Jee Li <[email protected]>
Parent: c309bb5

File tree

11 files changed: +395 -103 lines


tests/lora/test_gptoss_tp.py

Lines changed: 6 additions & 1 deletion
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import pytest
+
 import vllm
 from vllm.lora.request import LoRARequest
 
@@ -84,14 +86,17 @@ def test_gpt_oss_lora(gptoss20b_lora_files):
 
 
 @multi_gpu_test(num_gpus=2)
-def test_gpt_oss_lora_tp2(gptoss20b_lora_files):
+@pytest.mark.parametrize("fully_sharded_loras", [False, True])
+def test_gpt_oss_lora_tp2(gptoss20b_lora_files, fully_sharded_loras):
     llm = vllm.LLM(
         MODEL_PATH,
         max_model_len=1024,
         enable_lora=True,
         max_loras=2,
         max_lora_rank=8,
+        max_num_seqs=16,
         tensor_parallel_size=2,
+        fully_sharded_loras=fully_sharded_loras,
         compilation_config=vllm.config.CompilationConfig(  # Avoid OOM
             cudagraph_specialize_lora=False,
         ),
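
For reference, the new fully_sharded_loras knob that the test now parametrizes over is an ordinary vllm.LLM constructor argument, so the test mirrors what a user would configure directly. A minimal standalone sketch, assuming the gpt-oss-20B checkpoint and a local LoRA adapter path (both placeholders here, not values taken from the test body):

# Sketch only: serving a LoRA adapter with fully sharded LoRA enabled.
# MODEL_PATH and LORA_PATH are placeholders, not values from the test.
import vllm
from vllm.lora.request import LoRARequest

MODEL_PATH = "openai/gpt-oss-20b"      # assumed checkpoint
LORA_PATH = "/path/to/gptoss20b_lora"  # placeholder adapter directory

llm = vllm.LLM(
    MODEL_PATH,
    max_model_len=1024,
    enable_lora=True,
    max_loras=2,
    max_lora_rank=8,
    max_num_seqs=16,
    tensor_parallel_size=2,
    fully_sharded_loras=True,  # the flag the test now exercises in both modes
)
outputs = llm.generate(
    ["Give me a one-sentence summary of LoRA."],
    vllm.SamplingParams(max_tokens=32),
    lora_request=LoRARequest("gptoss-lora", 1, LORA_PATH),
)
print(outputs[0].outputs[0].text)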

vllm/lora/layers/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -11,7 +11,7 @@
     QKVParallelLinearWithLoRA,
     QKVParallelLinearWithShardedLoRA,
 )
-from vllm.lora.layers.fused_moe import FusedMoEWithLoRA
+from vllm.lora.layers.fused_moe import FusedMoE3DWithLoRA, FusedMoEWithLoRA
 from vllm.lora.layers.logits_processor import LogitsProcessorWithLoRA
 from vllm.lora.layers.replicated_linear import ReplicatedLinearWithLoRA
 from vllm.lora.layers.row_parallel_linear import (
@@ -38,4 +38,5 @@
     "ReplicatedLinearWithLoRA",
     "LoRAMapping",
     "FusedMoEWithLoRA",
+    "FusedMoE3DWithLoRA",
 ]
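
With the added export, the 3D variant can be imported from the package root exactly like the existing layer; a one-line usage sketch:

# Both symbols are re-exported from vllm.lora.layers after this change.
from vllm.lora.layers import FusedMoE3DWithLoRA, FusedMoEWithLoRA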

vllm/lora/layers/base.py

Lines changed: 2 additions & 2 deletions
@@ -42,8 +42,8 @@ def reset_lora(self, index: int):
     def set_lora(
         self,
         index: int,
-        lora_a: torch.Tensor,
-        lora_b: torch.Tensor,
+        lora_a: torch.Tensor | list[torch.Tensor],
+        lora_b: torch.Tensor | list[torch.Tensor],
     ):
         """Overwrites lora tensors at index."""
         ...
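
The widened annotation is what lets MoE-style layers hand set_lora a group of tensors while ordinary linear layers keep passing a single one. A rough illustration of the two call shapes the interface now admits (the shapes, expert count, and layer objects are invented for illustration, not taken from vLLM):

import torch

# Illustrative sizes only.
rank, hidden, inter, num_experts = 8, 2048, 5632, 4

# A plain linear LoRA layer still supplies single 2-D tensors:
lora_a = torch.randn(rank, hidden)
lora_b = torch.randn(inter, rank)
# linear_layer.set_lora(index=0, lora_a=lora_a, lora_b=lora_b)

# A fused-MoE LoRA layer may instead supply a list, e.g. one stacked
# (num_experts, ...) tensor per projection, which is why the base
# signature now also accepts list[torch.Tensor]:
lora_a_list = [torch.randn(num_experts, rank, hidden) for _ in range(2)]
lora_b_list = [torch.randn(num_experts, inter, rank) for _ in range(2)]
# moe_layer.set_lora(index=0, lora_a=lora_a_list, lora_b=lora_b_list)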

vllm/lora/layers/base_linear.py

Lines changed: 4 additions & 2 deletions
@@ -94,13 +94,15 @@ def reset_lora(self, index: int):
     def set_lora(
         self,
         index: int,
-        lora_a: torch.Tensor,
-        lora_b: torch.Tensor,
+        lora_a: torch.Tensor | list[torch.Tensor],
+        lora_b: torch.Tensor | list[torch.Tensor],
     ):
         # Except for QKVParallelLinearWithLoRA and
         # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers
         # store weights in a tuple of size 1. These two layers will
         # override this function.
+        assert isinstance(lora_a, torch.Tensor)
+        assert isinstance(lora_b, torch.Tensor)
         assert (
             len(self.lora_a_stacked) == len(self.lora_b_stacked) == self.n_slices == 1
         )
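
The new assertions do two jobs at once: they guard at runtime that the shared linear path only ever receives single tensors, and they narrow the torch.Tensor | list[torch.Tensor] union so the tensor-only code below stays well-typed. A generic sketch of that narrowing pattern, independent of the vLLM classes:

import torch

def first_dim(weight: torch.Tensor | list[torch.Tensor]) -> int:
    # After the assert, both the runtime and static type checkers know
    # `weight` is a plain tensor, so .shape is safe to use.
    assert isinstance(weight, torch.Tensor)
    return weight.shape[0]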

vllm/lora/layers/column_parallel_linear.py

Lines changed: 2 additions & 2 deletions
@@ -246,8 +246,8 @@ def slice_lora_b(
     def set_lora(
         self,
         index: int,
-        lora_a: torch.Tensor,
-        lora_b: torch.Tensor,
+        lora_a: torch.Tensor | list[torch.Tensor],
+        lora_b: torch.Tensor | list[torch.Tensor],
     ):
         self.reset_lora(index)
 
