Commit 074af9f

add --enable-static-lora and --lora-scaling (#497)
1 parent 3ccb667 commit 074af9f

File tree

12 files changed: +348 −80 lines

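For orientation, a minimal sketch of the server-argument fields the new code reads. ServerArgsSketch is a hypothetical stand-in, not the real ServerArgs class; only the field names enable_lora, enable_static_lora, and lora_scaling come from this diff, and the values are illustrative.

from dataclasses import dataclass

# Hypothetical stand-in for the real ServerArgs; only the three LoRA-related
# fields read by this commit are shown, with illustrative values.
@dataclass
class ServerArgsSketch:
    enable_lora: bool = True          # existing switch checked in tp_worker.py
    enable_static_lora: bool = True   # new: --enable-static-lora
    lora_scaling: float = 2.0         # new: --lora-scaling

server_args = ServerArgsSketch()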

python/sgl_jax/bench_one_batch.py

Lines changed: 5 additions & 1 deletion
@@ -267,7 +267,11 @@ def _run_forward_and_sample(model_runner, batch: ScheduleBatch, token_first_arg:
     )

     model_worker_batch = batch.get_model_worker_batch(
-        [token_first_arg], [bs_needed], [cache_loc_needed], page_size
+        [token_first_arg],
+        [bs_needed],
+        [cache_loc_needed],
+        page_size,
+        False,
     )

     # Prepare attention forward metadata (required by FlashAttention backend)

python/sgl_jax/srt/entrypoints/engine.py

Lines changed: 19 additions & 0 deletions
@@ -19,6 +19,10 @@
 import uvloop
 import zmq
 import zmq.asyncio
+from flax import nnx
+
+from sgl_jax.srt.utils.common_utils import SUPPORTED_LORA_TARGET_MODULES
+from sgl_jax.utils import traverse_and_update

 # ruff: noqa: E402
 # Fix a bug of Python threading

@@ -194,6 +198,21 @@ async def async_generate(
         else:
             return await generator.__anext__()

+    def apply_dummy_lora_ab_buffer(self, target_modules: list | None = None):
+        if target_modules is None or len(target_modules) == 0:
+            logger.warning("No target_modules specified (%s); skipping apply", target_modules)
+            return
+
+        if "all" in target_modules:
+            target_modules = SUPPORTED_LORA_TARGET_MODULES
+
+        logger.info("Applying dummy LoRA buffers to modules: %s", target_modules)
+
+        model_runner = self.scheduler_info["scheduler"].tp_worker.worker.model_runner
+        model_state = nnx.split(model_runner.model)[1]
+        new_state = traverse_and_update(model_state, target_modules)
+        nnx.update(model_runner.model, new_state)
+
     def encode(
         self,
         prompt: str | list[str] | list[dict] | list[list[dict]],
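
A hypothetical usage sketch of the new Engine method. The Engine construction is omitted (its arguments are not shown in this diff), and the specific projection names are illustrative; they would have to be members of SUPPORTED_LORA_TARGET_MODULES.

# Hypothetical usage; `engine` is an already-constructed sgl_jax Engine.
# Passing "all" expands to SUPPORTED_LORA_TARGET_MODULES inside the method.
engine.apply_dummy_lora_ab_buffer(target_modules=["all"])

# Targeting specific projections; these names are illustrative and must
# appear in SUPPORTED_LORA_TARGET_MODULES.
engine.apply_dummy_lora_ab_buffer(target_modules=["q_proj", "v_proj"])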

python/sgl_jax/srt/lora/backend/bgmv_backend.py

Lines changed: 1 addition & 1 deletion
@@ -233,7 +233,7 @@ def prepare_lora_batch(
            scalings_cpu, [0, num_to_pad], mode="constant", constant_values=0.0
        )
        padded_token_lora_indices_cpu = np.pad(
-            token_lora_indices_cpu, [0, num_to_pad], mode="constant", constant_values=-1
+            token_lora_indices_cpu, [0, num_to_pad], mode="constant", constant_values=0
        )
        padded_lora_ranks_cpu = np.pad(
            lora_ranks_cpu, [0, num_to_pad], mode="constant", constant_values=0
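
For context, a minimal runnable sketch of what this padding change does. The array contents and pad length are made up; only the np.pad calls mirror the diff.

import numpy as np

# Four real tokens mapped to adapter buffer slots, padded up to eight.
token_lora_indices_cpu = np.array([0, 1, 1, 0], dtype=np.int32)
num_to_pad = 4

# Old behavior: padded tokens carried the sentinel -1 ("no adapter").
old = np.pad(token_lora_indices_cpu, [0, num_to_pad], mode="constant", constant_values=-1)
# New behavior: padded tokens point at slot 0, which is also the slot the
# static-LoRA path pins every request to.
new = np.pad(token_lora_indices_cpu, [0, num_to_pad], mode="constant", constant_values=0)

print(old)  # [ 0  1  1  0 -1 -1 -1 -1]
print(new)  # [0 1 1 0 0 0 0 0]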

python/sgl_jax/srt/lora/lora_manager.py

Lines changed: 36 additions & 20 deletions
@@ -111,6 +111,7 @@ def __init__(
         self.num_attention_heads = base_hf_config.num_attention_heads
         self.num_kv_heads = getattr(base_hf_config, "num_key_value_heads", self.num_attention_heads)
         self.head_dim = getattr(base_hf_config, "head_dim", None)
+        self.static_lora = server_args.enable_static_lora

         # Get original num_kv_heads and tp_size for replication
         if model_config is not None:

@@ -420,34 +421,49 @@ def prepare_lora_batch(self, model_worker_batch: ModelWorkerBatch):

         assert len(cur_uids) <= self.max_loras_per_batch

-        # Load adapters into device memory pool (CPU -> device transfer)
-        self.memory_pool.prepare_lora_batch(
-            cur_uids=cur_uids,
-            lora_adapters=self.loras,
-        )
-
         weight_indices = [0] * len(model_worker_batch.lora_ids)
         lora_ranks = [0] * self.max_loras_per_batch
         scalings = [0] * self.max_loras_per_batch

-        for i, uid in enumerate(model_worker_batch.lora_ids):
-            weight_indices[i] = self.memory_pool.get_buffer_id(uid)
-            if uid is not None and uid in self.loras:
-                lora = self.loras[uid]
-                lora_ranks[weight_indices[i]] = lora.config.r
-                scalings[weight_indices[i]] = lora.scaling
-
-        self.lora_backend.prepare_lora_batch(
-            model_worker_batch=model_worker_batch,
-            weight_indices=weight_indices,
-            lora_ranks=lora_ranks,
-            scalings=scalings,
-        )
+        def prepare_static_lora_batch():
+            self.lora_backend.prepare_lora_batch(
+                model_worker_batch=model_worker_batch,
+                weight_indices=[0] * len(model_worker_batch.lora_ids),
+                lora_ranks=[self.max_lora_rank] * self.max_loras_per_batch,
+                scalings=[self.server_args.lora_scaling] * self.max_loras_per_batch,
+            )
+
+        def prepare_dynamic_lora_batch():
+            # Load adapters into device memory pool (CPU -> device transfer)
+            self.memory_pool.prepare_lora_batch(
+                cur_uids=cur_uids,
+                lora_adapters=self.loras,
+            )
+
+            for i, uid in enumerate(model_worker_batch.lora_ids):
+                weight_indices[i] = self.memory_pool.get_buffer_id(uid)
+                if uid is not None and uid in self.loras:
+                    lora = self.loras[uid]
+                    lora_ranks[weight_indices[i]] = lora.config.r
+                    scalings[weight_indices[i]] = lora.scaling
+
+            self.lora_backend.prepare_lora_batch(
+                model_worker_batch=model_worker_batch,
+                weight_indices=weight_indices,
+                lora_ranks=lora_ranks,
+                scalings=scalings,
+            )
+
+        if self.static_lora:
+            prepare_static_lora_batch()
+        else:
+            prepare_dynamic_lora_batch()

         # Update LoRA layer buffer references after loading new weights
         # This is necessary because JAX arrays are immutable, and load_lora_weight_to_buffer
         # creates new arrays. We need to update the references in LoRALinear layers.
-        self.update_lora_info()
+        if not self.static_lora:
+            self.update_lora_info()

         logger.debug("Prepared LoRA batch: %d unique adapters", len(cur_uids))
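
To make the two paths concrete, here is an illustrative sketch of the keyword arguments each one ends up handing to lora_backend.prepare_lora_batch for a batch of four requests. max_lora_rank=16, max_loras_per_batch=4, and --lora-scaling 2.0 are assumed values, and the dynamic-side numbers are made up.

# Static path (--enable-static-lora): constants only, so shapes and values
# are identical for every batch and no memory-pool lookup happens.
static_kwargs = dict(
    weight_indices=[0, 0, 0, 0],      # every request reads buffer slot 0
    lora_ranks=[16, 16, 16, 16],      # self.max_lora_rank for each slot
    scalings=[2.0, 2.0, 2.0, 2.0],    # server_args.lora_scaling for each slot
)

# Dynamic path: values depend on which adapters are present in the batch.
dynamic_kwargs = dict(
    weight_indices=[0, 1, 1, 0],      # memory_pool.get_buffer_id(uid) per request
    lora_ranks=[8, 16, 0, 0],         # lora.config.r per occupied slot, 0 otherwise
    scalings=[1.0, 0.5, 0, 0],        # lora.scaling per occupied slot, 0 otherwise
)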

python/sgl_jax/srt/managers/schedule_batch.py

Lines changed: 6 additions & 1 deletion
@@ -1100,6 +1100,7 @@ def get_model_worker_batch(
         bs_paddings: list,
         cache_loc_paddings: list,
         page_size: int,
+        enable_static_lora: bool = False,
         skip_padding: bool = False,
     ) -> ModelWorkerBatch:
         if skip_padding:

@@ -1371,7 +1372,11 @@
             extend_seq_lens=(extend_seq_lens if self.forward_mode == ForwardMode.EXTEND else None),
             extend_logprob_start_lens=extend_logprob_start_lens,
             extend_input_logprob_token_ids=self.extend_input_logprob_token_ids,
-            lora_ids=lora_ids,
+            lora_ids=(
+                [req.lora_id for req in self.reqs] + [None] * bs_padding_size
+                if not enable_static_lora
+                else ["0"] * bs_paddings[select_bs_index]
+            ),
             real_bs=real_bs,
             capture_hidden_mode=CaptureHiddenMode.NULL,
             launch_done=self.launch_done,
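
An illustrative view of the lora_ids this produces for three real requests padded to a batch size of four; the adapter ids are made up.

# Dynamic LoRA: real adapter ids from the requests, None for padding rows.
# Here the third request carries no adapter and the fourth row is padding.
lora_ids_dynamic = ["adapter_a", "adapter_b", None, None]

# Static LoRA (--enable-static-lora): every row, real or padded, points at
# the single static slot "0", so the list never changes across batches.
lora_ids_static = ["0", "0", "0", "0"]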

python/sgl_jax/srt/managers/scheduler.py

Lines changed: 2 additions & 0 deletions
@@ -1189,6 +1189,7 @@ def run_batch(self, batch: ScheduleBatch) -> GenerationBatchResult:
             precompile_bs_paddings,
             precompile_cache_loc_paddings,
             self.page_size,
+            self.server_args.enable_static_lora,
         )

         if self.enable_overlap:

@@ -1229,6 +1230,7 @@ def run_batch(self, batch: ScheduleBatch) -> GenerationBatchResult:
             precompile_bs_paddings,
             precompile_cache_loc_paddings,
             self.page_size,
+            self.server_args.enable_static_lora,
             # eagle's model_worker_batch will be modified and re-padded within eagle_worker
             skip_padding=True,
         )

python/sgl_jax/srt/managers/tp_worker.py

Lines changed: 5 additions & 2 deletions
@@ -236,6 +236,7 @@ def precompile_extend(self, future_token_ids_map=None):
             num_tokens,
             ForwardMode.EXTEND,
             self.precompile_cache_loc_paddings[-1],
+            enable_static_lora=self.server_args.enable_static_lora,
         )
         # Prepare LoRA batch if LoRA is enabled
         if self.server_args.enable_lora:

@@ -278,6 +279,7 @@ def precompile_decode(self, future_token_ids_map=None):
             bs,
             ForwardMode.DECODE,
             aligned_cache_loc_size,
+            enable_static_lora=self.server_args.enable_static_lora,
         )
         # Prepare LoRA batch if LoRA is enabled
         if self.server_args.enable_lora:

@@ -341,6 +343,7 @@ def generate_model_worker_batch(
         max_cache_loc_size: int,
         do_penalties: bool = False,
         speculative_algotithm=None,
+        enable_static_lora: bool = False,
     ) -> ModelWorkerBatch:
         valid_input_ids = np.array([1] * bs, dtype=jnp.int32)
         invalid_input_ids = np.array([0] * (num_tokens - bs), dtype=jnp.int32)

@@ -354,7 +357,7 @@

         valid_cache_loc = np.arange(bs)
         invalid_cache_loc = np.array([0] * (invalid_cache_loc_size), dtype=jnp.int32)
-        lora_ids = [0] if bs == 1 else [0] * (bs // 2) + [None] * (bs - bs // 2)
+        lora_ids = ["0"] if bs == 1 else ["0"] * (bs // 2) + [None] * (bs - bs // 2)

         return ModelWorkerBatch(
             bid=1,

@@ -384,7 +387,7 @@
             extend_logprob_start_lens=None,
             capture_hidden_mode=CaptureHiddenMode.NULL,
             spec_algorithm=speculative_algotithm,
-            lora_ids=lora_ids,
+            lora_ids=lora_ids if not enable_static_lora else ["0"] * bs,
         )

     def get_model_runner(self):
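
For the precompile warm-up batches, the ids are now strings so they line up with the string adapter ids used elsewhere. A sketch for bs = 4, using the half-and-half split hard-coded above:

# Default precompile pattern: the first bs // 2 rows exercise adapter
# slot "0", the rest carry no adapter.
lora_ids_default = ["0", "0", None, None]

# With enable_static_lora, every row uses slot "0", matching the lora_ids
# the runtime static path produces.
lora_ids_static = ["0", "0", "0", "0"]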
