
Commit 9839809

set lora
1 parent 3ccb667

File tree: 9 files changed (+319, -359 lines)


gemini.md

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+# Gemini CLI Constraints & Notes
+
+- **Testing:** I cannot execute tests locally. Any required testing will be deferred to the end of the task for the user to execute manually. I will provide the necessary commands to run the tests.

python/sgl_jax/srt/lora/backend/bgmv_backend.py

Lines changed: 4 additions & 1 deletion
@@ -249,7 +249,10 @@ def prepare_lora_batch(
             lora_ranks=jnp.array(padded_lora_ranks_cpu, dtype=jnp.int32),
         )

-        self.batch_info = BatchInfo(batch_info)
+        return batch_info
+
+    def set_batch_info(self, batch_info: LoRABatchInfo):
+        self.batch_info.value = batch_info


     def shrink(
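The change above splits the backend API: prepare_lora_batch now returns the computed batch info instead of writing it into backend state, and a dedicated set_batch_info installs it later. A minimal, self-contained sketch of that split; FakeBatchInfo and FakeBackend are hypothetical stand-ins, not the sgl_jax classes:

```python
from dataclasses import dataclass


@dataclass
class FakeBatchInfo:
    weight_indices: list[int]
    lora_ranks: list[int]


class FakeBackend:
    def __init__(self):
        self._batch_info = None

    def prepare_lora_batch(self, weight_indices, lora_ranks) -> FakeBatchInfo:
        # Pure computation: build the per-batch metadata and return it
        # instead of mutating backend state here.
        return FakeBatchInfo(weight_indices, lora_ranks)

    def set_batch_info(self, batch_info: FakeBatchInfo) -> None:
        # Mutation is isolated in one explicit setter, mirroring
        # `self.batch_info.value = batch_info` in the diff.
        self._batch_info = batch_info


backend = FakeBackend()
info = backend.prepare_lora_batch([0, 1], [8, 16])  # compute once
backend.set_batch_info(info)                        # install when the batch runs
```

Returning the info lets a caller attach it to the batch object and re-install it later without recomputation, which is what the manager and worker changes below rely on.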

python/sgl_jax/srt/lora/lora_manager.py

Lines changed: 10 additions & 3 deletions
@@ -421,7 +421,7 @@ def prepare_lora_batch(self, model_worker_batch: ModelWorkerBatch):
         assert len(cur_uids) <= self.max_loras_per_batch

         # Load adapters into device memory pool (CPU -> device transfer)
-        self.memory_pool.prepare_lora_batch(
+        has_new_weights = self.memory_pool.prepare_lora_batch(
             cur_uids=cur_uids,
             lora_adapters=self.loras,
         )
@@ -437,20 +437,27 @@ def prepare_lora_batch(self, model_worker_batch: ModelWorkerBatch):
             lora_ranks[weight_indices[i]] = lora.config.r
             scalings[weight_indices[i]] = lora.scaling

-        self.lora_backend.prepare_lora_batch(
+        batch_info = self.lora_backend.prepare_lora_batch(
             model_worker_batch=model_worker_batch,
             weight_indices=weight_indices,
             lora_ranks=lora_ranks,
             scalings=scalings,
         )
+        model_worker_batch.lora_batch_info = batch_info

         # Update LoRA layer buffer references after loading new weights
         # This is necessary because JAX arrays are immutable, and load_lora_weight_to_buffer
         # creates new arrays. We need to update the references in LoRALinear layers.
-        self.update_lora_info()
+        if has_new_weights:
+            self.update_lora_info()

         logger.debug("Prepared LoRA batch: %d unique adapters", len(cur_uids))

+    def set_batch_info(self, batch_info):
+        """Set batch info in backend."""
+        if hasattr(self, "lora_backend"):
+            self.lora_backend.set_batch_info(batch_info)
+
     def get_buffer_id(self, lora_id: str | None) -> int:
         """Get buffer slot ID for a given LoRA adapter ID."""
         return self.memory_pool.get_buffer_id(lora_id)
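The `if has_new_weights:` guard follows the comment kept in the diff: JAX arrays are immutable, so loading an adapter creates new arrays, and any layer still holding a reference to the old buffer reads stale data; when nothing new was loaded, the refresh is unnecessary. A small standalone JAX sketch of that staleness (the `buffer` dict and `cached_ref` are illustrative, not sgl_jax objects):

```python
import jax.numpy as jnp

buffer = {"slot0": jnp.zeros((2, 2))}
cached_ref = buffer["slot0"]  # e.g. a layer caching the buffer array

# Functional update returns a brand-new array: the dict entry changes,
# but the cached reference still points at the old zeros.
buffer["slot0"] = buffer["slot0"].at[0, 0].set(1.0)

print(buffer["slot0"][0, 0])  # 1.0 -> new weights are visible via the pool
print(cached_ref[0, 0])       # 0.0 -> stale reference, must be re-bound
```

Re-binding the references (update_lora_info) is only needed after a load, which is exactly what the returned has_new_weights flag signals.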

python/sgl_jax/srt/lora/lora_memory_pool.py

Lines changed: 9 additions & 1 deletion
@@ -364,7 +364,7 @@ def prepare_lora_batch(
         self,
         cur_uids: set[str | None],
         lora_adapters: dict[str | None, LoRAAdapter],
-    ):
+    ) -> bool:
         """
         Prepare LoRA batch by loading adapters into buffer slots.

@@ -374,6 +374,9 @@ def prepare_lora_batch(
             cur_uids: Set of lora_ids needed for current batch
             lora_adapters: Dict mapping lora_id to LoRAAdapter

+        Returns:
+            bool: True if new weights were loaded (requires updating references), False otherwise.
+
         Raises:
             ValueError: If no buffer slots available
         """
@@ -389,6 +392,8 @@ def get_available_buffer_slot() -> int:
                 self.max_loras_per_batch,
             )

+        has_new_weights = False
+
         # Load each adapter that's not already loaded
         for uid in cur_uids:
             if uid not in self.uid_to_buffer_id:
@@ -397,10 +402,13 @@ def get_available_buffer_slot() -> int:
                 self.load_lora_weight_to_buffer(uid, buffer_id, lora_adapter)
                 self.uid_to_buffer_id[uid] = buffer_id
                 self.buffer_id_to_uid[buffer_id] = uid
+                has_new_weights = True
                 logger.info("Loaded LoRA %s into buffer slot %d", uid, buffer_id)
             else:
                 logger.debug("LoRA %s already in buffer slot %d", uid, self.uid_to_buffer_id[uid])

+        return has_new_weights
+
     def load_lora_weight_to_buffer(
         self,
         uid: str | None,
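The pool change is bookkeeping: load only adapters that are not already resident and tell the caller whether anything was loaded. A toy sketch of that contract; ToyPool is a hypothetical stand-in and the actual weight transfer is reduced to a dict update:

```python
class ToyPool:
    def __init__(self, max_slots: int):
        self.max_slots = max_slots
        self.uid_to_slot: dict[str, int] = {}

    def prepare_lora_batch(self, cur_uids: set[str]) -> bool:
        has_new_weights = False
        for uid in cur_uids:
            if uid not in self.uid_to_slot:
                if len(self.uid_to_slot) >= self.max_slots:
                    raise ValueError("no buffer slots available")
                # Stand-in for load_lora_weight_to_buffer(...)
                self.uid_to_slot[uid] = len(self.uid_to_slot)
                has_new_weights = True
        return has_new_weights


pool = ToyPool(max_slots=4)
print(pool.prepare_lora_batch({"adapter_a"}))  # True  -> first load
print(pool.prepare_lora_batch({"adapter_a"}))  # False -> cache hit, nothing loaded
```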

python/sgl_jax/srt/managers/schedule_batch.py

Lines changed: 2 additions & 0 deletions
@@ -1694,6 +1694,8 @@ class ModelWorkerBatch:
     # If set, the output of the batch contains the hidden states of the run.
     capture_hidden_mode: CaptureHiddenMode = None

+    lora_batch_info: Any | None = None
+
     tree_cache: BasePrefixCache = None

     def padding_model_worker_batch(

python/sgl_jax/srt/managers/tp_worker.py

Lines changed: 8 additions & 2 deletions
@@ -461,8 +461,14 @@ def forward_batch_generation(
         forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)

         # Prepare LoRA batch if LoRA is enabled
-        if self.worker.server_args.enable_lora and self.need_prepare_lora_batch:
-            self.get_model_runner().lora_manager.prepare_lora_batch(model_worker_batch)
+        if self.worker.server_args.enable_lora:
+            if model_worker_batch.lora_batch_info is None:
+                self.get_model_runner().lora_manager.prepare_lora_batch(model_worker_batch)
+
+            if model_worker_batch.lora_batch_info is not None:
+                self.get_model_runner().lora_manager.set_batch_info(
+                    model_worker_batch.lora_batch_info
+                )

         if forward_metadata is None:
             forward_metadata = self.worker.model_runner.attn_backend.get_forward_metadata(
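The worker now prepares LoRA batch info only when the batch does not already carry it, then installs whatever is attached. A self-contained sketch of that control flow; Batch and Manager are illustrative stand-ins, not the sgl_jax worker or manager:

```python
from dataclasses import dataclass, field


@dataclass
class Batch:
    uids: set = field(default_factory=set)
    lora_batch_info: object = None


class Manager:
    def __init__(self):
        self.active_info = None

    def prepare_lora_batch(self, batch: Batch) -> None:
        # Compute once and attach to the batch, as in the lora_manager diff.
        batch.lora_batch_info = {"uids": sorted(batch.uids)}

    def set_batch_info(self, info) -> None:
        self.active_info = info


def forward(batch: Batch, manager: Manager, enable_lora: bool) -> None:
    if enable_lora:
        if batch.lora_batch_info is None:
            manager.prepare_lora_batch(batch)              # first time: compute + attach
        if batch.lora_batch_info is not None:
            manager.set_batch_info(batch.lora_batch_info)  # always install for this run


manager = Manager()
batch = Batch(uids={"adapter_a"})
forward(batch, manager, enable_lora=True)  # prepares and installs
forward(batch, manager, enable_lora=True)  # reuses the attached info, no re-preparation
```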

python/sgl_jax/srt/model_executor/model_runner.py

Lines changed: 8 additions & 4 deletions
@@ -162,7 +162,7 @@ def initialize(self):

     def initialize_jit(self):
         model_def, model_state = nnx.split(self.model)
-        _, model_state_def = jax.tree_util.tree_flatten(model_state)
+        model_state_leaves, model_state_def = jax.tree_util.tree_flatten(model_state)
         sampler_def, sampler_state = nnx.split(self.sampler)
         sampler_state_leaves, sampler_state_def = jax.tree_util.tree_flatten(sampler_state)

@@ -199,13 +199,17 @@ def run_model_wrapper(forward_batch, logits_metadata):
             token_to_kv_pool = self.token_to_kv_pool

             # Re-capture model state to get the latest LoRA weights
-            _, model_state = nnx.split(self.model)
-            model_state_leaves, _ = jax.tree_util.tree_flatten(model_state)
+            if self.server_args.enable_lora:
+                # Re-capture model state to get the latest LoRA weights
+                _, model_state = nnx.split(self.model)
+                current_model_state_leaves, _ = jax.tree_util.tree_flatten(model_state)
+            else:
+                current_model_state_leaves = model_state_leaves

             return jitted_run_model(
                 model_def,
                 model_state_def,
-                model_state_leaves,
+                current_model_state_leaves,
                 forward_batch,
                 token_to_kv_pool,
                 logits_metadata,
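The model_runner change captures the flattened model-state leaves once in initialize_jit and re-flattens inside the wrapper only when LoRA may have replaced arrays in the model state. A small JAX sketch of why the cached leaves stay valid otherwise; the plain dict here stands in for the nnx model state:

```python
import jax
import jax.numpy as jnp

state = {"w": jnp.ones((2,)), "lora_A": jnp.zeros((2, 2))}

# Captured once at initialization, like model_state_leaves in initialize_jit.
cached_leaves, state_def = jax.tree_util.tree_flatten(state)


def current_leaves(state, enable_lora: bool):
    if enable_lora:
        # LoRA may have swapped arrays into `state`; re-flatten to pick them up.
        leaves, _ = jax.tree_util.tree_flatten(state)
        return leaves
    # No LoRA: the cached leaves are still the live arrays, so skip the re-flatten.
    return cached_leaves


rebuilt = jax.tree_util.tree_unflatten(state_def, current_leaves(state, enable_lora=False))
print(jax.tree_util.tree_structure(rebuilt) == state_def)  # True
```

Skipping the nnx.split / tree_flatten pair on every call avoids a host-side traversal of the model state when LoRA is disabled.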
