2 changes: 1 addition & 1 deletion nemo_gym/openai_utils.py
@@ -463,7 +463,7 @@ async def _request(self, **request_kwargs: Dict) -> ClientResponse:
return response

# We've exited the loop
response.raise_for_status()
await raise_for_status(response)

async def _raise_for_status(self, response: ClientResponse, request_kwargs: Dict[str, Any]) -> None:
if not response.ok:
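The change above swaps aiohttp's synchronous `response.raise_for_status()` for an awaited module-level `raise_for_status(response)`. Helpers like this usually exist so the response body can be read (an async operation) and folded into the raised error; the actual nemo_gym helper is not shown in this diff, so the sketch below is an assumption for illustration only:

```python
from aiohttp import ClientResponse, ClientResponseError


async def raise_for_status(response: ClientResponse) -> None:
    # Illustrative only: read the body before raising so the error carries the
    # server's payload, which the plain raise_for_status() call discards.
    if response.ok:
        return
    body = await response.text()
    raise ClientResponseError(
        response.request_info,
        response.history,
        status=response.status,
        message=f"{response.reason}: {body}",
        headers=response.headers,
    )
```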
1 change: 1 addition & 0 deletions resources_servers/math_with_judge/configs/dapo17k.yaml
@@ -39,3 +39,4 @@ math_with_judge_simple_agent:
version: 0.0.1
artifact_fpath: aime24.jsonl
license: Apache 2.0
num_repeats: 32
@@ -2,14 +2,14 @@
"name": "validation",
"type": "validation",
"jsonl_fpath": "resources_servers/math_with_judge/data/aime24_validation.jsonl",
"num_repeats": 1,
"num_repeats": 32,
"gitlab_identifier": {
"dataset_name": "aime24",
"version": "0.0.1",
"artifact_fpath": "aime24.jsonl"
},
"license": "Apache 2.0",
"Number of examples": 30,
"Number of examples": 960,
"Number of tools": {
"Total # non-null values": 0,
"Average": 0.0,
@@ -19,15 +19,15 @@
"Standard deviation": 0.0
},
"Json-dumped number of words (proxy for token count)": {
"Total # non-null values": 30,
"Total # non-null values": 960,
"Average": 80.47,
"Min": 42.0,
"Max": 149.0,
"Median": 81.5,
"Standard deviation": 25.11
"Standard deviation": 24.7
},
"Number of turns": {
"Total # non-null values": 30,
"Total # non-null values": 960,
"Average": 1.0,
"Min": 1.0,
"Max": 1.0,
@@ -44,10 +44,10 @@
},
"question": {
"unique_count": 30,
"total_count": 30
"total_count": 960
},
"expected_answer": {
"unique_count": 29,
"total_count": 30
"total_count": 960
}
}
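The updated counts are internally consistent: 30 unique AIME-24 problems repeated 32 times (`num_repeats: 32`) gives 30 × 32 = 960 rows, matching the new totals above.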
@@ -23,7 +23,7 @@
"Average": 85.63,
"Min": 45.0,
"Max": 322.0,
"Median": 80.41,
"Median": 80.33,
"Standard deviation": 26.94
},
"Number of turns": {
3 changes: 3 additions & 0 deletions responses_api_agents/simple_agent/app.py
@@ -101,6 +101,9 @@ async def responses(
output = model_response.output
new_outputs.extend(output)

if model_response.incomplete_details and model_response.incomplete_details.reason == "max_output_tokens":
break

all_fn_calls: List[NeMoGymResponseFunctionToolCall] = [o for o in output if o.type == "function_call"]
all_output_messages: List[NeMoGymResponseOutputMessage] = [
o for o in output if o.type == "message" and o.role == "assistant"
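With this guard in place, the agent stops issuing follow-up turns once the model reports that it ran out of output tokens. A minimal sketch of the resulting loop shape, assuming a Responses-API-style interface; the function and helper names below are illustrative, not the actual simple_agent code:

```python
from typing import Any, Awaitable, Callable, List


async def run_agent_loop(
    call_model: Callable[[Any], Awaitable[Any]],
    execute_tools: Callable[[List[Any]], Awaitable[Any]],
    request_body: Any,
    max_turns: int = 10,
) -> List[Any]:
    """Illustrative agent loop; not the actual simple_agent implementation."""
    outputs: List[Any] = []
    for _ in range(max_turns):
        model_response = await call_model(request_body)
        outputs.extend(model_response.output)

        # The new guard: if the model hit its output-token budget, further
        # tool-calling turns would only compound the truncation, so stop here.
        if (
            model_response.incomplete_details
            and model_response.incomplete_details.reason == "max_output_tokens"
        ):
            break

        function_calls = [o for o in model_response.output if o.type == "function_call"]
        if not function_calls:
            break  # no tool calls left to run; the episode is complete
        request_body = await execute_tools(function_calls)
    return outputs
```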
50 changes: 8 additions & 42 deletions responses_api_models/vllm_model/app.py
@@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from copy import deepcopy
from time import time
from typing import ClassVar, Dict, List, Optional, Tuple, Union
from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union
from uuid import uuid4

from aiohttp.client_exceptions import ClientResponseError
from fastapi import Request
from pydantic import BaseModel, Field

@@ -34,7 +34,6 @@
NeMoGymChatCompletionAssistantMessageParam,
NeMoGymChatCompletionCreateParamsNonStreaming,
NeMoGymChatCompletionDeveloperMessageParam,
NeMoGymChatCompletionMessage,
NeMoGymChatCompletionMessageParam,
NeMoGymChatCompletionMessageToolCallFunctionParam,
NeMoGymChatCompletionMessageToolCallParam,
@@ -66,6 +65,8 @@ class VLLMModelConfig(BaseResponsesAPIModelConfig):
uses_reasoning_parser: bool
replace_developer_role_with_system: bool = False

chat_template_kwargs: Optional[Dict[str, Any]] = None

def model_post_init(self, context):
if isinstance(self.base_url, str):
self.base_url = [self.base_url]
@@ -132,6 +133,7 @@ async def responses(
metadata=body.metadata,
instructions=body.instructions,
user=body.user,
incomplete_details={"reason": "max_output_tokens"} if choice.finish_reason == "length" else None,
)

async def chat_completions(
@@ -144,6 +146,8 @@ async def chat_completions(

body_dict = body.model_dump(exclude_unset=True)
body_dict["model"] = self.config.model
if self.config.chat_template_kwargs:
body_dict["chat_template_kwargs"] = deepcopy(self.config.chat_template_kwargs)

session_id = request.session[SESSION_ID_KEY]
if session_id not in self._session_id_to_client:
@@ -198,45 +202,7 @@ async def chat_completions(
else:
raise NotImplementedError

try:
chat_completion_dict = await client.create_chat_completion(**create_params)
except ClientResponseError as e:
"""
Example messages for out of context length:

1. https://github.com/vllm-project/vllm/blob/685c99ee77b4818dcdd15b30fe0e0eff0d5d22ec/vllm/entrypoints/openai/serving_engine.py#L914
```json
{"object":"error","message":"This model\'s maximum context length is 32768 tokens. However, you requested 32818 tokens in the messages, Please reduce the length of the messages. None","type":"BadRequestError","param":null,"code":400}
```
2. https://github.com/vllm-project/vllm/blob/685c99ee77b4818dcdd15b30fe0e0eff0d5d22ec/vllm/entrypoints/openai/serving_engine.py#L940
3. https://github.com/vllm-project/vllm/blob/685c99ee77b4818dcdd15b30fe0e0eff0d5d22ec/vllm/entrypoints/openai/serving_engine.py#L948
4. https://github.com/vllm-project/vllm/blob/685c99ee77b4818dcdd15b30fe0e0eff0d5d22ec/vllm/sampling_params.py#L463
"""
result_content_str = e.response_content.decode()

is_out_of_context_length = e.status == 400 and (
"context length" in result_content_str or "max_tokens" in result_content_str
)
if is_out_of_context_length:
return NeMoGymChatCompletion(
id="chtcmpl-123",
object="chat.completion",
created=int(time()),
model=self.config.model,
choices=[
NeMoGymChoice(
index=0,
finish_reason="stop",
message=NeMoGymChatCompletionMessage(
role="assistant",
content=None,
tool_calls=None,
),
)
],
)
else:
raise e
chat_completion_dict = await client.create_chat_completion(**create_params)

choice_dict = chat_completion_dict["choices"][0]
if self.config.uses_reasoning_parser:
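Two behavioural changes in `vllm_model/app.py` are worth spelling out: the optional `chat_template_kwargs` from the config is forwarded on every chat-completions request, and a `finish_reason` of `"length"` is surfaced to callers as `incomplete_details.reason == "max_output_tokens"` rather than being masked by the removed out-of-context fallback. A condensed sketch of both; the config keys and values shown are hypothetical examples (e.g. a Qwen-style `enable_thinking` flag), not part of this diff:

```python
from copy import deepcopy
from typing import Any, Dict, Optional

# Hypothetical config fragment; any keys understood by the model's chat
# template could appear here (enable_thinking is just an example).
config_chat_template_kwargs: Optional[Dict[str, Any]] = {"enable_thinking": False}


def build_request_body(body_dict: Dict[str, Any]) -> Dict[str, Any]:
    # Forward the configured kwargs so vLLM applies them when rendering
    # the chat template for this request.
    if config_chat_template_kwargs:
        body_dict["chat_template_kwargs"] = deepcopy(config_chat_template_kwargs)
    return body_dict


def incomplete_details_from(finish_reason: str) -> Optional[Dict[str, str]]:
    # Map vLLM's finish_reason onto the Responses API field the agent checks.
    return {"reason": "max_output_tokens"} if finish_reason == "length" else None
```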