2 changes: 1 addition & 1 deletion nemo_gym/openai_utils.py
@@ -463,7 +463,7 @@ async def _request(self, **request_kwargs: Dict) -> ClientResponse:
return response

# We've exited the loop
response.raise_for_status()
await raise_for_status(response)

async def _raise_for_status(self, response: ClientResponse, request_kwargs: Dict[str, Any]) -> None:
if not response.ok:
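The change above swaps aiohttp's synchronous `response.raise_for_status()` for an awaited module-level `raise_for_status(response)`. Helpers like this usually exist so the response body can be read (an async operation) and folded into the raised error; the actual nemo_gym helper is not shown in this diff, so the sketch below is an assumption for illustration only:

```python
from aiohttp import ClientResponse, ClientResponseError


async def raise_for_status(response: ClientResponse) -> None:
    # Illustrative only: read the body before raising so the error carries the
    # server's payload, which the plain raise_for_status() call discards.
    if response.ok:
        return
    body = await response.text()
    raise ClientResponseError(
        response.request_info,
        response.history,
        status=response.status,
        message=f"{response.reason}: {body}",
        headers=response.headers,
    )
```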
1 change: 1 addition & 0 deletions resources_servers/math_with_judge/configs/dapo17k.yaml
@@ -39,3 +39,4 @@ math_with_judge_simple_agent:
version: 0.0.1
artifact_fpath: aime24.jsonl
license: Apache 2.0
num_repeats: 32
@@ -2,14 +2,14 @@
"name": "validation",
"type": "validation",
"jsonl_fpath": "resources_servers/math_with_judge/data/aime24_validation.jsonl",
"num_repeats": 1,
"num_repeats": 32,
"gitlab_identifier": {
"dataset_name": "aime24",
"version": "0.0.1",
"artifact_fpath": "aime24.jsonl"
},
"license": "Apache 2.0",
"Number of examples": 30,
"Number of examples": 960,
"Number of tools": {
"Total # non-null values": 0,
"Average": 0.0,
@@ -19,15 +19,15 @@
"Standard deviation": 0.0
},
"Json-dumped number of words (proxy for token count)": {
"Total # non-null values": 30,
"Total # non-null values": 960,
"Average": 80.47,
"Min": 42.0,
"Max": 149.0,
"Median": 81.5,
"Standard deviation": 25.11
"Standard deviation": 24.7
},
"Number of turns": {
"Total # non-null values": 30,
"Total # non-null values": 960,
"Average": 1.0,
"Min": 1.0,
"Max": 1.0,
@@ -44,10 +44,10 @@
},
"question": {
"unique_count": 30,
"total_count": 30
"total_count": 960
},
"expected_answer": {
"unique_count": 29,
"total_count": 30
"total_count": 960
}
}
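The updated counts are internally consistent: 30 unique AIME-24 problems repeated 32 times (`num_repeats: 32`) gives 30 × 32 = 960 rows, matching the new totals above.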
@@ -23,7 +23,7 @@
"Average": 85.63,
"Min": 45.0,
"Max": 322.0,
"Median": 80.41,
"Median": 80.33,
"Standard deviation": 26.94
},
"Number of turns": {
3 changes: 3 additions & 0 deletions responses_api_agents/simple_agent/app.py
@@ -101,6 +101,9 @@ async def responses(
output = model_response.output
new_outputs.extend(output)

if model_response.incomplete_details and model_response.incomplete_details.reason == "max_output_tokens":
break

all_fn_calls: List[NeMoGymResponseFunctionToolCall] = [o for o in output if o.type == "function_call"]
all_output_messages: List[NeMoGymResponseOutputMessage] = [
o for o in output if o.type == "message" and o.role == "assistant"
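With this guard in place, the agent stops issuing follow-up turns once the model reports that it ran out of output tokens. A minimal sketch of the resulting loop shape, assuming a Responses-API-style interface; the function and helper names below are illustrative, not the actual simple_agent code:

```python
from typing import Any, Awaitable, Callable, List


async def run_agent_loop(
    call_model: Callable[[Any], Awaitable[Any]],
    execute_tools: Callable[[List[Any]], Awaitable[Any]],
    request_body: Any,
    max_turns: int = 10,
) -> List[Any]:
    """Illustrative agent loop; not the actual simple_agent implementation."""
    outputs: List[Any] = []
    for _ in range(max_turns):
        model_response = await call_model(request_body)
        outputs.extend(model_response.output)

        # The new guard: if the model hit its output-token budget, further
        # tool-calling turns would only compound the truncation, so stop here.
        if (
            model_response.incomplete_details
            and model_response.incomplete_details.reason == "max_output_tokens"
        ):
            break

        function_calls = [o for o in model_response.output if o.type == "function_call"]
        if not function_calls:
            break  # no tool calls left to run; the episode is complete
        request_body = await execute_tools(function_calls)
    return outputs
```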
50 changes: 8 additions & 42 deletions responses_api_models/vllm_model/app.py
@@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from copy import deepcopy
from time import time
from typing import ClassVar, Dict, List, Optional, Tuple, Union
from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union
from uuid import uuid4

from aiohttp.client_exceptions import ClientResponseError
from fastapi import Request
from pydantic import BaseModel, Field

@@ -34,7 +34,6 @@
NeMoGymChatCompletionAssistantMessageParam,
NeMoGymChatCompletionCreateParamsNonStreaming,
NeMoGymChatCompletionDeveloperMessageParam,
NeMoGymChatCompletionMessage,
NeMoGymChatCompletionMessageParam,
NeMoGymChatCompletionMessageToolCallFunctionParam,
NeMoGymChatCompletionMessageToolCallParam,
@@ -66,6 +65,8 @@ class VLLMModelConfig(BaseResponsesAPIModelConfig):
uses_reasoning_parser: bool
replace_developer_role_with_system: bool = False

chat_template_kwargs: Optional[Dict[str, Any]] = None

def model_post_init(self, context):
if isinstance(self.base_url, str):
self.base_url = [self.base_url]
@@ -132,6 +133,7 @@ async def responses(
metadata=body.metadata,
instructions=body.instructions,
user=body.user,
incomplete_details={"reason": "max_output_tokens"} if choice.finish_reason == "length" else None,
)

async def chat_completions(
@@ -144,6 +146,8 @@ async def chat_completions(

body_dict = body.model_dump(exclude_unset=True)
body_dict["model"] = self.config.model
if self.config.chat_template_kwargs:
body_dict["chat_template_kwargs"] = deepcopy(self.config.chat_template_kwargs)

session_id = request.session[SESSION_ID_KEY]
if session_id not in self._session_id_to_client:
@@ -198,45 +202,7 @@ async def chat_completions(
else:
raise NotImplementedError

try:
chat_completion_dict = await client.create_chat_completion(**create_params)
except ClientResponseError as e:
"""
Example messages for out of context length:

1. https://github.com/vllm-project/vllm/blob/685c99ee77b4818dcdd15b30fe0e0eff0d5d22ec/vllm/entrypoints/openai/serving_engine.py#L914
```json
{"object":"error","message":"This model\'s maximum context length is 32768 tokens. However, you requested 32818 tokens in the messages, Please reduce the length of the messages. None","type":"BadRequestError","param":null,"code":400}
```
2. https://github.com/vllm-project/vllm/blob/685c99ee77b4818dcdd15b30fe0e0eff0d5d22ec/vllm/entrypoints/openai/serving_engine.py#L940
3. https://github.com/vllm-project/vllm/blob/685c99ee77b4818dcdd15b30fe0e0eff0d5d22ec/vllm/entrypoints/openai/serving_engine.py#L948
4. https://github.com/vllm-project/vllm/blob/685c99ee77b4818dcdd15b30fe0e0eff0d5d22ec/vllm/sampling_params.py#L463
"""
result_content_str = e.response_content.decode()

is_out_of_context_length = e.status == 400 and (
"context length" in result_content_str or "max_tokens" in result_content_str
)
if is_out_of_context_length:
return NeMoGymChatCompletion(
id="chtcmpl-123",
object="chat.completion",
created=int(time()),
model=self.config.model,
choices=[
NeMoGymChoice(
index=0,
finish_reason="stop",
message=NeMoGymChatCompletionMessage(
role="assistant",
content=None,
tool_calls=None,
),
)
],
)
else:
raise e
chat_completion_dict = await client.create_chat_completion(**create_params)

choice_dict = chat_completion_dict["choices"][0]
if self.config.uses_reasoning_parser:
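Two behavioural changes in `vllm_model/app.py` are worth spelling out: the optional `chat_template_kwargs` from the config is forwarded on every chat-completions request, and a `finish_reason` of `"length"` is surfaced to callers as `incomplete_details.reason == "max_output_tokens"` rather than being masked by the removed out-of-context fallback. A condensed sketch of both; the config keys and values shown are hypothetical examples (e.g. a Qwen-style `enable_thinking` flag), not part of this diff:

```python
from copy import deepcopy
from typing import Any, Dict, Optional

# Hypothetical config fragment; any keys understood by the model's chat
# template could appear here (enable_thinking is just an example).
config_chat_template_kwargs: Optional[Dict[str, Any]] = {"enable_thinking": False}


def build_request_body(body_dict: Dict[str, Any]) -> Dict[str, Any]:
    # Forward the configured kwargs so vLLM applies them when rendering
    # the chat template for this request.
    if config_chat_template_kwargs:
        body_dict["chat_template_kwargs"] = deepcopy(config_chat_template_kwargs)
    return body_dict


def incomplete_details_from(finish_reason: str) -> Optional[Dict[str, str]]:
    # Map vLLM's finish_reason onto the Responses API field the agent checks.
    return {"reason": "max_output_tokens"} if finish_reason == "length" else None
```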