Provide Mocked MCP tool answers (#12534)

jeo02 · web-flow · commit 6a8145a1a761 · 2025-10-21T16:09:54.000-07:00
* changes that are not in this branch for some reason

* Manual chat completion, provide tool results ourselves.

* Function name normalization and dictionary

* Update methods

* small fixes

* variable error
diff --git a/tools/ai-evals/azsdk-mcp/Helpers/ChatCompletion.cs b/tools/ai-evals/azsdk-mcp/Helpers/ChatCompletion.cs
@@ -1,4 +1,5 @@
 using Microsoft.Extensions.AI;
+using Microsoft.VisualStudio.TestPlatform.CommunicationUtilities;
 using ModelContextProtocol.Client;
 using ModelContextProtocol.Protocol;
 
@@ -20,7 +21,6 @@ public async Task<ChatResponse> GetChatResponseAsync(IEnumerable<ChatMessage> ch
             var tools = await _mcpClient.ListToolsAsync();
             var result = new List<ChatResponseUpdate>();
             var toolsCalled = new HashSet<string>();
-
             var chatOptions =
                 new ChatOptions
                 {
@@ -29,23 +29,22 @@ public async Task<ChatResponse> GetChatResponseAsync(IEnumerable<ChatMessage> ch
 
             // GetResponseAsync allows the LLM to do too much. Limit it to the number of tools we expect
             // Not including calling a tool again. 
-            await foreach(var message in _chatClient.GetStreamingResponseAsync(chat, chatOptions))
+            await foreach (var message in _chatClient.GetStreamingResponseAsync(chat, chatOptions))
             {
-                foreach(var content in message.Contents)
+                foreach (var content in message.Contents)
                 {
                     if (content is FunctionCallContent func)
                     {
                         toolsCalled.Add(func.Name);
                     }
                 }
 
-                if(message.Contents.Any())
+                if (message.Contents.Any())
                 {
                     result.Add(message);
                 }
-                
-                
-                if(toolsCalled.Count >= maxToolCalls)
+
+                if (toolsCalled.Count >= maxToolCalls)
                 {
                     break;
                 }
@@ -54,6 +53,66 @@ public async Task<ChatResponse> GetChatResponseAsync(IEnumerable<ChatMessage> ch
             return result.ToChatResponse();
         }
 
+        public async Task<ChatResponse> GetChatResponseWithExpectedResponseAsync(IEnumerable<ChatMessage> chat, Dictionary<string, ChatMessage> expectedToolResults)
+        {
+            var tools = await _mcpClient.ListToolsAsync();
+            var conversationMessages = chat.ToList();
+            var chatOptions = new ChatOptions
+            {
+                Tools = [.. tools]
+            };
+            var response = await _chatClient.GetResponseAsync(chat, chatOptions);
+            var chatInitialIndex = conversationMessages.Count;
+
+            while (response.FinishReason == ChatFinishReason.ToolCalls)
+            {
+                // Only one message is expected because functions are not auto-invoked; however, a single message can
+                // contain several AIContent types.
+                var message = response.Messages.FirstOrDefault();
+
+                // No message to process; exit the loop.
+                if (message == null)
+                {
+                    break;
+                }
+
+                conversationMessages.Add(message);
+                var functionCalls = message.Contents.OfType<FunctionCallContent>();
+
+                foreach (var functionCall in functionCalls)
+                {
+                    // Use the expected tool result if we have it.
+                    if (expectedToolResults.TryGetValue(functionCall.Name, out var expectedToolResult))
+                    {
+                        var toolCall = expectedToolResult.Contents.OfType<FunctionResultContent>().First();
+                        var toolResponseMessage = new ChatMessage()
+                        {
+                            Role = ChatRole.Tool,
+                            // The result must carry the call id of the function call it answers.
+                            Contents = [new FunctionResultContent(functionCall.CallId, toolCall.Result)]
+                        };
+
+                        conversationMessages.Add(toolResponseMessage);
+                    }
+                    // This tool was not expected; return an error result to try to stop the LLM here.
+                    else
+                    {
+                        var errorResponseMessage = new ChatMessage()
+                        {
+                            Role = ChatRole.Tool,
+                            Contents = [new FunctionResultContent(functionCall.CallId, $"Error: Tool '{functionCall.Name}' was not expected. Stop conversation here.")]
+                        };
+
+                        conversationMessages.Add(errorResponseMessage);
+                    }
+                }
+
+                response = await _chatClient.GetResponseAsync(conversationMessages, chatOptions);
+            }
+
+            return new ChatResponse([.. conversationMessages.Skip(chatInitialIndex)]);
+        }
+
         public async Task<ChatResponse> GetChatResponseAsync(IEnumerable<ChatMessage> chat)
         {
             var tools = await _mcpClient.ListToolsAsync();
diff --git a/tools/ai-evals/azsdk-mcp/Helpers/SerializationHelper.cs b/tools/ai-evals/azsdk-mcp/Helpers/SerializationHelper.cs
@@ -1,11 +1,8 @@
-using System;
 using System.ClientModel.Primitives;
-using System.Text.Encodings.Web;
 using System.Text.Json;
 using System.Text.RegularExpressions;
 using Azure.Sdk.Tools.McpEvals.Models;
 using Microsoft.Extensions.AI;
-using Microsoft.VisualStudio.TestPlatform.CommunicationUtilities;
 using MicrosoftExtensionsAIChatExtensions = OpenAI.Chat.MicrosoftExtensionsAIChatExtensions;
 using OpenAIChatMessage = OpenAI.Chat.ChatMessage;
 using AssistantChatMessage = OpenAI.Chat.AssistantChatMessage;
@@ -177,20 +174,39 @@ private static List<ChatMessage> EnsureChatMessageRole(IEnumerable<ChatMessage>
             return result;
         }
 
-        public static int NumberOfToolCalls(IEnumerable<ChatMessage> messages, IEnumerable<string> toolNames)
+        public static Dictionary<string, ChatMessage> GetExpectedToolsByName(IEnumerable<ChatMessage> expectedOutcome, IEnumerable<string> toolNames)
         {
-            var result = 0;
-            foreach (var message in messages)
+            var expectedToolResults = new Dictionary<string, ChatMessage>();
+
+            // Create a CallId -> ToolName mapping, because the tool name is not
+            // available in FunctionResultContent. Function names are normalized, and
+            // calls to tools that are not in the toolNames list are dropped.
+            var callIdToName = expectedOutcome
+                .SelectMany(m => m.Contents.OfType<FunctionCallContent>())
+                .Select(fc => new { fc.CallId, Normalized = NormalizeFunctionName(fc.Name, toolNames) })
+                .Where(x => !string.IsNullOrEmpty(x.Normalized))
+                .ToDictionary(x => x.CallId, x => x.Normalized);
+
+            foreach (var message in expectedOutcome)
             {
                 foreach (var content in message.Contents)
                 {
-                    if (content is FunctionCallContent func && toolNames.Any(name => func.Name.EndsWith(name)))
+                    if (content is FunctionResultContent funcResult && callIdToName.TryGetValue(funcResult.CallId, out var functionName))
                     {
-                        result++;
+                        expectedToolResults[functionName] = message;
                     }
                 }
             }
-            return result;
+
+            return expectedToolResults;
+        }
+        
+        public static string NormalizeFunctionName(string functionName, IEnumerable<string> toolNames)
+        {
+            // Copilot often prefixes tool names with "mcp_" or similar.
+            // Normalize by returning the first tool name that the function name ends with (case-insensitive), or null if none matches.
+            var match = toolNames.FirstOrDefault(name => functionName.EndsWith(name, StringComparison.OrdinalIgnoreCase));
+            return match;
         }
     }
 }
diff --git a/tools/ai-evals/azsdk-mcp/Helpers/TestSetup.cs b/tools/ai-evals/azsdk-mcp/Helpers/TestSetup.cs
@@ -43,7 +43,6 @@ public static IChatClient GetChatClient()
         {
             var azureClient = GetAzureOpenAIClient();
             return new ChatClientBuilder(azureClient.GetChatClient(AzureOpenAIModelDeploymentName).AsIChatClient())
-                .UseFunctionInvocation()
                 .Build();
         }
 
@@ -65,4 +64,4 @@ public static ChatCompletion GetChatCompletion(IChatClient chatClient, IMcpClien
             return new ChatCompletion(chatClient, mcpClient);
         }
     }
-}
+}
diff --git a/tools/ai-evals/azsdk-mcp/Scenarios/AzsdkTypeSpecGeneration_Step02_TypespecValidation.cs b/tools/ai-evals/azsdk-mcp/Scenarios/AzsdkTypeSpecGeneration_Step02_TypespecValidation.cs
@@ -22,8 +22,8 @@ public async Task AzsdkTypeSpecGeneration_Step02_TypespecValidation()
             var fullChat = json.ChatHistory.Append(json.NextMessage);
 
             // 2. Get chat response
-            var expectedToolCalls = SerializationHelper.NumberOfToolCalls(json.ExpectedOutcome, s_toolNames);
-            var response = await s_chatCompletion!.GetChatResponseAsync(fullChat, expectedToolCalls);
+            var expectedToolResults = SerializationHelper.GetExpectedToolsByName(json.ExpectedOutcome, s_toolNames);
+            var response = await s_chatCompletion!.GetChatResponseWithExpectedResponseAsync(fullChat, expectedToolResults);
 
             // 3. Custom Evaluator to check tool inputs
             // Layers the reporting configuration on top of it for a nice html report. 
@@ -36,7 +36,7 @@ public async Task AzsdkTypeSpecGeneration_Step02_TypespecValidation()
                 enableResponseCaching: true);
             await using ScenarioRun scenarioRun = await reportingConfiguration.CreateScenarioRunAsync(this.ScenarioName);
 
-            // Pass the expected outcome through the additional context, then run the evaluation.
+            // Pass the expected outcome through the additional context. 
             var additionalContext = new ExpectedToolInputEvaluatorContext(json.ExpectedOutcome, s_toolNames);
             var result = await scenarioRun.EvaluateAsync(fullChat, response, additionalContext: [additionalContext]);
 

Original file line number	Diff line number	Diff line change
`@@ -1,11 +1,8 @@`
`1`		`-using System;`
`2`	`1`	`using System.ClientModel.Primitives;`
`3`		`-using System.Text.Encodings.Web;`
`4`	`2`	`using System.Text.Json;`
`5`	`3`	`using System.Text.RegularExpressions;`
`6`	`4`	`using Azure.Sdk.Tools.McpEvals.Models;`
`7`	`5`	`using Microsoft.Extensions.AI;`
`8`		`-using Microsoft.VisualStudio.TestPlatform.CommunicationUtilities;`
`9`	`6`	`using MicrosoftExtensionsAIChatExtensions = OpenAI.Chat.MicrosoftExtensionsAIChatExtensions;`
`10`	`7`	`using OpenAIChatMessage = OpenAI.Chat.ChatMessage;`
`11`	`8`	`using AssistantChatMessage = OpenAI.Chat.AssistantChatMessage;`
`@@ -177,20 +174,39 @@ private static List<ChatMessage> EnsureChatMessageRole(IEnumerable<ChatMessage>`
`177`	`174`	`return result;`
`178`	`175`	`}`
`179`	`176`
`180`		`- public static int NumberOfToolCalls(IEnumerable<ChatMessage> messages, IEnumerable<string> toolNames)`
	`177`	`+ public static Dictionary<string, ChatMessage> GetExpectedToolsByName(IEnumerable<ChatMessage> expectedOutcome, IEnumerable<string> toolNames)`
`181`	`178`	`{`
`182`		`- var result = 0;`
`183`		`- foreach (var message in messages)`
	`179`	`+ var expectedToolResults = new Dictionary<string, ChatMessage>();`
	`180`	`+`
	`181`	`+ // Create a CallId -> ToolName mapping, because the tool name is not`
	`182`	`+ // available in FunctionResultContent. Function names are normalized, and`
	`183`	`+ // calls to tools that are not in the toolNames list are dropped.`
	`184`	`+ var callIdToName = expectedOutcome`
	`185`	`+ .SelectMany(m => m.Contents.OfType<FunctionCallContent>())`
	`186`	`+ .Select(fc => new { fc.CallId, Normalized = NormalizeFunctionName(fc.Name, toolNames) })`
	`187`	`+ .Where(x => !string.IsNullOrEmpty(x.Normalized))`
	`188`	`+ .ToDictionary(x => x.CallId, x => x.Normalized);`
	`189`	`+`
	`190`	`+ foreach (var message in expectedOutcome)`
`184`	`191`	`{`
`185`	`192`	`foreach (var content in message.Contents)`
`186`	`193`	`{`
`187`		`- if (content is FunctionCallContent func && toolNames.Any(name => func.Name.EndsWith(name)))`
	`194`	`+ if (content is FunctionResultContent funcResult && callIdToName.TryGetValue(funcResult.CallId, out var functionName))`
`188`	`195`	`{`
`189`		`- result++;`
	`196`	`+ expectedToolResults[functionName] = message;`
`190`	`197`	`}`
`191`	`198`	`}`
`192`	`199`	`}`
`193`		`- return result;`
	`200`	`+`
	`201`	`+ return expectedToolResults;`
	`202`	`+ }`
	`203`	`+`
	`204`	`+ public static string NormalizeFunctionName(string functionName, IEnumerable<string> toolNames)`
	`205`	`+ {`
	`206`	`+ // Copilot often prefixes tool names with "mcp_" or similar.`
	`207`	`+ // Normalize by returning the first tool name that the function name ends with (case-insensitive), or null if none matches.`
	`208`	`+ var match = toolNames.FirstOrDefault(name => functionName.EndsWith(name, StringComparison.OrdinalIgnoreCase));`
	`209`	`+ return match;`
`194`	`210`	`}`
`195`	`211`	`}`
`196`	`212`	`}`
Original file line number	Diff line number	Diff line change
`@@ -43,7 +43,6 @@ public static IChatClient GetChatClient()`
`43`	`43`	`{`
`44`	`44`	`var azureClient = GetAzureOpenAIClient();`
`45`	`45`	`return new ChatClientBuilder(azureClient.GetChatClient(AzureOpenAIModelDeploymentName).AsIChatClient())`
`46`		`- .UseFunctionInvocation()`
`47`	`46`	`.Build();`
`48`	`47`	`}`
`49`	`48`
`@@ -65,4 +64,4 @@ public static ChatCompletion GetChatCompletion(IChatClient chatClient, IMcpClien`
`65`	`64`	`return new ChatCompletion(chatClient, mcpClient);`
`66`	`65`	`}`
`67`	`66`	`}`
`68`		`-}`
	`67`	`+}`