 import os
-import requests
 import numpy as np
 from providers.base_provider import ProviderInterface
 from time import perf_counter as timer
-import re
+from azure.ai.inference import ChatCompletionsClient
+from azure.ai.inference.models import SystemMessage, UserMessage
+from azure.core.credentials import AzureKeyCredential


 class Azure(ProviderInterface):
@@ -17,51 +18,63 @@ def __init__(self):
         # Map model names to Azure model IDs
         self.model_map = {
             # "mistral-7b-instruct-v0.1": "mistral-7b-instruct-v0.1",
-            "meta-llama-3.1-8b-instruct": "Meta-Llama-3-1-8B-Instruct-fyp",
+            "meta-llama-3.1-8b-instruct": "Meta-Llama-3.1-8B-Instruct-fyp",
             "meta-llama-3.1-70b-instruct": "Meta-Llama-3-1-70B-Instruct-fyp",
             "mistral-large": "Mistral-Large-2411-yatcd",
             "common-model": "Mistral-Large-2411-yatcd",
         }

+        self._client = None
+
+    def _ensure_client(self):
+        """
+        Create the Azure client only when first used.
+        Raise a clear error if env vars are missing.
+        """
+        if self._client is not None:
+            return
+
+        if not self.api_key or not isinstance(self.api_key, str):
+            raise RuntimeError(
+                "Azure provider misconfigured: AZURE_AI_API_KEY is missing or not a string."
+            )
+        if not self.endpoint:
+            raise RuntimeError(
+                "Azure provider misconfigured: AZURE_AI_ENDPOINT is missing."
+            )
+
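+        # Key-based auth; the (preview) API version is pinned explicitly
+        # below rather than left to the SDK default.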
+        credential = AzureKeyCredential(self.api_key)
+        self._client = ChatCompletionsClient(
+            endpoint=self.endpoint,
+            credential=credential,
+            api_version="2024-05-01-preview",
+        )
+
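+    # Configuration sketch (assumption: __init__, elided from this diff,
+    # reads the env vars named in the errors above; the endpoint shape
+    # mirrors the host hard-coded in the removed requests-based code):
+    #   export AZURE_AI_API_KEY="<key>"
+    #   export AZURE_AI_ENDPOINT="https://<deployment>.eastus.models.ai.azure.com"
+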
     def get_model_name(self, model):
         """Retrieve the model name based on the input key."""
         return self.model_map.get(model, None)

     def perform_inference(self, model, prompt, max_output=100, verbosity=True):
         """Performs a non-streaming inference request to Azure."""
         try:
+            self._ensure_client()
+            client = self._client
             model_id = self.get_model_name(model)
             if model_id is None:
                 print(f"Model {model} not available.")
                 return None
-            endpoint = (self.endpoint or "https://example.invalid").rstrip("/")
-            api_key = self.api_key
             start_time = timer()
-            response = requests.post(
-                f"{endpoint}",
-                headers={
-                    "Authorization": f"Bearer {api_key}",
-                    "Content-Type": "application/json",
-                },
-                json={
-                    "model": model_id,
-                    "messages": [
-                        {"role": "system", "content": self.system_prompt},
-                        {"role": "user", "content": prompt},
-                    ],
-                    "max_tokens": max_output,
-                },
-                timeout=500,
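+            # Same request, now issued through the azure-ai-inference SDK client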
+            response = client.complete(
+                messages=[
+                    SystemMessage(content=self.system_prompt),
+                    UserMessage(content=prompt),
+                ],
+                max_tokens=max_output,
+                model=model_id,
             )
             elapsed = timer() - start_time
-            if response.status_code != 200:
-                print(f"Error: {response.status_code} - {response.text}")
-                return None

-            # Parse and display response
-            inference = response.json()
-
-            usage = inference.get("usage")
+            usage = response.get("usage")
             total_tokens = usage.get("completion_tokens") or 0
             tbt = elapsed / max(total_tokens, 1)
             tps = total_tokens / elapsed
@@ -73,8 +86,8 @@ def perform_inference(self, model, prompt, max_output=100, verbosity=True):

             if verbosity:
                 print(f"Tokens: {total_tokens}, Avg TBT: {tbt:.4f}s, TPS: {tps:.2f}")
-                print(f"Response: {inference['choices'][0]['message']['content']}")
-            return inference
+                print(f"Response: {response['choices'][0]['message']['content']}")
+            return response

         except Exception as e:
             print(f"[ERROR] Inference failed for model '{model}': {e}")
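+
+    # Usage sketch (hypothetical caller, not part of this module):
+    #   provider = Azure()
+    #   result = provider.perform_inference("mistral-large", "Say hi", max_output=64)
+    #   # On success, result["choices"][0]["message"]["content"] holds the text.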
@@ -84,72 +97,46 @@ def perform_inference_streaming(
         self, model, prompt, max_output=100, verbosity=True
     ):
         """Performs a streaming inference request to Azure."""
+        self._ensure_client()
+        client = self._client
         model_id = self.get_model_name(model)
-        api_key = self.get_model_api_key(model)
         if model_id is None:
             print(f"Model {model} not available.")
             return None

         inter_token_latencies = []
-        endpoint = f"https://{model_id}.eastus.models.ai.azure.com/chat/completions"
         start_time = timer()
         try:
-            response = requests.post(
-                f"{endpoint}",
-                headers={
-                    "Authorization": f"Bearer {api_key}",
-                    "Content-Type": "application/json",
-                },
-                json={
-                    "messages": [
-                        # {"role": "system", "content": self.system_prompt + "\nThe number appended at the end is not important."},
-                        # {"role": "user", "content": prompt + " " + str(timer())},
-                        {"role": "system", "content": self.system_prompt},
-                        {"role": "user", "content": prompt},
-                    ],
-                    "max_tokens": max_output,
-                    "stream": True,
-                },
-                stream=True,
-                timeout=500,
-            )
-
             first_token_time = None
-            for line in response.iter_lines():
-                if line:
-                    # print(line)
-                    if first_token_time is None:
-                        # print(line)
-                        first_token_time = timer()
-                        ttft = first_token_time - start_time
-                        prev_token_time = first_token_time
-                        if verbosity:
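+            # Stream deltas via the SDK; the context manager closes the
+            # underlying HTTP response when the loop exits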
+            with client.complete(
+                stream=True,
+                messages=[
+                    SystemMessage(content=self.system_prompt),
+                    UserMessage(content=prompt),
+                ],
+                max_tokens=max_output,
+                model=model_id,
+            ) as response:
+                for event in response:
+                    if not event.choices or not event.choices[0].delta:
+                        continue
+
+                    delta = event.choices[0].delta
+                    if delta.content:
+                        if first_token_time is None:
+                            first_token_time = timer()
+                            ttft = first_token_time - start_time
+                            prev_token_time = first_token_time
                             print(f"##### Time to First Token (TTFT): {ttft:.4f} seconds\n")

-                    line_str = line.decode("utf-8").strip()
-
-                    if line_str == "data: [DONE]":
-                        # print(line_str)
-                        # print("here")
-                        total_time = timer() - start_time
-                        break
-
-                    # Capture token timing
-                    time_to_next_token = timer()
-                    inter_token_latency = time_to_next_token - prev_token_time
-                    prev_token_time = time_to_next_token
-                    inter_token_latencies.append(inter_token_latency)
-
-                    # Display token if verbosity is enabled
-                    match = re.search(r'"content"\s*:\s*"(.*?)"', line_str)
-                    if match:
-                        print(match.group(1), end="")
-                    # if verbosity:
-                    #     if len(inter_token_latencies) < 20:
-                    #         print(line_str[19:].split('"')[5], end="")
-                    #     elif len(inter_token_latencies) == 20:
-                    #         print("...")
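+                        # Gap since the previous chunk = inter-token latency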
+                        time_to_next_token = timer()
+                        inter_token_latency = time_to_next_token - prev_token_time
+                        prev_token_time = time_to_next_token
+                        inter_token_latencies.append(inter_token_latency)
+
+                        print(delta.content, end="", flush=True)

+            total_time = timer() - start_time
             # Calculate total metrics

             if verbosity:
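
# A minimal sketch (an assumption, not part of this diff) of the kind of
# summary the `if verbosity:` branch could print from the collected data;
# numpy is already imported at the top of the file:
#
#   avg_itl = float(np.mean(inter_token_latencies)) if inter_token_latencies else 0.0
#   tps = (len(inter_token_latencies) / total_time) if total_time > 0 else 0.0
#   print(f"\nAvg ITL: {avg_itl:.4f}s, TPS: {tps:.2f}, Total time: {total_time:.2f}s")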