AIRND-154 Change TTS interface to use voice_id (#44)

abuchnick-aiola · web-flow · commit a0dd13b78691 · 2025-11-12T15:39:42.000+02:00
* support new tts interface

* update api to use voice_id instead of persona

* fix error handling in tts client

* minor fix in async error handling

* fix aikido gotcha
diff --git a/README.md b/README.md
@@ -230,8 +230,7 @@ def create_file():
 
         audio = client.tts.synthesize(
             text='Hello, how can I help you today?',
-            voice='jess',
-            language='en'
+            voice_id='en_us_male'
         )
 
         with open('./audio.wav', 'wb') as f:
@@ -263,8 +262,7 @@ def stream_tts():
 
         stream = client.tts.stream(
             text='Hello, how can I help you today?',
-            voice='jess',
-            language='en'
+            voice_id='en_us_male'
         )
 
         audio_chunks = []
@@ -330,8 +328,7 @@ async def create_audio_file():
 
         audio = client.tts.synthesize(
             text='Hello, how can I help you today?',
-            voice='jess',
-            language='en'
+            voice_id='en_us_male'
         )
 
         with open('./audio.wav', 'wb') as f:
@@ -365,8 +362,7 @@ async def stream_tts():
 
         stream = client.tts.stream(
             text='Hello, how can I help you today?',
-            voice='jess',
-            language='en'
+            voice_id='en_us_male'
         )
 
         audio_chunks = []
diff --git a/aiola/clients/tts/client.py b/aiola/clients/tts/client.py
@@ -22,14 +22,12 @@ def __init__(self, options: AiolaClientOptions, auth: AuthClient | AsyncAuthClie
     def _make_headers() -> dict[str, str]:
         return {"Accept": "audio/*"}
 
-    def _validate_tts_params(self, text: str, voice: str, language: str | None) -> None:
+    def _validate_tts_params(self, text: str, voice_id: str) -> None:
         """Validate TTS parameters."""
         if not text or not isinstance(text, str):
             raise AiolaValidationError("text must be a non-empty string")
-        if not voice or not isinstance(voice, str):
-            raise AiolaValidationError("voice must be a non-empty string")
-        if language is not None and not isinstance(language, str):
-            raise AiolaValidationError("language must be a string")
+        if not voice_id or not isinstance(voice_id, str):
+            raise AiolaValidationError("voice_id must be a non-empty string")
 
 
 class TtsClient(BaseTts):
@@ -39,9 +37,9 @@ def __init__(self, options: AiolaClientOptions, auth: AuthClient):
         super().__init__(options, auth)
         self._auth: AuthClient = auth  # Type narrowing
 
-    def stream(self, *, text: str, voice: str, language: str | None = None) -> Iterator[bytes]:
+    def stream(self, *, text: str, voice_id: str) -> Iterator[bytes]:
         """Stream synthesized audio in real-time."""
-        self._validate_tts_params(text, voice, language)
+        self._validate_tts_params(text, voice_id)
 
         try:
             # Create authenticated HTTP client and make the streaming request
@@ -52,13 +50,17 @@ def stream(self, *, text: str, voice: str, language: str | None = None) -> Itera
                     "/api/tts/stream",
                     json={
                         "text": text,
-                        "voice": voice,
-                        "language": language,
+                        "voice_id": voice_id,
                     },
                     headers=self._make_headers(),
                 ) as response,
             ):
-                response.raise_for_status()
+                try:
+                    response.raise_for_status()
+                except httpx.HTTPStatusError:
+                    response.read()
+                    raise
+
                 yield from response.iter_bytes()
 
         except AiolaError:
@@ -75,9 +77,9 @@ def stream(self, *, text: str, voice: str, language: str | None = None) -> Itera
         except Exception as exc:
             raise AiolaError(f"TTS streaming failed: {str(exc)}") from exc
 
-    def synthesize(self, *, text: str, voice: str, language: str | None = None) -> Iterator[bytes]:
+    def synthesize(self, *, text: str, voice_id: str) -> Iterator[bytes]:
         """Synthesize audio and return as iterator of bytes."""
-        self._validate_tts_params(text, voice, language)
+        self._validate_tts_params(text, voice_id)
 
         try:
             # Create authenticated HTTP client and make the streaming request
@@ -88,13 +90,17 @@ def synthesize(self, *, text: str, voice: str, language: str | None = None) -> I
                     "/api/tts/synthesize",
                     json={
                         "text": text,
-                        "voice": voice,
-                        "language": language,
+                        "voice_id": voice_id,
                     },
                     headers=self._make_headers(),
                 ) as response,
             ):
-                response.raise_for_status()
+                try:
+                    response.raise_for_status()
+                except httpx.HTTPStatusError:
+                    response.read()
+                    raise
+
                 yield from response.iter_bytes()
 
         except AiolaError:
@@ -119,9 +125,9 @@ def __init__(self, options: AiolaClientOptions, auth: AsyncAuthClient):
         super().__init__(options, auth)
         self._auth: AsyncAuthClient = auth  # Type narrowing
 
-    async def stream(self, *, text: str, voice: str, language: str | None = None) -> AsyncIterator[bytes]:
+    async def stream(self, *, text: str, voice_id: str) -> AsyncIterator[bytes]:
         """Stream synthesized audio in real-time (async)."""
-        self._validate_tts_params(text, voice, language)
+        self._validate_tts_params(text, voice_id)
 
         try:
             # Create authenticated HTTP client and make the streaming request
@@ -133,13 +139,17 @@ async def stream(self, *, text: str, voice: str, language: str | None = None) ->
                     "/api/tts/stream",
                     json={
                         "text": text,
-                        "voice": voice,
-                        "language": language,
+                        "voice_id": voice_id,
                     },
                     headers=self._make_headers(),
                 ) as response,
             ):
-                response.raise_for_status()
+                try:
+                    response.raise_for_status()
+                except httpx.HTTPStatusError:
+                    await response.aread()
+                    raise
+
                 async for chunk in response.aiter_bytes():
                     yield chunk
 
@@ -157,9 +167,9 @@ async def stream(self, *, text: str, voice: str, language: str | None = None) ->
         except Exception as exc:
             raise AiolaError(f"Async TTS streaming failed: {str(exc)}") from exc
 
-    async def synthesize(self, *, text: str, voice: str, language: str | None = None) -> AsyncIterator[bytes]:
+    async def synthesize(self, *, text: str, voice_id: str) -> AsyncIterator[bytes]:
         """Synthesize audio and return as async iterator of bytes."""
-        self._validate_tts_params(text, voice, language)
+        self._validate_tts_params(text, voice_id)
 
         try:
             # Create authenticated HTTP client and make the streaming request
@@ -171,13 +181,17 @@ async def synthesize(self, *, text: str, voice: str, language: str | None = None
                     "/api/tts/synthesize",
                     json={
                         "text": text,
-                        "voice": voice,
-                        "language": language,
+                        "voice_id": voice_id,
                     },
                     headers=self._make_headers(),
                 ) as response,
             ):
-                response.raise_for_status()
+                try:
+                    response.raise_for_status()
+                except httpx.HTTPStatusError:
+                    await response.aread()
+                    raise
+
                 async for chunk in response.aiter_bytes():
                     yield chunk
 
diff --git a/aiola/errors.py b/aiola/errors.py
@@ -16,12 +16,14 @@ def __init__(
         self,
         message: str,
         *,
+        reason: str | None = None,
         status: int | None = None,
         code: str | None = None,
         details: Any | None = None,
     ) -> None:
         super().__init__(message)
         self.message: str = message  # Keep an explicit attribute – ``Exception`` drops it under ``__str__``
+        self.reason: str | None = reason
         self.status: int | None = status
         self.code: str | None = code
         self.details: Any | None = details
@@ -38,27 +40,29 @@ def from_response(cls, response: httpx.Response) -> AiolaError:
         """
 
         message: str = f"Request failed with status {response.status_code}"
+        reason: str | None = None
         code: str | None = None
         details: Any | None = None
 
         try:
             payload = response.json()
             if isinstance(payload, dict):
-                err_payload = payload.get("error", payload)
-                if isinstance(err_payload, dict):
-                    message = err_payload.get("message", message)
-                    code = err_payload.get("code")
-                    details = err_payload.get("details", err_payload)
+                reason = payload.get("message")
+                code = payload.get("code")
+                details = payload.get("details", payload)
         except ValueError:
             # Not JSON – try plain text
-            text = response.text
-            if text:
-                message = text
+            reason = response.text
 
-        return cls(message, status=response.status_code, code=code, details=details)
+        return cls(message, reason=reason, status=response.status_code, code=code, details=details)
 
     def __str__(self) -> str:
-        return self.message
+        parts = [self.message]
+        
+        if self.reason is not None:
+            parts.append(f"Reason: {self.reason}")
+        
+        return " | ".join(parts)
 
 
 class AiolaConnectionError(AiolaError):
diff --git a/examples/tts/README.md b/examples/tts/README.md
@@ -21,8 +21,7 @@ def synthesize_to_file():
         # Step 3: Synthesize audio to a file
         audio_stream = client.tts.synthesize(
             text="Hello, how can I help you today?",
-            voice="jess",
-            language="en"
+            voice_id="en_us_male"
         )
 
         # Step 4: Save to file
@@ -67,8 +66,7 @@ def main():
     def synthesize_to_file():
         audio_stream = client.tts.synthesize(
             text="Hello, how can I help you today?",
-            voice="jess",
-            language="en"
+            voice_id="en_us_male"
         )
 
         # Save to file
@@ -83,8 +81,7 @@ def main():
     def stream_tts():
         stream = client.tts.stream(
             text="Hello, this is a streaming example of text-to-speech synthesis.",
-            voice="jess",
-            language="en"
+            voice_id="en_us_male"
         )
 
         # Collect audio chunks
@@ -122,7 +119,7 @@ async def async_tts_example():
     result = await AsyncAiolaClient.grant_token(api_key=os.getenv("AIOLA_API_KEY"))
     client = AsyncAiolaClient(access_token=result.access_token)
 
-    response = await client.tts.synthesize(text="Hello world", voice="jess", language="en")
+    response = await client.tts.synthesize(text="Hello world", voice_id="en_us_male")
 
     async for chunk in response:
         # Process audio chunk
diff --git a/examples/tts/async_tts.py b/examples/tts/async_tts.py
@@ -17,8 +17,7 @@ async def create_audio_file():
         # Step 3: Generate audio
         audio = client.tts.synthesize(
             text='Hello, how can I help you today?',
-            voice='jess',
-            language='en'
+            voice_id='en_us_male'
         )
 
         file_path = os.path.join(os.path.dirname(__file__), "async_audio.wav")
@@ -48,8 +47,7 @@ async def stream_tts():
         # Step 3: Stream audio
         stream = client.tts.stream(
             text='Hello, how can I help you today?',
-            voice='jess',
-            language='en'
+            voice_id='en_us_male'
         )
 
         audio_chunks = []
diff --git a/examples/tts/tts_file.py b/examples/tts/tts_file.py
@@ -16,8 +16,7 @@ def create_file():
         # Step 3: Generate audio
         audio = client.tts.synthesize(
             text='Hello, how can I help you today?',
-            voice='jess',
-            language='en'
+            voice_id='en_us_male'
         )
 
         output_path = os.path.join(os.path.dirname(__file__), "output_audio.wav")
diff --git a/examples/tts/tts_stream.py b/examples/tts/tts_stream.py
@@ -16,8 +16,7 @@ def stream_tts():
         # Step 3: Stream audio
         stream = client.tts.stream(
             text='Hello, how can I help you today?',
-            voice='jess',
-            language='en'
+            voice_id='en_us_male'
         )
 
         audio_chunks = []
diff --git a/tests/unit/tts/test_tts_client.py b/tests/unit/tts/test_tts_client.py
@@ -14,21 +14,21 @@ def test_tts_stream_makes_expected_http_request(dummy_http):
     """``TtsClient.stream`` should send POST /api/tts/stream and yield audio chunks."""
 
     client = AiolaClient(api_key="k", base_url="https://tts.example")
-    chunks = list(client.tts.stream(text="Hello", voice="voiceA"))
+    chunks = list(client.tts.stream(text="Hello", voice_id="en_us_male"))
 
     assert chunks == [b"chunk1", b"chunk2"]
 
     recorded = dummy_http.stream_calls.pop()
     assert recorded["method"] == "POST"
     assert recorded["path"] == "/api/tts/stream"
-    assert recorded["json"] == {"text": "Hello", "voice": "voiceA", "language": None}
+    assert recorded["json"] == {"text": "Hello", "voice_id": "en_us_male"}
 
 
 def test_tts_synthesize_makes_expected_http_request(dummy_http):
     """``TtsClient.synthesize`` must hit POST /api/tts/synthesize (non-stream variant)."""
 
     client = AiolaClient(api_key="k")
-    list(client.tts.synthesize(text="Hi", voice="B"))  # exhaust generator
+    list(client.tts.synthesize(text="Hi", voice_id="de_female"))  # exhaust generator
 
     recorded = dummy_http.stream_calls.pop()
     assert recorded["path"] == "/api/tts/synthesize"
@@ -44,7 +44,7 @@ async def test_async_tts_stream(dummy_async_http):
     """``AsyncTtsClient.stream`` should work similarly using awaitables."""
 
     client = AsyncAiolaClient(api_key="k")
-    chunks = [c async for c in client.tts.stream(text="Async", voice="v")]  # exhaust
+    chunks = [c async for c in client.tts.stream(text="Async", voice_id="en_uk_female")]  # exhaust
 
     assert chunks == [b"chunk1", b"chunk2"]
 
@@ -57,7 +57,7 @@ async def test_async_tts_synthesize(dummy_async_http):
     """``AsyncTtsClient.synthesize`` POSTs to the /api/tts/synthesize endpoint."""
 
     client = AsyncAiolaClient(api_key="k")
-    _ = [c async for c in client.tts.synthesize(text="Async", voice="v")]
+    _ = [c async for c in client.tts.synthesize(text="Async", voice_id="de_female")]
 
     recorded = dummy_async_http.stream_calls.pop()
     assert recorded["path"] == "/api/tts/synthesize"
@@ -95,7 +95,7 @@ def mock_create_authenticated_client(*args, **kwargs):
 
     # Now wrapped in AiolaError instead of raw RuntimeError
     with pytest.raises(AiolaError, match="TTS streaming failed"):
-        list(client.tts.stream(text="x", voice="v"))
+        list(client.tts.stream(text="x", voice_id="en_us_male"))
 
 
 @pytest.mark.anyio
@@ -129,5 +129,5 @@ async def mock_create_async_authenticated_client(*args, **kwargs):
 
     # Now wrapped in AiolaError instead of raw RuntimeError
     with pytest.raises(AiolaError, match="Async TTS streaming failed"):
-        async for _ in client.tts.stream(text="fail", voice="v"):
+        async for _ in client.tts.stream(text="fail", voice_id="en_us_male"):
             pass
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -16,8 +16,7 @@ def create_file():`
`16`	`16`	`# Step 3: Generate audio`
`17`	`17`	`audio = client.tts.synthesize(`
`18`	`18`	`text='Hello, how can I help you today?',`
`19`		`- voice='jess',`
`20`		`- language='en'`
	`19`	`+ voice_id='en_us_male'`
`21`	`20`	`)`
`22`	`21`
`23`	`22`	`output_path = os.path.join(os.path.dirname(__file__), "output_audio.wav")`
Original file line number	Diff line number	Diff line change
`@@ -16,8 +16,7 @@ def stream_tts():`
`16`	`16`	`# Step 3: Stream audio`
`17`	`17`	`stream = client.tts.stream(`
`18`	`18`	`text='Hello, how can I help you today?',`
`19`		`- voice='jess',`
`20`		`- language='en'`
	`19`	`+ voice_id='en_us_male'`
`21`	`20`	`)`
`22`	`21`
`23`	`22`	`audio_chunks = []`