Source code for venice_ai.resources.audio

"""
Venice AI Audio API resources.

This module provides classes for interacting with the Venice AI Audio API,
supporting speech synthesis operations. The module includes both synchronous
and asynchronous interfaces for audio generation with various voice options
and output formats.

The audio API allows for:
- Converting text to natural-sounding speech (text-to-speech)
- Selecting from multiple voice options for speech synthesis
- Controlling speech speed and output format
- Both full and streaming response modes
"""

import httpx
from typing import List, Literal, Optional, Dict, Any, Union, Iterator, AsyncIterator, TYPE_CHECKING, cast, overload
from httpx import Response as HttpxResponse

from .._resource import APIResource, AsyncAPIResource
from ..types.audio import Voice, ResponseFormat, VoiceDetail, VoiceList
from ..types.models import ModelList as SDKModelList
from ..exceptions import _make_status_error

if TYPE_CHECKING:
    from .._client import VeniceClient
    from .._async_client import AsyncVeniceClient


REGION_LANGUAGE_MAPPING: Dict[str, Dict[str, str]] = {
    "a": {"language": "English", "accent": "American"},
    "b": {"language": "English", "accent": "British"},
    "c": {"language": "English", "accent": "Canadian"},
    "d": {"language": "German", "accent": "Standard"},
    "e": {"language": "Spanish", "accent": "European Standard"},
    "f": {"language": "French", "accent": "Standard"},
    "g": {"language": "English", "accent": "General"},
    "h": {"language": "English", "accent": "General"}, # Placeholder, can be refined
    "i": {"language": "Italian", "accent": "Standard"},
    "j": {"language": "Japanese", "accent": "Standard"},
    "k": {"language": "Korean", "accent": "Standard"},
    "p": {"language": "Portuguese", "accent": "Standard"},
    "r": {"language": "Russian", "accent": "Standard"},
    "s": {"language": "English", "accent": "Scottish"},
    "u": {"language": "English", "accent": "US"}, # Alternative/Specific US
    "w": {"language": "English", "accent": "Welsh"},
    "x": {"language": "English", "accent": "Australian"},
    "y": {"language": "English", "accent": "Indian"},
    "z": {"language": "Mandarin Chinese", "accent": "Standard"},
}


[docs] class Audio(APIResource): """ Provides access to text-to-speech (TTS) audio generation operations. This class handles synchronous audio generation requests, supporting both streaming and non-streaming modes. It allows conversion of text to natural-sounding speech using various voice models and output formats. :param client: The Venice AI client instance used for making API requests. :type client: VeniceClient .. note:: This class is typically accessed through the ``VeniceClient.audio`` property rather than being instantiated directly. """ @overload def create_speech( self, *, input: str, model: str, voice: Union[str, Voice], response_format: Optional[Union[str, ResponseFormat]] = "mp3", speed: Optional[float] = 1.0, stream: Literal[False] = False, timeout: Optional[Union[float, httpx.Timeout]] = None, ) -> bytes: ... @overload def create_speech( self, *, input: str, model: str, voice: Union[str, Voice], response_format: Optional[Union[str, ResponseFormat]] = "mp3", speed: Optional[float] = 1.0, stream: Literal[True], timeout: Optional[Union[float, httpx.Timeout]] = None, ) -> Iterator[bytes]: ...
[docs] def create_speech( self, *, input: str, model: str, voice: Union[str, Voice], response_format: Optional[Union[str, ResponseFormat]] = "mp3", speed: Optional[float] = 1.0, stream: bool = False, timeout: Optional[Union[float, httpx.Timeout]] = None, ) -> Union[bytes, Iterator[bytes]]: """ Generates audio from input text. Converts the provided text to speech using the specified model and voice. The audio can be returned either as complete binary data or as a stream of audio chunks for real-time processing. :param model: ID of the model to use for speech generation (e.g., "tts-kokoro"). :type model: str :param input: The text to convert to speech. Maximum length varies by model. :type input: str :param voice: The voice to use for the generated audio. Can be a string literal or a :class:`~venice_ai.types.audio.Voice` enum value (e.g., Voice.KOKORO_DEFAULT or "kokoro-default"). :type voice: Union[str, venice_ai.types.audio.Voice] :param response_format: The format to return the audio in. Can be a string literal or a :class:`~venice_ai.types.audio.ResponseFormat` enum value. Defaults to "mp3". :type response_format: Optional[Union[str, venice_ai.types.audio.ResponseFormat]] :param speed: The speed of the generated audio. Select a value from 0.25 to 4.0. Defaults to 1.0. :type speed: Optional[float] :param stream: Whether to stream the audio data. If True, returns an Iterator of audio chunks. If False, returns the complete audio data. Defaults to False. :type stream: Optional[bool] :param timeout: Request timeout in seconds or an httpx.Timeout object. If not provided, uses the client's default timeout. :type timeout: Optional[Union[float, httpx.Timeout]] :return: If stream is False, returns the audio data as bytes. If stream is True, returns an Iterator yielding chunks of audio data as bytes. :rtype: Union[bytes, Iterator[bytes]] :raises venice_ai.exceptions.APIError: If the API request fails. :raises ValueError: If the input text is empty or invalid parameters are provided. Example: Basic non-streaming text-to-speech: .. code-block:: python from venice_ai import VeniceClient from venice_ai.types.audio import Voice, ResponseFormat client = VeniceClient() # Generate speech with enum values audio_bytes = client.audio.create_speech( model="tts-kokoro", input="Hello, this is a test.", voice=Voice.KOKORO_DEFAULT ) # Save to file with open("speech.mp3", "wb") as f: f.write(audio_bytes) # Using string literals and different format audio_bytes = client.audio.create_speech( model="tts-kokoro", input="Hello with different settings.", voice="kokoro-default", response_format="wav", speed=1.2 ) Streaming text-to-speech: .. code-block:: python # Stream audio data stream = client.audio.create_speech( model="tts-kokoro", input="This is a streamed audio example.", voice="kokoro-default", stream=True ) # Write streamed chunks to file with open("streamed_speech.mp3", "wb") as f: for chunk in stream: f.write(chunk) """ # Validate input if not input: raise ValueError("Input text cannot be empty for speech generation") # Build request options options = { "headers": {"Accept": "audio/*"}, "body": { "input": input, "model": model, "voice": voice, "response_format": response_format, "speed": speed, }, "timeout": timeout, } if stream: # Use the client's streaming method for raw bytes return self._client._stream_request_raw( method="POST", path="audio/speech", json_data=options.get("body"), headers=options.get("headers"), timeout=options.get("timeout"), ) else: # Use the client's regular request method with raw_response=True return self._client._request( method="POST", path="audio/speech", json_data=options.get("body"), headers=options.get("headers"), raw_response=True, timeout=options.get("timeout"), )
[docs] def get_voices( self, *, model_id: Optional[str] = None, gender: Optional[Literal["male", "female", "unknown"]] = None, region_code: Optional[str] = None, # e.g., "af", "zm" ) -> VoiceList: """ Lists available text-to-speech (TTS) voices, with optional filtering. This method retrieves information about available voices for TTS models, allowing filtering by model ID, gender, and region code. Args: model_id: Optional. If provided, only voices for this specific TTS model ID will be returned. gender: Optional. Filter voices by gender ("male", "female", "unknown"). Gender is inferred from the voice ID prefix. region_code: Optional. Filter voices by the raw two-letter region/language prefix from the voice ID (e.g., "af" for American Female-sounding, "zm" for Chinese Male-sounding). Returns: A VoiceList object containing a list of VoiceDetail objects that match the filter criteria, along with information about the applied filters. Raises: venice_ai.exceptions.APIError: If an API error occurs during the request to the underlying models endpoint. """ all_voice_details: List[VoiceDetail] = [] # Type hint for clarity, self._client.models is Models resource instance sdk_models_list_response: SDKModelList = self._client.models.list(type="tts") for model_data in sdk_models_list_response.get("data", []): current_model_id = cast(Optional[str], model_data.get("id")) if not current_model_id: # Skip if model has no ID continue # Apply model_id filter if provided if model_id is not None and current_model_id != model_id: continue model_spec = cast(Dict[str, Any], model_data.get("model_spec", {})) voice_ids_from_api = cast(List[str], model_spec.get("voices", [])) for raw_voice_id in voice_ids_from_api: parsed_gender: Optional[Literal["male", "female", "unknown"]] = "unknown" parsed_region_code: Optional[str] = None parsed_language: Optional[str] = None parsed_accent: Optional[str] = None if "_" in raw_voice_id and len(raw_voice_id.split('_')[0]) >= 2: prefix = raw_voice_id.split('_')[0] parsed_region_code = prefix # Infer gender from the second character of the prefix gender_char = prefix[1:2].lower() # ensure lowercase for comparison if gender_char == 'm': parsed_gender = "male" elif gender_char == 'f': parsed_gender = "female" # Infer language and accent from the first character of the prefix lang_char = prefix[0:1].lower() # ensure lowercase for mapping lang_info = REGION_LANGUAGE_MAPPING.get(lang_char) if lang_info: parsed_language = lang_info["language"] parsed_accent = lang_info["accent"] # Apply gender filter if gender is not None and parsed_gender != gender: continue # Apply region_code filter if region_code is not None and parsed_region_code != region_code: continue voice_detail_obj: VoiceDetail = { "id": raw_voice_id, "model_id": current_model_id, "gender": parsed_gender, "region_code": parsed_region_code, "language": parsed_language, "accent": parsed_accent, } all_voice_details.append(voice_detail_obj) return { "object": "list", "data": all_voice_details, "model_id_filter": model_id, "gender_filter": gender, "region_code_filter": region_code, }
[docs] class AsyncAudio(AsyncAPIResource): """ Provides access to text-to-speech (TTS) audio generation operations asynchronously. This class handles asynchronous audio generation requests, supporting both streaming and non-streaming modes. It allows conversion of text to natural-sounding speech using various voice models and output formats in async applications. :param client: The async Venice AI client instance used for making API requests. :type client: AsyncVeniceClient .. note:: This class is typically accessed through the ``AsyncVeniceClient.audio`` property rather than being instantiated directly. """ @overload async def create_speech( self, *, input: str, model: str, voice: Union[str, Voice], response_format: Optional[Union[str, ResponseFormat]] = "mp3", speed: Optional[float] = 1.0, stream: Literal[False] = False, timeout: Optional[Union[float, httpx.Timeout]] = None, ) -> bytes: ... @overload async def create_speech( self, *, input: str, model: str, voice: Union[str, Voice], response_format: Optional[Union[str, ResponseFormat]] = "mp3", speed: Optional[float] = 1.0, stream: Literal[True], timeout: Optional[Union[float, httpx.Timeout]] = None, ) -> AsyncIterator[bytes]: ...
[docs] async def create_speech( self, *, input: str, model: str, voice: Union[str, Voice], response_format: Optional[Union[str, ResponseFormat]] = "mp3", speed: Optional[float] = 1.0, stream: bool = False, timeout: Optional[Union[float, httpx.Timeout]] = None, ) -> Union[bytes, AsyncIterator[bytes]]: """ Generates audio from input text asynchronously. Converts the provided text to speech using the specified model and voice using asynchronous requests. The audio can be returned either as complete binary data or as an async stream of audio chunks for real-time processing. :param model: ID of the model to use for speech generation (e.g., "tts-kokoro"). :type model: str :param input: The text to convert to speech. Maximum length varies by model. :type input: str :param voice: The voice to use for the generated audio. Can be a string literal or a :class:`~venice_ai.types.audio.Voice` enum value (e.g., Voice.KOKORO_DEFAULT or "kokoro-default"). :type voice: Union[str, venice_ai.types.audio.Voice] :param response_format: The format to return the audio in. Can be a string literal or a :class:`~venice_ai.types.audio.ResponseFormat` enum value. Defaults to "mp3". :type response_format: Optional[Union[str, venice_ai.types.audio.ResponseFormat]] :param speed: The speed of the generated audio. Select a value from 0.25 to 4.0. Defaults to 1.0. :type speed: Optional[float] :param stream: Whether to stream the audio data. If True, returns an AsyncIterator of audio chunks. If False, returns the complete audio data. Defaults to False. :type stream: Optional[bool] :param timeout: Request timeout in seconds or an httpx.Timeout object. If not provided, uses the client's default timeout. :type timeout: Optional[Union[float, httpx.Timeout]] :return: If stream is False, returns the audio data as bytes (awaitable). If stream is True, returns an AsyncIterator yielding chunks of audio data as bytes. :rtype: Union[bytes, AsyncIterator[bytes]] :raises venice_ai.exceptions.APIError: If the API request fails. :raises ValueError: If the input text is empty or invalid parameters are provided. Example: Basic non-streaming text-to-speech: .. code-block:: python import asyncio from venice_ai import AsyncVeniceClient from venice_ai.types.audio import Voice, ResponseFormat async def generate_speech(): client = AsyncVeniceClient() # Generate speech with enum values audio_bytes = await client.audio.create_speech( model="tts-kokoro", input="Hello, this is a test.", voice=Voice.KOKORO_DEFAULT ) # Save to file with open("speech.mp3", "wb") as f: f.write(audio_bytes) # Using string literals and different format audio_bytes = await client.audio.create_speech( model="tts-kokoro", input="Hello with different settings.", voice="kokoro-default", response_format="wav", speed=1.2 ) asyncio.run(generate_speech()) Streaming text-to-speech: .. code-block:: python async def stream_speech(): client = AsyncVeniceClient() # Stream audio data stream = client.audio.create_speech( model="tts-kokoro", input="This is a streamed audio example.", voice="kokoro-default", stream=True ) # Write streamed chunks to file with open("streamed_speech.mp3", "wb") as f: async for chunk in stream: f.write(chunk) asyncio.run(stream_speech()) """ # Validate input if not input: raise ValueError("Input text cannot be empty for speech generation") # Build request options options = { "headers": {"Accept": "audio/*"}, "body": { "input": input, "model": model, "voice": voice, "response_format": response_format, "speed": speed, }, "timeout": timeout, } if stream: # Make a request that returns the raw httpx.Response for streaming raw_response: HttpxResponse = await self._arequest_raw_response("POST", "audio/speech", options=options, stream_mode=True) # Check for errors before attempting to stream if raw_response.status_code >= 400: await raw_response.aread() # Consume body to release connection before raising raw_response.raise_for_status() return raw_response.aiter_bytes(chunk_size=4096) else: # For non-streaming, get the raw response and return content raw_response_non_stream: HttpxResponse = await self._arequest_raw_response("POST", "audio/speech", options=options, stream_mode=False) if raw_response_non_stream.status_code >= 400: await raw_response_non_stream.aread() # Ensure the response is read before raising/translating # Create an HTTPStatusError to leverage the client's main translation logic http_error = httpx.HTTPStatusError( message=f"HTTP {raw_response_non_stream.status_code} error while making API request to {raw_response_non_stream.request.url}", request=raw_response_non_stream.request, response=raw_response_non_stream ) # Use the client's translator. # default_request should be the request that led to this error. # is_stream is False for this non-streaming path. raise await self._client._translate_httpx_error_to_api_error(http_error, default_request=http_error.request, is_stream=False) # If not an error, it means the request was successful. return raw_response_non_stream.content
[docs] async def get_voices( self, *, model_id: Optional[str] = None, gender: Optional[Literal["male", "female", "unknown"]] = None, region_code: Optional[str] = None, # e.g., "af", "zm" ) -> VoiceList: """ Lists available text-to-speech (TTS) voices asynchronously, with optional filtering. This method retrieves information about available voices for TTS models, allowing filtering by model ID, gender, and region code. Args: model_id: Optional. If provided, only voices for this specific TTS model ID will be returned. gender: Optional. Filter voices by gender ("male", "female", "unknown"). Gender is inferred from the voice ID prefix. region_code: Optional. Filter voices by the raw two-letter region/language prefix from the voice ID (e.g., "af" for American Female-sounding, "zm" for Chinese Male-sounding). Returns: A VoiceList object containing a list of VoiceDetail objects that match the filter criteria, along with information about the applied filters. Raises: venice_ai.exceptions.APIError: If an API error occurs during the request to the underlying models endpoint. """ all_voice_details: List[VoiceDetail] = [] # Type hint for clarity, self._client.models is AsyncModels resource instance sdk_models_list_response: SDKModelList = await self._client.models.list(type="tts") for model_data in sdk_models_list_response.get("data", []): current_model_id = cast(Optional[str], model_data.get("id")) if not current_model_id: # Skip if model has no ID continue # Apply model_id filter if provided if model_id is not None and current_model_id != model_id: continue model_spec = cast(Dict[str, Any], model_data.get("model_spec", {})) voice_ids_from_api = cast(List[str], model_spec.get("voices", [])) for raw_voice_id in voice_ids_from_api: parsed_gender: Optional[Literal["male", "female", "unknown"]] = "unknown" parsed_region_code: Optional[str] = None parsed_language: Optional[str] = None parsed_accent: Optional[str] = None if "_" in raw_voice_id and len(raw_voice_id.split('_')[0]) >= 2: prefix = raw_voice_id.split('_')[0] parsed_region_code = prefix gender_char = prefix[1:2].lower() if gender_char == 'm': parsed_gender = "male" elif gender_char == 'f': parsed_gender = "female" lang_char = prefix[0:1].lower() lang_info = REGION_LANGUAGE_MAPPING.get(lang_char) if lang_info: parsed_language = lang_info["language"] parsed_accent = lang_info["accent"] if gender is not None and parsed_gender != gender: continue if region_code is not None and parsed_region_code != region_code: continue voice_detail_obj: VoiceDetail = { "id": raw_voice_id, "model_id": current_model_id, "gender": parsed_gender, "region_code": parsed_region_code, "language": parsed_language, "accent": parsed_accent, } all_voice_details.append(voice_detail_obj) return { "object": "list", "data": all_voice_details, "model_id_filter": model_id, "gender_filter": gender, "region_code_filter": region_code, }