# Source code for venice_ai.resources.audio

"""
Venice AI Audio API resources.

This module provides classes for interacting with the Venice AI Audio API,
supporting speech synthesis operations. The module includes both synchronous
and asynchronous interfaces for audio generation with various voice options
and output formats.

The audio API allows for:
- Converting text to natural-sounding speech (text-to-speech)
- Selecting from multiple voice options for speech synthesis
- Controlling speech speed and output format
- Both full and streaming response modes
"""

import httpx
from typing import List, Literal, Optional, Dict, Any, Union, Iterator, AsyncIterator, TYPE_CHECKING, cast, overload
from httpx import Response as HttpxResponse

from .._resource import APIResource, AsyncAPIResource
from ..types.audio import Voice, ResponseFormat, VoiceDetail, VoiceList
from ..types.models import ModelList as SDKModelList
from ..exceptions import _make_status_error

if TYPE_CHECKING:
    from .._client import VeniceClient
    from .._async_client import AsyncVeniceClient


# Lookup table used when parsing voice IDs: the key is the lowercased first
# character of the voice-ID prefix (the text before the first "_"), and the
# value names the language and accent reported for voices with that prefix.
# NOTE(review): "h" is an explicit placeholder that currently duplicates "g";
# the Kokoro voice list appears to use "h" for Hindi -- confirm upstream
# before refining.
REGION_LANGUAGE_MAPPING: Dict[str, Dict[str, str]] = {
    code: {"language": language, "accent": accent}
    for code, language, accent in (
        ("a", "English", "American"),
        ("b", "English", "British"),
        ("c", "English", "Canadian"),
        ("d", "German", "Standard"),
        ("e", "Spanish", "European Standard"),
        ("f", "French", "Standard"),
        ("g", "English", "General"),
        ("h", "English", "General"),  # Placeholder, can be refined
        ("i", "Italian", "Standard"),
        ("j", "Japanese", "Standard"),
        ("k", "Korean", "Standard"),
        ("p", "Portuguese", "Standard"),
        ("r", "Russian", "Standard"),
        ("s", "English", "Scottish"),
        ("u", "English", "US"),  # Alternative/Specific US
        ("w", "English", "Welsh"),
        ("x", "English", "Australian"),
        ("y", "English", "Indian"),
        ("z", "Mandarin Chinese", "Standard"),
    )
}



# [docs] (Sphinx viewcode link marker; not part of the module source)
class Audio(APIResource):
    """
    Synchronous access to the text-to-speech (TTS) audio operations.

    Exposes ``create_speech`` for converting text to audio, returned either
    whole or as a stream of chunks, and ``get_voices`` for listing the voices
    advertised by the TTS models with optional filtering by model, gender and
    region code.

    :param client: The Venice AI client instance used for making API requests.
    :type client: VeniceClient

    .. note::
        This class is typically accessed through the ``VeniceClient.audio`` property
        rather than being instantiated directly.
    """
    
    # Typing-only overload: with ``stream`` omitted or ``False`` the call is
    # typed as returning the complete audio as ``bytes``.
    @overload
    def create_speech(
        self,
        *,
        input: str,
        model: str,
        voice: Union[str, Voice],
        response_format: Optional[Union[str, ResponseFormat]] = "mp3",
        speed: Optional[float] = 1.0,
        stream: Literal[False] = False,
        timeout: Optional[Union[float, httpx.Timeout]] = None,
    ) -> bytes: ...

    # Typing-only overload: with ``stream=True`` the call is typed as
    # returning an iterator of audio chunks. ``stream`` has no default here,
    # which is valid because every parameter is keyword-only.
    @overload
    def create_speech(
        self,
        *,
        input: str,
        model: str,
        voice: Union[str, Voice],
        response_format: Optional[Union[str, ResponseFormat]] = "mp3",
        speed: Optional[float] = 1.0,
        stream: Literal[True],
        timeout: Optional[Union[float, httpx.Timeout]] = None,
    ) -> Iterator[bytes]: ...


    # [docs] (Sphinx viewcode link marker; not part of the module source)
    def create_speech(
        self,
        *,
        input: str,
        model: str,
        voice: Union[str, Voice],
        response_format: Optional[Union[str, ResponseFormat]] = "mp3",
        speed: Optional[float] = 1.0,
        stream: bool = False,
        timeout: Optional[Union[float, httpx.Timeout]] = None,
    ) -> Union[bytes, Iterator[bytes]]:
        """
        Convert text to speech with a POST to ``audio/speech``.

        The request body carries ``input``, ``model``, ``voice``,
        ``response_format`` and ``speed`` exactly as given, and the request is
        sent with an ``Accept: audio/*`` header. Depending on ``stream`` the
        call is delegated to the client's raw streaming helper or to its
        regular request helper with ``raw_response=True``.

        :param input: The text to convert to speech. Must be non-empty.
        :type input: str
        :param model: ID of the model to use for speech generation (e.g., "tts-kokoro").
        :type model: str
        :param voice: The voice to use, either a string or a
            :class:`~venice_ai.types.audio.Voice` value (e.g., Voice.KOKORO_DEFAULT
            or "kokoro-default").
        :type voice: Union[str, venice_ai.types.audio.Voice]
        :param response_format: Audio format to request, either a string or a
            :class:`~venice_ai.types.audio.ResponseFormat` value. Defaults to "mp3".
        :type response_format: Optional[Union[str, venice_ai.types.audio.ResponseFormat]]
        :param speed: Playback speed of the generated audio. The documented range is
            0.25 to 4.0; the value is forwarded without client-side range checking.
            Defaults to 1.0.
        :type speed: Optional[float]
        :param stream: If True, return an iterator of audio chunks; if False
            (the default), return the complete audio data.
        :type stream: bool
        :param timeout: Request timeout in seconds or an httpx.Timeout object,
            forwarded to the client request helper. ``None`` is passed through
            unchanged (presumably selecting the client's default timeout).
        :type timeout: Optional[Union[float, httpx.Timeout]]

        :return: The audio data as bytes when ``stream`` is False, otherwise an
            Iterator yielding chunks of audio data as bytes.
        :rtype: Union[bytes, Iterator[bytes]]

        :raises ValueError: If ``input`` is empty.
        :raises venice_ai.exceptions.APIError: If the API request fails
            (raised by the underlying client helpers, not by this method directly).

        Example:
            Non-streaming text-to-speech:

            .. code-block:: python

                from venice_ai import VeniceClient
                from venice_ai.types.audio import Voice

                client = VeniceClient()

                audio_bytes = client.audio.create_speech(
                    model="tts-kokoro",
                    input="Hello, this is a test.",
                    voice=Voice.KOKORO_DEFAULT,
                )
                with open("speech.mp3", "wb") as f:
                    f.write(audio_bytes)

                # String literals, a different format and a custom speed
                audio_bytes = client.audio.create_speech(
                    model="tts-kokoro",
                    input="Hello with different settings.",
                    voice="kokoro-default",
                    response_format="wav",
                    speed=1.2,
                )

            Streaming text-to-speech:

            .. code-block:: python

                stream = client.audio.create_speech(
                    model="tts-kokoro",
                    input="This is a streamed audio example.",
                    voice="kokoro-default",
                    stream=True,
                )
                with open("streamed_speech.mp3", "wb") as f:
                    for chunk in stream:
                        f.write(chunk)
        """
        # Reject empty text before any request is made.
        if not input:
            raise ValueError("Input text cannot be empty for speech generation")

        payload = {
            "input": input,
            "model": model,
            "voice": voice,
            "response_format": response_format,
            "speed": speed,
        }
        accept_headers = {"Accept": "audio/*"}

        if not stream:
            # Whole-response mode: the regular request helper, asked for the
            # raw (non-JSON) response.
            return self._client._request(
                method="POST",
                path="audio/speech",
                json_data=payload,
                headers=accept_headers,
                raw_response=True,
                timeout=timeout,
            )

        # Streaming mode: the client's raw-bytes streaming helper.
        return self._client._stream_request_raw(
            method="POST",
            path="audio/speech",
            json_data=payload,
            headers=accept_headers,
            timeout=timeout,
        )



    # [docs] (Sphinx viewcode link marker; not part of the module source)
    def get_voices(
        self,
        *,
        model_id: Optional[str] = None,
        gender: Optional[Literal["male", "female", "unknown"]] = None,
        region_code: Optional[str] = None,  # e.g., "af", "zm"
    ) -> VoiceList:
        """
        Lists available text-to-speech (TTS) voices, with optional filtering.

        Voices are collected from ``model_spec["voices"]`` of every model returned
        by ``self._client.models.list(type="tts")``. Each voice ID is parsed as
        follows: when the ID contains ``"_"`` and the text before the first
        ``"_"`` is at least two characters long, that text becomes the region
        code, its second character selects the gender (``m``/``f``, otherwise
        "unknown") and its first character is looked up in
        ``REGION_LANGUAGE_MAPPING`` for language and accent. IDs that do not
        match this shape get gender "unknown" and ``None`` for the other fields.

        NOTE(review): prefixes longer than two characters are accepted as well
        (e.g. ``"kokoro_default"`` would yield region code ``"kokoro"``) --
        confirm whether that is intended.

        Args:
            model_id: Optional. If provided, only voices for this specific TTS model ID
                will be returned.
            gender: Optional. Filter voices by gender ("male", "female", "unknown").
                Gender is inferred from the voice ID prefix.
            region_code: Optional. Filter voices by the raw region/language
                prefix from the voice ID (e.g., "af", "zm"). The comparison is
                exact and case-sensitive.

        Returns:
            A VoiceList dict with ``object`` set to "list", the matching
            VoiceDetail entries under ``data``, and the filters that were
            applied (``model_id_filter``, ``gender_filter``,
            ``region_code_filter``).

        Raises:
            venice_ai.exceptions.APIError: If an API error occurs during the request
                to the underlying models endpoint.
        """
        all_voice_details: List[VoiceDetail] = []

        # self._client.models is the Models resource instance.
        sdk_models_list_response: SDKModelList = self._client.models.list(type="tts")

        # ``or`` instead of a ``.get`` default: a key that is present but null
        # would otherwise return None and break the iteration below.
        for model_data in sdk_models_list_response.get("data") or []:
            current_model_id = cast(Optional[str], model_data.get("id"))

            if not current_model_id:  # Skip if model has no ID
                continue

            # Apply model_id filter if provided
            if model_id is not None and current_model_id != model_id:
                continue

            # Same null-tolerant lookups for the spec and its voice list.
            model_spec = cast(Dict[str, Any], model_data.get("model_spec") or {})
            voice_ids_from_api = cast(List[str], model_spec.get("voices") or [])

            for raw_voice_id in voice_ids_from_api:
                parsed_gender: Optional[Literal["male", "female", "unknown"]] = "unknown"
                parsed_region_code: Optional[str] = None
                parsed_language: Optional[str] = None
                parsed_accent: Optional[str] = None

                # Split once; ``separator`` is empty when the ID has no "_".
                prefix, separator, _ = raw_voice_id.partition("_")
                if separator and len(prefix) >= 2:
                    parsed_region_code = prefix

                    # Second character of the prefix encodes the gender.
                    gender_char = prefix[1].lower()
                    if gender_char == "m":
                        parsed_gender = "male"
                    elif gender_char == "f":
                        parsed_gender = "female"

                    # First character of the prefix selects language/accent.
                    lang_info = REGION_LANGUAGE_MAPPING.get(prefix[0].lower())
                    if lang_info:
                        parsed_language = lang_info["language"]
                        parsed_accent = lang_info["accent"]

                # Apply gender filter
                if gender is not None and parsed_gender != gender:
                    continue

                # Apply region_code filter
                if region_code is not None and parsed_region_code != region_code:
                    continue

                voice_detail_obj: VoiceDetail = {
                    "id": raw_voice_id,
                    "model_id": current_model_id,
                    "gender": parsed_gender,
                    "region_code": parsed_region_code,
                    "language": parsed_language,
                    "accent": parsed_accent,
                }
                all_voice_details.append(voice_detail_obj)

        return {
            "object": "list",
            "data": all_voice_details,
            "model_id_filter": model_id,
            "gender_filter": gender,
            "region_code_filter": region_code,
        }





# [docs] (Sphinx viewcode link marker; not part of the module source)
class AsyncAudio(AsyncAPIResource):
    """
    Asynchronous access to the text-to-speech (TTS) audio operations.

    Exposes the coroutine ``create_speech`` for converting text to audio,
    returned either whole or as an async stream of chunks, and the coroutine
    ``get_voices`` for listing the voices advertised by the TTS models with
    optional filtering by model, gender and region code.

    :param client: The async Venice AI client instance used for making API requests.
    :type client: AsyncVeniceClient

    .. note::
        This class is typically accessed through the ``AsyncVeniceClient.audio`` property
        rather than being instantiated directly.
    """
    
    # Typing-only overload: with ``stream`` omitted or ``False`` awaiting the
    # call is typed as producing the complete audio as ``bytes``.
    @overload
    async def create_speech(
        self,
        *,
        input: str,
        model: str,
        voice: Union[str, Voice],
        response_format: Optional[Union[str, ResponseFormat]] = "mp3",
        speed: Optional[float] = 1.0,
        stream: Literal[False] = False,
        timeout: Optional[Union[float, httpx.Timeout]] = None,
    ) -> bytes: ...

    # Typing-only overload: with ``stream=True`` awaiting the call is typed as
    # producing an async iterator of audio chunks. ``stream`` has no default
    # here, which is valid because every parameter is keyword-only.
    @overload
    async def create_speech(
        self,
        *,
        input: str,
        model: str,
        voice: Union[str, Voice],
        response_format: Optional[Union[str, ResponseFormat]] = "mp3",
        speed: Optional[float] = 1.0,
        stream: Literal[True],
        timeout: Optional[Union[float, httpx.Timeout]] = None,
    ) -> AsyncIterator[bytes]: ...


    # [docs] (Sphinx viewcode link marker; not part of the module source)
    async def create_speech(
        self,
        *,
        input: str,
        model: str,
        voice: Union[str, Voice],
        response_format: Optional[Union[str, ResponseFormat]] = "mp3",
        speed: Optional[float] = 1.0,
        stream: bool = False,
        timeout: Optional[Union[float, httpx.Timeout]] = None,
    ) -> Union[bytes, AsyncIterator[bytes]]:
        """
        Generates audio from input text asynchronously.

        Sends a POST to ``audio/speech`` with an ``Accept: audio/*`` header and a
        body carrying ``input``, ``model``, ``voice``, ``response_format`` and
        ``speed``. The coroutine must be awaited in both modes: the awaited
        result is the complete audio as bytes, or an async iterator of chunks
        when ``stream`` is True.

        Responses with status 400 or above are handled identically in both
        modes: the body is read and the error is passed through the client's
        ``_translate_httpx_error_to_api_error`` before being raised.

        :param model: ID of the model to use for speech generation (e.g., "tts-kokoro").
        :type model: str
        :param input: The text to convert to speech. Must be non-empty.
        :type input: str
        :param voice: The voice to use for the generated audio. Can be a string literal
            or a :class:`~venice_ai.types.audio.Voice` enum value (e.g., Voice.KOKORO_DEFAULT
            or "kokoro-default").
        :type voice: Union[str, venice_ai.types.audio.Voice]
        :param response_format: The format to return the audio in. Can be a string literal or a
            :class:`~venice_ai.types.audio.ResponseFormat` enum value. Defaults to "mp3".
        :type response_format: Optional[Union[str, venice_ai.types.audio.ResponseFormat]]
        :param speed: The speed of the generated audio. The documented range is 0.25 to 4.0;
            the value is forwarded without client-side range checking. Defaults to 1.0.
        :type speed: Optional[float]
        :param stream: Whether to stream the audio data. If True, the awaited result is an
            AsyncIterator of audio chunks. If False, it is the complete audio data.
            Defaults to False.
        :type stream: bool
        :param timeout: Request timeout in seconds or an httpx.Timeout object, forwarded in
            the request options. ``None`` is passed through unchanged (presumably
            selecting the client's default timeout).
        :type timeout: Optional[Union[float, httpx.Timeout]]

        :return: If stream is False, the audio data as bytes. If stream is True, an
            AsyncIterator yielding chunks of audio data as bytes (4096-byte chunk size
            requested). The underlying response is closed when iteration finishes or
            the iterator is closed early.
        :rtype: Union[bytes, AsyncIterator[bytes]]

        :raises venice_ai.exceptions.APIError: If the API responds with an error status
            (as produced by the client's error translator).
        :raises ValueError: If the input text is empty.

        Example:
            Basic non-streaming text-to-speech:

            .. code-block:: python

                import asyncio
                from venice_ai import AsyncVeniceClient
                from venice_ai.types.audio import Voice

                async def generate_speech():
                    client = AsyncVeniceClient()

                    # Generate speech with enum values
                    audio_bytes = await client.audio.create_speech(
                        model="tts-kokoro",
                        input="Hello, this is a test.",
                        voice=Voice.KOKORO_DEFAULT
                    )

                    # Save to file
                    with open("speech.mp3", "wb") as f:
                        f.write(audio_bytes)

                    # Using string literals and different format
                    audio_bytes = await client.audio.create_speech(
                        model="tts-kokoro",
                        input="Hello with different settings.",
                        voice="kokoro-default",
                        response_format="wav",
                        speed=1.2
                    )

                asyncio.run(generate_speech())

            Streaming text-to-speech:

            .. code-block:: python

                async def stream_speech():
                    client = AsyncVeniceClient()

                    # The coroutine must be awaited to obtain the async iterator
                    stream = await client.audio.create_speech(
                        model="tts-kokoro",
                        input="This is a streamed audio example.",
                        voice="kokoro-default",
                        stream=True
                    )

                    # Write streamed chunks to file
                    with open("streamed_speech.mp3", "wb") as f:
                        async for chunk in stream:
                            f.write(chunk)

                asyncio.run(stream_speech())
        """
        # Validate input
        if not input:
            raise ValueError("Input text cannot be empty for speech generation")

        # Build request options
        options = {
            "headers": {"Accept": "audio/*"},
            "body": {
                "input": input,
                "model": model,
                "voice": voice,
                "response_format": response_format,
                "speed": speed,
            },
            "timeout": timeout,
        }

        # One request for both modes; only the stream flag differs.
        response: HttpxResponse = await self._arequest_raw_response(
            "POST", "audio/speech", options=options, stream_mode=bool(stream)
        )

        if response.status_code >= 400:
            # Read the body first: the translator needs it for error details,
            # and fully consuming a streamed body releases the connection.
            await response.aread()
            # Create an HTTPStatusError to leverage the client's main translation logic
            http_error = httpx.HTTPStatusError(
                message=f"HTTP {response.status_code} error while making API request to {response.request.url}",
                request=response.request,
                response=response,
            )
            # The body is already buffered at this point, so the error is
            # translated the same way for streaming and non-streaming calls
            # instead of leaking a raw httpx.HTTPStatusError in stream mode.
            raise await self._client._translate_httpx_error_to_api_error(
                http_error, default_request=http_error.request, is_stream=False
            )

        if not stream:
            # Successful non-streaming request: the full body is available.
            return response.content

        async def _iter_audio_chunks() -> AsyncIterator[bytes]:
            # Relay chunks, and always close the response -- including when
            # the consumer stops iterating before the body is exhausted.
            try:
                async for chunk in response.aiter_bytes(chunk_size=4096):
                    yield chunk
            finally:
                await response.aclose()

        return _iter_audio_chunks()



    # [docs] (Sphinx viewcode link marker; not part of the module source)
    async def get_voices(
        self,
        *,
        model_id: Optional[str] = None,
        gender: Optional[Literal["male", "female", "unknown"]] = None,
        region_code: Optional[str] = None,  # e.g., "af", "zm"
    ) -> VoiceList:
        """
        Lists available text-to-speech (TTS) voices asynchronously, with optional filtering.

        Voices are collected from ``model_spec["voices"]`` of every model returned
        by ``await self._client.models.list(type="tts")``. Each voice ID is parsed
        as follows: when the ID contains ``"_"`` and the text before the first
        ``"_"`` is at least two characters long, that text becomes the region
        code, its second character selects the gender (``m``/``f``, otherwise
        "unknown") and its first character is looked up in
        ``REGION_LANGUAGE_MAPPING`` for language and accent. IDs that do not
        match this shape get gender "unknown" and ``None`` for the other fields.

        NOTE(review): prefixes longer than two characters are accepted as well
        (e.g. ``"kokoro_default"`` would yield region code ``"kokoro"``) --
        confirm whether that is intended.

        Args:
            model_id: Optional. If provided, only voices for this specific TTS model ID
                will be returned.
            gender: Optional. Filter voices by gender ("male", "female", "unknown").
                Gender is inferred from the voice ID prefix.
            region_code: Optional. Filter voices by the raw region/language
                prefix from the voice ID (e.g., "af", "zm"). The comparison is
                exact and case-sensitive.

        Returns:
            A VoiceList dict with ``object`` set to "list", the matching
            VoiceDetail entries under ``data``, and the filters that were
            applied (``model_id_filter``, ``gender_filter``,
            ``region_code_filter``).

        Raises:
            venice_ai.exceptions.APIError: If an API error occurs during the request
                to the underlying models endpoint.
        """
        all_voice_details: List[VoiceDetail] = []

        # self._client.models is the AsyncModels resource instance.
        sdk_models_list_response: SDKModelList = await self._client.models.list(type="tts")

        # ``or`` instead of a ``.get`` default: a key that is present but null
        # would otherwise return None and break the iteration below.
        for model_data in sdk_models_list_response.get("data") or []:
            current_model_id = cast(Optional[str], model_data.get("id"))

            if not current_model_id:  # Skip if model has no ID
                continue

            # Apply model_id filter if provided
            if model_id is not None and current_model_id != model_id:
                continue

            # Same null-tolerant lookups for the spec and its voice list.
            model_spec = cast(Dict[str, Any], model_data.get("model_spec") or {})
            voice_ids_from_api = cast(List[str], model_spec.get("voices") or [])

            for raw_voice_id in voice_ids_from_api:
                parsed_gender: Optional[Literal["male", "female", "unknown"]] = "unknown"
                parsed_region_code: Optional[str] = None
                parsed_language: Optional[str] = None
                parsed_accent: Optional[str] = None

                # Split once; ``separator`` is empty when the ID has no "_".
                prefix, separator, _ = raw_voice_id.partition("_")
                if separator and len(prefix) >= 2:
                    parsed_region_code = prefix

                    # Second character of the prefix encodes the gender.
                    gender_char = prefix[1].lower()
                    if gender_char == "m":
                        parsed_gender = "male"
                    elif gender_char == "f":
                        parsed_gender = "female"

                    # First character of the prefix selects language/accent.
                    lang_info = REGION_LANGUAGE_MAPPING.get(prefix[0].lower())
                    if lang_info:
                        parsed_language = lang_info["language"]
                        parsed_accent = lang_info["accent"]

                # Apply gender filter
                if gender is not None and parsed_gender != gender:
                    continue

                # Apply region_code filter
                if region_code is not None and parsed_region_code != region_code:
                    continue

                voice_detail_obj: VoiceDetail = {
                    "id": raw_voice_id,
                    "model_id": current_model_id,
                    "gender": parsed_gender,
                    "region_code": parsed_region_code,
                    "language": parsed_language,
                    "accent": parsed_accent,
                }
                all_voice_details.append(voice_detail_obj)

        return {
            "object": "list",
            "data": all_voice_details,
            "model_id_filter": model_id,
            "gender_filter": gender,
            "region_code_filter": region_code,
        }