Source code for venice_ai.types.audio

"""
Type definitions for Venice AI Audio API.

This module contains TypedDict definitions and Enums for request objects
in the Venice AI Audio API, covering the speech creation endpoint.
"""

from enum import Enum
from typing import List, Literal, Optional, TypedDict

__all__ = [
    "Voice",
    "ResponseFormat",
    "CreateSpeechRequest",
    "VoiceDetail",
    "VoiceList",
]



[docs]
class Voice(str, Enum):
    """
    Available voices for speech generation in the Venice AI Audio API.
    
    This enumeration defines the complete set of voice options that can be used
    when generating speech from text via the text-to-speech endpoint. Each voice
    represents different speaker characteristics including gender, accent, and
    vocal qualities. Voice names follow a pattern indicating language/region
    and gender (e.g., ``af`` for American Female, ``am`` for American Male).
    """
    
    AF_ALLOY = "af_alloy"
    AF_AOEDE = "af_aoede"
    AF_BELLA = "af_bella"
    AF_HEART = "af_heart"
    AF_JADZIA = "af_jadzia"
    AF_JESSICA = "af_jessica"
    AF_KORE = "af_kore"
    AF_NICOLE = "af_nicole"
    AF_NOVA = "af_nova"
    AF_RIVER = "af_river"
    AF_SARAH = "af_sarah"
    AF_SKY = "af_sky"
    AM_ADAM = "am_adam"
    AM_ECHO = "am_echo"
    AM_ERIC = "am_eric"
    AM_FENRIR = "am_fenrir"
    AM_LIAM = "am_liam"
    AM_MICHAEL = "am_michael"
    AM_ONYX = "am_onyx"
    AM_PUCK = "am_puck"
    AM_SANTA = "am_santa"
    BF_ALICE = "bf_alice"
    BF_EMMA = "bf_emma"
    BF_LILY = "bf_lily"
    BM_DANIEL = "bm_daniel"
    BM_FABLE = "bm_fable"
    BM_GEORGE = "bm_george"
    BM_LEWIS = "bm_lewis"
    ZF_XIAOBEI = "zf_xiaobei"
    ZF_XIAONI = "zf_xiaoni"
    ZF_XIAOXIAO = "zf_xiaoxiao"
    ZF_XIAOYI = "zf_xiaoyi"
    ZM_YUNJIAN = "zm_yunjian"
    ZM_YUNXI = "zm_yunxi"
    ZM_YUNXIA = "zm_yunxia"
    ZM_YUNYANG = "zm_yunyang"
    FF_SIWIS = "ff_siwis"
    HF_ALPHA = "hf_alpha"
    HF_BETA = "hf_beta"
    HM_OMEGA = "hm_omega"
    HM_PSI = "hm_psi"
    IF_SARA = "if_sara"
    IM_NICOLA = "im_nicola"
    JF_ALPHA = "jf_alpha"
    JF_GONGITSUNE = "jf_gongitsune"
    JF_NEZUMI = "jf_nezumi"
    JF_TEBUKURO = "jf_tebukuro"
    JM_KUMO = "jm_kumo"
    PF_DORA = "pf_dora"
    PM_ALEX = "pm_alex"
    PM_SANTA = "pm_santa"
    EF_DORA = "ef_dora"
    EM_ALEX = "em_alex"
    EM_SANTA = "em_santa"
    
    # Aliases for backward compatibility with existing code and tests
    NOVA = AF_NOVA  # Alias for AF_NOVA (American Female Nova voice)
    ALLOY = AF_ALLOY  # Alias for AF_ALLOY (American Female Alloy voice)
    ONYX = AM_ONYX  # Alias for AM_ONYX (American Male Onyx voice)
    SHIMMER = AF_RIVER  # Alias for AF_RIVER (approximation for legacy SHIMMER voice)




[docs]
class ResponseFormat(str, Enum):
    """
    Available audio response formats for speech generation output.
    
    This enumeration defines the supported audio file formats that can be
    requested when generating speech from text. The format determines the
    encoding, compression, and quality characteristics of the returned audio
    data from the text-to-speech endpoint. Different formats offer trade-offs
    between file size, quality, and compatibility.
    """
    
    MP3 = "mp3"
    AAC = "aac"
    OPUS = "opus"
    FLAC = "flac"
    WAV = "wav"




[docs]
class CreateSpeechRequest(TypedDict, total=False):
    """
    Request parameters for creating speech audio from text input.
    
    This TypedDict defines the structure for requests to the POST /audio/speech
    endpoint, which converts text into spoken audio using specified voice
    characteristics and output format. The request allows customization of
    voice selection, audio format, playback speed, and user identification
    for tracking purposes.
    
    Attributes:
        model: ID of the model to use for speech generation (e.g., "tts-kokoro").
        input: The text to convert to speech. Maximum length varies by model.
        voice: The voice to use for the generated audio. See :class:`~Voice` for available options.
        response_format: Optional. The format to return the audio in. Defaults to "mp3".
            See :class:`~ResponseFormat` for available formats.
        speed: Optional. The speed of the generated audio. Select a value from 0.25 to 4.0.
            Defaults to 1.0.
        user: Optional. A unique identifier representing the end-user, which can help Venice AI
            to monitor and detect abuse.
    """
    
    model: str  # Required: ID of the speech model to use for generation
    input: str  # Required: The text to convert to speech
    voice: Voice  # Required: Voice to use for the generated audio
    response_format: ResponseFormat  # Optional: Format of returned audio (defaults to "mp3")
    speed: Optional[float]  # Optional: Speed of the generated audio (0.25-4.0, defaults to 1.0)
    user: Optional[str]  # Optional: Unique identifier representing the end-user for monitoring


[docs]
class VoiceDetail(TypedDict):
    """
    Detailed information about a single text-to-speech voice.
    
    This TypedDict represents the structure of voice information returned by the
    get_voices() method. It contains metadata about a voice including its unique
    identifier, associated model, gender characteristics, and regional/language
    information derived from the voice ID.
    
    Attributes:
        id: The unique identifier for the voice as provided by the API (e.g., "af_alloy", "zm_yunjian").
        model_id: The ID of the TTS model this voice is associated with (e.g., "tts-kokoro").
        gender: The perceived gender of the voice, parsed from the voice ID prefix. 
            "unknown" if the prefix is not recognized or ambiguous.
        region_code: The raw two-letter prefix from the voice ID that typically indicates 
            region/language and gender (e.g., "af", "zm").
        language: A descriptive name of the primary language associated with the voice, 
            derived from the region_code (e.g., "American English", "Mandarin Chinese").
        accent: A descriptive name of the accent or locale associated with the voice, 
            derived from the region_code (e.g., "US", "Standard Chinese").
    """
    
    id: str
    model_id: str
    gender: Optional[Literal["male", "female", "unknown"]]
    region_code: Optional[str]
    language: Optional[str]
    accent: Optional[str]




[docs]
class VoiceList(TypedDict):
    """
    A list of voice details with optional filtering metadata.
    
    This TypedDict represents the structure returned by the get_voices() method,
    containing a list of VoiceDetail objects along with metadata about any filters
    that were applied to generate the list. This follows the standard API pattern
    for list responses.
    
    Attributes:
        object: A string indicating the type of API object, always "list" for lists.
        data: A list containing VoiceDetail objects.
        model_id_filter: The model_id that was used to filter the voices, if any. 
            None if no model ID filter was applied.
        gender_filter: The gender that was used to filter the voices, if any. 
            None if no gender filter was applied.
        region_code_filter: The region_code (e.g., "af", "zm") that was used to filter 
            the voices, if any. None if no region code filter was applied.
    """
    
    object: Literal["list"]
    data: List[VoiceDetail]
    model_id_filter: Optional[str]
    gender_filter: Optional[Literal["male", "female", "unknown"]]
    region_code_filter: Optional[str]