TTS service and example updates
This commit is contained in:
@@ -25,7 +25,7 @@ from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService, OpenAILLMSettings
|
||||
from pipecat.services.xtts.tts import XTTSService, XTTSSettings
|
||||
from pipecat.services.xtts.tts import XTTSService, XTTSTTSSettings
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
@@ -59,7 +59,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
tts = XTTSService(
|
||||
aiohttp_session=session,
|
||||
settings=XTTSSettings(
|
||||
settings=XTTSTTSSettings(
|
||||
voice="Claribel Dervla",
|
||||
),
|
||||
base_url="http://localhost:8000",
|
||||
|
||||
@@ -72,7 +72,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
stt = GoogleSTTService(
|
||||
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
|
||||
settings=GoogleSTTSettings(
|
||||
languages=Language.EN_US,
|
||||
languages=[Language.EN_US],
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -55,7 +55,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
stt = GoogleSTTService(
|
||||
settings=GoogleSTTSettings(
|
||||
languages=Language.EN_US,
|
||||
languages=[Language.EN_US],
|
||||
),
|
||||
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
|
||||
)
|
||||
|
||||
@@ -55,8 +55,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
stt = GoogleSTTService(
|
||||
settings=GoogleSTTSettings(
|
||||
languages=Language.EN_US,
|
||||
model="chirp_3",
|
||||
languages=[Language.EN_US],
|
||||
# Add model to use a specific model
|
||||
# model="chirp_3",
|
||||
),
|
||||
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
|
||||
location="us",
|
||||
|
||||
@@ -58,7 +58,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
tts = FishAudioTTSService(
|
||||
api_key=os.getenv("FISH_API_KEY"),
|
||||
settings=FishAudioTTSSettings(
|
||||
model="4ce7e917cedd4bc2bb2e6ff3a46acaa1", # Barack Obama
|
||||
voice="4ce7e917cedd4bc2bb2e6ff3a46acaa1", # Barack Obama
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ from pipecat.processors.aggregators.llm_response_universal import (
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.openai.llm import OpenAILLMService, OpenAILLMSettings
|
||||
from pipecat.services.sarvam.stt import SarvamSTTService
|
||||
from pipecat.services.sarvam.stt import SarvamSTTService, SarvamSTTSettings
|
||||
from pipecat.services.sarvam.tts import SarvamHttpTTSService, SarvamHttpTTSSettings
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
@@ -59,14 +59,16 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
stt = SarvamSTTService(
|
||||
api_key=os.getenv("SARVAM_API_KEY"),
|
||||
model="saarika:v2.5",
|
||||
settings=SarvamSTTSettings(
|
||||
model="saarika:v2.5",
|
||||
),
|
||||
)
|
||||
|
||||
tts = SarvamHttpTTSService(
|
||||
api_key=os.getenv("SARVAM_API_KEY"),
|
||||
aiohttp_session=session,
|
||||
settings=SarvamHttpTTSSettings(
|
||||
language=Language.EN,
|
||||
language=Language.EN_IN,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
|
||||
import os
|
||||
|
||||
from deepgram import LiveOptions
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
@@ -28,7 +27,7 @@ from pipecat.processors.filters.function_filter import FunctionFilter
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService, CartesiaTTSSettings
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService, DeepgramSTTSettings
|
||||
from pipecat.services.llm_service import FunctionCallParams
|
||||
from pipecat.services.openai.llm import OpenAILLMService, OpenAILLMSettings
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
@@ -102,7 +101,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = DeepgramSTTService(
|
||||
api_key=os.getenv("DEEPGRAM_API_KEY"), live_options=LiveOptions(language="multi")
|
||||
api_key=os.getenv("DEEPGRAM_API_KEY"),
|
||||
settings=DeepgramSTTSettings(
|
||||
language="multi",
|
||||
),
|
||||
)
|
||||
|
||||
tts = SwitchLanguage()
|
||||
|
||||
@@ -146,6 +146,7 @@ TESTS_07 = [
|
||||
("07zg-interruptible-camb.py", EVAL_SIMPLE_MATH),
|
||||
("07zi-interruptible-piper.py", EVAL_SIMPLE_MATH),
|
||||
("07zj-interruptible-kokoro.py", EVAL_SIMPLE_MATH),
|
||||
("07zk-interruptible-resembleai.py", EVAL_SIMPLE_MATH),
|
||||
# Needs a local XTTS docker instance running.
|
||||
# ("07i-interruptible-xtts.py", EVAL_SIMPLE_MATH),
|
||||
]
|
||||
|
||||
@@ -23,7 +23,6 @@ from pipecat.frames.frames import (
|
||||
Frame,
|
||||
StartFrame,
|
||||
TTSAudioRawFrame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
)
|
||||
from pipecat.services.settings import NOT_GIVEN, TTSSettings, _NotGiven, _warn_deprecated_param
|
||||
@@ -705,7 +704,7 @@ class CartesiaHttpTTSService(TTSService):
|
||||
voice_id: Optional[str] = None,
|
||||
model: Optional[str] = None,
|
||||
base_url: str = "https://api.cartesia.ai",
|
||||
cartesia_version: str = "2024-11-13",
|
||||
cartesia_version: str = "2026-03-01",
|
||||
aiohttp_session: Optional[aiohttp.ClientSession] = None,
|
||||
sample_rate: Optional[int] = None,
|
||||
encoding: str = "pcm_s16le",
|
||||
|
||||
@@ -200,18 +200,12 @@ class ElevenLabsRealtimeSTTSettings(STTSettings):
|
||||
vad_threshold: VAD sensitivity (0.1-0.9, lower is more sensitive).
|
||||
min_speech_duration_ms: Minimum speech duration for VAD (50-2000ms).
|
||||
min_silence_duration_ms: Minimum silence duration for VAD (50-2000ms).
|
||||
include_timestamps: Whether to include word-level timestamps in transcripts.
|
||||
enable_logging: Whether to enable logging on ElevenLabs' side.
|
||||
include_language_detection: Whether to include language detection in transcripts.
|
||||
"""
|
||||
|
||||
vad_silence_threshold_secs: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
vad_threshold: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
min_speech_duration_ms: int | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
min_silence_duration_ms: int | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
include_timestamps: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
enable_logging: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
include_language_detection: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
|
||||
|
||||
class ElevenLabsSTTService(SegmentedSTTService):
|
||||
@@ -496,6 +490,9 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
|
||||
commit_strategy: CommitStrategy = CommitStrategy.MANUAL,
|
||||
model: Optional[str] = None,
|
||||
sample_rate: Optional[int] = None,
|
||||
include_timestamps: bool = False,
|
||||
enable_logging: bool = False,
|
||||
include_language_detection: bool = False,
|
||||
params: Optional[InputParams] = None,
|
||||
settings: Optional[ElevenLabsRealtimeSTTSettings] = None,
|
||||
ttfs_p99_latency: Optional[float] = ELEVENLABS_REALTIME_TTFS_P99,
|
||||
@@ -515,6 +512,9 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
|
||||
Use ``settings=ElevenLabsRealtimeSTTSettings(model=...)`` instead.
|
||||
|
||||
sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
|
||||
include_timestamps: Whether to include word-level timestamps in transcripts.
|
||||
enable_logging: Whether to enable logging on ElevenLabs' side.
|
||||
include_language_detection: Whether to include language detection in transcripts.
|
||||
params: Configuration parameters for the STT service.
|
||||
|
||||
.. deprecated:: 0.0.105
|
||||
@@ -534,9 +534,6 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
|
||||
vad_threshold=None,
|
||||
min_speech_duration_ms=None,
|
||||
min_silence_duration_ms=None,
|
||||
include_timestamps=False,
|
||||
enable_logging=False,
|
||||
include_language_detection=False,
|
||||
)
|
||||
|
||||
# 2. Apply direct init arg overrides (deprecated)
|
||||
@@ -555,9 +552,9 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
|
||||
default_settings.vad_threshold = params.vad_threshold
|
||||
default_settings.min_speech_duration_ms = params.min_speech_duration_ms
|
||||
default_settings.min_silence_duration_ms = params.min_silence_duration_ms
|
||||
default_settings.include_timestamps = params.include_timestamps
|
||||
default_settings.enable_logging = params.enable_logging
|
||||
default_settings.include_language_detection = params.include_language_detection
|
||||
include_timestamps = params.include_timestamps
|
||||
enable_logging = params.enable_logging
|
||||
include_language_detection = params.include_language_detection
|
||||
|
||||
# 4. Apply settings delta (canonical API, always wins)
|
||||
if settings is not None:
|
||||
@@ -579,6 +576,9 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
|
||||
|
||||
# Init-only config (not runtime-updatable).
|
||||
self._commit_strategy = commit_strategy
|
||||
self._include_timestamps = include_timestamps
|
||||
self._enable_logging = enable_logging
|
||||
self._include_language_detection = include_language_detection
|
||||
|
||||
self._connected_event = asyncio.Event()
|
||||
self._connected_event.set()
|
||||
@@ -762,17 +762,15 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
|
||||
params.append(f"commit_strategy={self._commit_strategy.value}")
|
||||
|
||||
# Add optional parameters
|
||||
if self._settings.include_timestamps:
|
||||
params.append(
|
||||
f"include_timestamps={str(self._settings.include_timestamps).lower()}"
|
||||
)
|
||||
if self._include_timestamps:
|
||||
params.append(f"include_timestamps={str(self._include_timestamps).lower()}")
|
||||
|
||||
if self._settings.enable_logging:
|
||||
params.append(f"enable_logging={str(self._settings.enable_logging).lower()}")
|
||||
if self._enable_logging:
|
||||
params.append(f"enable_logging={str(self._enable_logging).lower()}")
|
||||
|
||||
if self._settings.include_language_detection:
|
||||
if self._include_language_detection:
|
||||
params.append(
|
||||
f"include_language_detection={str(self._settings.include_language_detection).lower()}"
|
||||
f"include_language_detection={str(self._include_language_detection).lower()}"
|
||||
)
|
||||
|
||||
# Add VAD parameters if using VAD commit strategy and values are specified
|
||||
@@ -920,7 +918,7 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
|
||||
"""
|
||||
# If timestamps are enabled, skip this message and wait for the
|
||||
# committed_transcript_with_timestamps message which contains all the data
|
||||
if self._settings.include_timestamps:
|
||||
if self._include_timestamps:
|
||||
return
|
||||
|
||||
text = data.get("text", "").strip()
|
||||
|
||||
@@ -358,6 +358,9 @@ class ElevenLabsTTSService(WebsocketTTSService):
|
||||
model: Optional[str] = None,
|
||||
url: str = "wss://api.elevenlabs.io",
|
||||
sample_rate: Optional[int] = None,
|
||||
auto_mode: bool = True,
|
||||
enable_ssml_parsing: Optional[bool] = None,
|
||||
enable_logging: Optional[bool] = None,
|
||||
pronunciation_dictionary_locators: Optional[List[PronunciationDictionaryLocator]] = None,
|
||||
params: Optional[InputParams] = None,
|
||||
settings: Optional[ElevenLabsTTSSettings] = None,
|
||||
@@ -381,6 +384,9 @@ class ElevenLabsTTSService(WebsocketTTSService):
|
||||
|
||||
url: WebSocket URL for ElevenLabs TTS API.
|
||||
sample_rate: Audio sample rate. If None, uses default.
|
||||
auto_mode: Whether to enable automatic mode optimization.
|
||||
enable_ssml_parsing: Whether to parse SSML tags in text.
|
||||
enable_logging: Whether to enable ElevenLabs server-side logging.
|
||||
pronunciation_dictionary_locators: List of pronunciation dictionary
|
||||
locators to use.
|
||||
params: Additional input parameters for voice customization.
|
||||
@@ -428,11 +434,6 @@ class ElevenLabsTTSService(WebsocketTTSService):
|
||||
apply_text_normalization=None,
|
||||
)
|
||||
|
||||
# Track init-only URL params through the override chain
|
||||
_auto_mode = True
|
||||
_enable_ssml_parsing = None
|
||||
_enable_logging = None
|
||||
|
||||
# 2. Apply direct init arg overrides (deprecated)
|
||||
if voice_id is not None:
|
||||
_warn_deprecated_param("voice_id", ElevenLabsTTSSettings, "voice")
|
||||
@@ -459,11 +460,11 @@ class ElevenLabsTTSService(WebsocketTTSService):
|
||||
if params.speed is not None:
|
||||
default_settings.speed = params.speed
|
||||
if params.auto_mode is not None:
|
||||
_auto_mode = str(params.auto_mode).lower()
|
||||
auto_mode = params.auto_mode
|
||||
if params.enable_ssml_parsing is not None:
|
||||
_enable_ssml_parsing = params.enable_ssml_parsing
|
||||
enable_ssml_parsing = params.enable_ssml_parsing
|
||||
if params.enable_logging is not None:
|
||||
_enable_logging = params.enable_logging
|
||||
enable_logging = params.enable_logging
|
||||
if params.apply_text_normalization is not None:
|
||||
default_settings.apply_text_normalization = params.apply_text_normalization
|
||||
if _pronunciation_dictionary_locators is None:
|
||||
@@ -488,9 +489,9 @@ class ElevenLabsTTSService(WebsocketTTSService):
|
||||
self._url = url
|
||||
|
||||
# Init-only WebSocket URL params (not runtime-updatable).
|
||||
self._auto_mode = _auto_mode
|
||||
self._enable_ssml_parsing = _enable_ssml_parsing
|
||||
self._enable_logging = _enable_logging
|
||||
self._auto_mode = auto_mode
|
||||
self._enable_ssml_parsing = enable_ssml_parsing
|
||||
self._enable_logging = enable_logging
|
||||
|
||||
self._output_format = "" # initialized in start()
|
||||
self._voice_settings = self._set_voice_settings()
|
||||
@@ -664,7 +665,7 @@ class ElevenLabsTTSService(WebsocketTTSService):
|
||||
voice_id = self._settings.voice
|
||||
model = self._settings.model
|
||||
output_format = self._output_format
|
||||
url = f"{self._url}/v1/text-to-speech/{voice_id}/multi-stream-input?model_id={model}&output_format={output_format}&auto_mode={self._auto_mode}"
|
||||
url = f"{self._url}/v1/text-to-speech/{voice_id}/multi-stream-input?model_id={model}&output_format={output_format}&auto_mode={str(self._auto_mode).lower()}"
|
||||
|
||||
if self._enable_ssml_parsing:
|
||||
url += f"&enable_ssml_parsing={self._enable_ssml_parsing}"
|
||||
|
||||
@@ -10,9 +10,8 @@ This module provides integration with Fish Audio's real-time TTS WebSocket API
|
||||
for streaming text-to-speech synthesis with customizable voice parameters.
|
||||
"""
|
||||
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, AsyncGenerator, ClassVar, Dict, Literal, Mapping, Optional, Self
|
||||
from typing import Any, AsyncGenerator, Literal, Mapping, Optional, Self
|
||||
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
@@ -25,7 +24,6 @@ from pipecat.frames.frames import (
|
||||
InterruptionFrame,
|
||||
StartFrame,
|
||||
TTSAudioRawFrame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
@@ -52,18 +50,20 @@ class FishAudioTTSSettings(TTSSettings):
|
||||
"""Settings for FishAudioTTSService.
|
||||
|
||||
Parameters:
|
||||
latency: Latency mode ("normal" or "balanced"). Defaults to "normal".
|
||||
latency: Latency mode ("normal" or "balanced"). Defaults to "balanced".
|
||||
normalize: Whether to normalize audio output. Defaults to True.
|
||||
temperature: Controls randomness in speech generation (0.0-1.0).
|
||||
top_p: Controls diversity via nucleus sampling (0.0-1.0).
|
||||
prosody_speed: Speech speed multiplier (0.5-2.0). Defaults to 1.0.
|
||||
prosody_volume: Volume adjustment in dB. Defaults to 0.
|
||||
reference_id: Reference ID of the voice model.
|
||||
prosody_volume: Volume adjustment in dB (-20 to 20). Defaults to 0.
|
||||
"""
|
||||
|
||||
latency: str | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
normalize: bool | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
temperature: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
top_p: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
prosody_speed: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
prosody_volume: int | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
reference_id: str | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
|
||||
@classmethod
|
||||
def from_mapping(cls, settings: Mapping[str, Any]) -> Self:
|
||||
@@ -174,18 +174,18 @@ class FishAudioTTSService(InterruptibleTTSService):
|
||||
model="s1",
|
||||
voice=None,
|
||||
language=None,
|
||||
latency="normal",
|
||||
latency="balanced",
|
||||
normalize=True,
|
||||
temperature=None,
|
||||
top_p=None,
|
||||
prosody_speed=1.0,
|
||||
prosody_volume=0,
|
||||
reference_id=None,
|
||||
)
|
||||
|
||||
# 2. Apply direct init arg overrides (deprecated)
|
||||
if reference_id is not None:
|
||||
_warn_deprecated_param("reference_id", FishAudioTTSSettings, "voice")
|
||||
default_settings.voice = reference_id
|
||||
default_settings.reference_id = reference_id
|
||||
if model_id is not None:
|
||||
_warn_deprecated_param("model_id", FishAudioTTSSettings, "model")
|
||||
default_settings.model = model_id
|
||||
@@ -317,8 +317,12 @@ class FishAudioTTSService(InterruptibleTTSService):
|
||||
"speed": self._settings.prosody_speed,
|
||||
"volume": self._settings.prosody_volume,
|
||||
},
|
||||
"reference_id": self._settings.reference_id,
|
||||
"reference_id": self._settings.voice,
|
||||
}
|
||||
if self._settings.temperature is not None:
|
||||
request_settings["temperature"] = self._settings.temperature
|
||||
if self._settings.top_p is not None:
|
||||
request_settings["top_p"] = self._settings.top_p
|
||||
start_message = {"event": "start", "request": {"text": "", **request_settings}}
|
||||
await self._websocket.send(ormsgpack.packb(start_message))
|
||||
logger.debug("Sent start message to Fish Audio")
|
||||
@@ -375,7 +379,14 @@ class FishAudioTTSService(InterruptibleTTSService):
|
||||
frame = TTSAudioRawFrame(audio_data, self.sample_rate, 1)
|
||||
await self.push_frame(frame)
|
||||
await self.stop_ttfb_metrics()
|
||||
continue
|
||||
elif event == "finish":
|
||||
reason = msg.get("reason", "unknown")
|
||||
if reason == "error":
|
||||
await self.push_error(
|
||||
error_msg="Fish Audio server error during synthesis"
|
||||
)
|
||||
else:
|
||||
logger.debug(f"Fish Audio session finished: {reason}")
|
||||
|
||||
except Exception as e:
|
||||
await self.push_error(error_msg=f"Unknown error occurred: {e}", exception=e)
|
||||
|
||||
@@ -73,27 +73,14 @@ class InworldTTSSettings(TTSSettings):
|
||||
Parameters:
|
||||
speaking_rate: Speaking rate for speech synthesis.
|
||||
temperature: Temperature for speech synthesis.
|
||||
auto_mode: Whether to use auto mode. Recommended when texts are sent
|
||||
in full sentences/phrases. When enabled, the server controls
|
||||
flushing of buffered text to achieve minimal latency while
|
||||
maintaining high quality audio output. If None (default),
|
||||
automatically set based on aggregate_sentences.
|
||||
apply_text_normalization: Whether to apply text normalization.
|
||||
timestamp_transport_strategy: Strategy for timestamp transport ("ASYNC" or "SYNC").
|
||||
"""
|
||||
|
||||
speaking_rate: float | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
temperature: float | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
auto_mode: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
apply_text_normalization: str | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
timestamp_transport_strategy: str | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
|
||||
_aliases: ClassVar[Dict[str, str]] = {
|
||||
"voiceId": "voice",
|
||||
"modelId": "model",
|
||||
"applyTextNormalization": "apply_text_normalization",
|
||||
"autoMode": "auto_mode",
|
||||
"timestampTransportStrategy": "timestamp_transport_strategy",
|
||||
}
|
||||
|
||||
@classmethod
|
||||
@@ -141,6 +128,7 @@ class InworldHttpTTSService(TTSService):
|
||||
streaming: bool = True,
|
||||
sample_rate: Optional[int] = None,
|
||||
encoding: str = "LINEAR16",
|
||||
timestamp_transport_strategy: Optional[Literal["ASYNC", "SYNC"]] = "ASYNC",
|
||||
params: Optional[InputParams] = None,
|
||||
settings: Optional[InworldTTSSettings] = None,
|
||||
**kwargs,
|
||||
@@ -163,6 +151,8 @@ class InworldHttpTTSService(TTSService):
|
||||
streaming: Whether to use streaming mode.
|
||||
sample_rate: Audio sample rate in Hz.
|
||||
encoding: Audio encoding format.
|
||||
timestamp_transport_strategy: Strategy for timestamp transport
|
||||
("ASYNC" or "SYNC"). Defaults to "ASYNC".
|
||||
params: Input parameters for Inworld TTS configuration.
|
||||
|
||||
.. deprecated:: 0.0.105
|
||||
@@ -179,9 +169,6 @@ class InworldHttpTTSService(TTSService):
|
||||
language=None,
|
||||
speaking_rate=None,
|
||||
temperature=None,
|
||||
timestamp_transport_strategy="ASYNC",
|
||||
auto_mode=None, # Not applicable for HTTP TTS
|
||||
apply_text_normalization=None, # Not applicable for HTTP TTS
|
||||
)
|
||||
|
||||
# 2. Apply direct init arg overrides (deprecated)
|
||||
@@ -201,9 +188,7 @@ class InworldHttpTTSService(TTSService):
|
||||
if params.temperature is not None:
|
||||
default_settings.temperature = params.temperature
|
||||
if params.timestamp_transport_strategy is not None:
|
||||
default_settings.timestamp_transport_strategy = (
|
||||
params.timestamp_transport_strategy
|
||||
)
|
||||
timestamp_transport_strategy = params.timestamp_transport_strategy
|
||||
|
||||
# 4. Apply settings delta (canonical API, always wins)
|
||||
if settings is not None:
|
||||
@@ -230,9 +215,10 @@ class InworldHttpTTSService(TTSService):
|
||||
|
||||
self._cumulative_time = 0.0
|
||||
|
||||
# Init-only audio format config (not runtime-updatable).
|
||||
# Init-only config (not runtime-updatable).
|
||||
self._audio_encoding = encoding
|
||||
self._audio_sample_rate = 0 # Set in start()
|
||||
self._timestamp_transport_strategy = timestamp_transport_strategy
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if this service can generate processing metrics.
|
||||
@@ -251,22 +237,6 @@ class InworldHttpTTSService(TTSService):
|
||||
await super().start(frame)
|
||||
self._audio_sample_rate = self.sample_rate
|
||||
|
||||
async def stop(self, frame: EndFrame):
|
||||
"""Stop the Inworld TTS service.
|
||||
|
||||
Args:
|
||||
frame: The end frame.
|
||||
"""
|
||||
await super().stop(frame)
|
||||
|
||||
async def cancel(self, frame: CancelFrame):
|
||||
"""Cancel the Inworld TTS service.
|
||||
|
||||
Args:
|
||||
frame: The cancel frame.
|
||||
"""
|
||||
await super().cancel(frame)
|
||||
|
||||
async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
|
||||
"""Push a frame and handle state changes.
|
||||
|
||||
@@ -347,8 +317,8 @@ class InworldHttpTTSService(TTSService):
|
||||
|
||||
# Use WORD timestamps for simplicity and correct spacing/capitalization
|
||||
payload["timestampType"] = self._timestamp_type
|
||||
if self._settings.timestamp_transport_strategy is not None:
|
||||
payload["timestampTransportStrategy"] = self._settings.timestamp_transport_strategy
|
||||
if self._timestamp_transport_strategy is not None:
|
||||
payload["timestampTransportStrategy"] = self._timestamp_transport_strategy
|
||||
|
||||
request_id = str(uuid.uuid4())
|
||||
headers = {
|
||||
@@ -556,6 +526,9 @@ class InworldTTSService(WebsocketTTSService):
|
||||
url: str = "wss://api.inworld.ai/tts/v1/voice:streamBidirectional",
|
||||
sample_rate: Optional[int] = None,
|
||||
encoding: str = "LINEAR16",
|
||||
auto_mode: Optional[bool] = None,
|
||||
apply_text_normalization: Optional[str] = None,
|
||||
timestamp_transport_strategy: Optional[Literal["ASYNC", "SYNC"]] = "ASYNC",
|
||||
params: Optional[InputParams] = None,
|
||||
settings: Optional[InworldTTSSettings] = None,
|
||||
aggregate_sentences: Optional[bool] = None,
|
||||
@@ -580,6 +553,12 @@ class InworldTTSService(WebsocketTTSService):
|
||||
url: URL of the Inworld WebSocket API.
|
||||
sample_rate: Audio sample rate in Hz.
|
||||
encoding: Audio encoding format.
|
||||
auto_mode: Whether to use auto mode. When enabled, the server
|
||||
controls flushing of buffered text. If None (default),
|
||||
automatically set based on ``aggregate_sentences``.
|
||||
apply_text_normalization: Whether to apply text normalization.
|
||||
timestamp_transport_strategy: Strategy for timestamp transport
|
||||
("ASYNC" or "SYNC"). Defaults to "ASYNC".
|
||||
params: Input parameters for Inworld WebSocket TTS configuration.
|
||||
|
||||
.. deprecated:: 0.0.105
|
||||
@@ -596,6 +575,10 @@ class InworldTTSService(WebsocketTTSService):
|
||||
append_trailing_space: Whether to append a trailing space to text before sending to TTS.
|
||||
**kwargs: Additional arguments passed to the parent class.
|
||||
"""
|
||||
# Derive auto_mode from aggregate_sentences if not explicitly set
|
||||
if auto_mode is None:
|
||||
auto_mode = True if aggregate_sentences is None else aggregate_sentences
|
||||
|
||||
# 1. Initialize default_settings with hardcoded defaults
|
||||
default_settings = InworldTTSSettings(
|
||||
model="inworld-tts-1.5-max",
|
||||
@@ -603,9 +586,6 @@ class InworldTTSService(WebsocketTTSService):
|
||||
language=None,
|
||||
speaking_rate=None,
|
||||
temperature=None,
|
||||
apply_text_normalization=None,
|
||||
timestamp_transport_strategy="ASYNC",
|
||||
auto_mode=True if aggregate_sentences is None else aggregate_sentences,
|
||||
)
|
||||
|
||||
# 2. Apply direct init arg overrides (deprecated)
|
||||
@@ -627,13 +607,11 @@ class InworldTTSService(WebsocketTTSService):
|
||||
if params.temperature is not None:
|
||||
default_settings.temperature = params.temperature
|
||||
if params.apply_text_normalization is not None:
|
||||
default_settings.apply_text_normalization = params.apply_text_normalization
|
||||
apply_text_normalization = params.apply_text_normalization
|
||||
if params.timestamp_transport_strategy is not None:
|
||||
default_settings.timestamp_transport_strategy = (
|
||||
params.timestamp_transport_strategy
|
||||
)
|
||||
timestamp_transport_strategy = params.timestamp_transport_strategy
|
||||
if params.auto_mode is not None:
|
||||
default_settings.auto_mode = params.auto_mode
|
||||
auto_mode = params.auto_mode
|
||||
_buffer_max_delay_ms = params.max_buffer_delay_ms
|
||||
_buffer_char_threshold = params.buffer_char_threshold
|
||||
|
||||
@@ -673,9 +651,12 @@ class InworldTTSService(WebsocketTTSService):
|
||||
# Track the end time of the last word in the current generation
|
||||
self._generation_end_time = 0.0
|
||||
|
||||
# Init-only audio format config (not runtime-updatable).
|
||||
# Init-only config (not runtime-updatable).
|
||||
self._audio_encoding = encoding
|
||||
self._audio_sample_rate = 0 # Set in start()
|
||||
self._auto_mode = auto_mode
|
||||
self._apply_text_normalization = apply_text_normalization
|
||||
self._timestamp_transport_strategy = timestamp_transport_strategy
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if this service can generate processing metrics.
|
||||
@@ -1036,14 +1017,12 @@ class InworldTTSService(WebsocketTTSService):
|
||||
|
||||
if self._settings.temperature is not None:
|
||||
create_config["temperature"] = self._settings.temperature
|
||||
if self._settings.apply_text_normalization is not None:
|
||||
create_config["applyTextNormalization"] = self._settings.apply_text_normalization
|
||||
if self._settings.auto_mode is not None:
|
||||
create_config["autoMode"] = self._settings.auto_mode
|
||||
if self._settings.timestamp_transport_strategy is not None:
|
||||
create_config["timestampTransportStrategy"] = (
|
||||
self._settings.timestamp_transport_strategy
|
||||
)
|
||||
if self._apply_text_normalization is not None:
|
||||
create_config["applyTextNormalization"] = self._apply_text_normalization
|
||||
if self._auto_mode is not None:
|
||||
create_config["autoMode"] = self._auto_mode
|
||||
if self._timestamp_transport_strategy is not None:
|
||||
create_config["timestampTransportStrategy"] = self._timestamp_transport_strategy
|
||||
|
||||
# Set buffer settings for timely audio generation.
|
||||
# Use provided values or defaults that work well for streaming LLM output.
|
||||
|
||||
@@ -19,7 +19,6 @@ from pipecat.frames.frames import (
|
||||
Frame,
|
||||
StartFrame,
|
||||
TTSAudioRawFrame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
@@ -48,6 +47,7 @@ def language_to_lmnt_language(language: Language) -> Optional[str]:
|
||||
The corresponding LMNT language code, or None if not supported.
|
||||
"""
|
||||
LANGUAGE_MAP = {
|
||||
Language.AR: "ar",
|
||||
Language.DE: "de",
|
||||
Language.EN: "en",
|
||||
Language.ES: "es",
|
||||
@@ -65,6 +65,7 @@ def language_to_lmnt_language(language: Language) -> Optional[str]:
|
||||
Language.TH: "th",
|
||||
Language.TR: "tr",
|
||||
Language.UK: "uk",
|
||||
Language.UR: "ur",
|
||||
Language.VI: "vi",
|
||||
Language.ZH: "zh",
|
||||
}
|
||||
@@ -96,6 +97,7 @@ class LmntTTSService(InterruptibleTTSService):
|
||||
voice_id: Optional[str] = None,
|
||||
sample_rate: Optional[int] = None,
|
||||
language: Language = Language.EN,
|
||||
output_format: str = "pcm_s16le",
|
||||
model: Optional[str] = None,
|
||||
settings: Optional[LmntTTSSettings] = None,
|
||||
**kwargs,
|
||||
@@ -111,6 +113,8 @@ class LmntTTSService(InterruptibleTTSService):
|
||||
|
||||
sample_rate: Audio sample rate. If None, uses default.
|
||||
language: Language for synthesis. Defaults to English.
|
||||
output_format: Audio output format. One of "pcm_s16le", "pcm_f32le",
|
||||
"mp3", "ulaw", "webm". Defaults to "pcm_s16le".
|
||||
model: TTS model to use.
|
||||
|
||||
.. deprecated:: 0.0.105
|
||||
@@ -122,7 +126,7 @@ class LmntTTSService(InterruptibleTTSService):
|
||||
"""
|
||||
# 1. Initialize default_settings with hardcoded defaults
|
||||
default_settings = LmntTTSSettings(
|
||||
model="blizzard",
|
||||
model="aurora",
|
||||
voice=None,
|
||||
language=self.language_to_service_language(language),
|
||||
)
|
||||
@@ -151,7 +155,7 @@ class LmntTTSService(InterruptibleTTSService):
|
||||
)
|
||||
|
||||
self._api_key = api_key
|
||||
self._output_format = "raw"
|
||||
self._output_format = output_format
|
||||
self._receive_task = None
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
|
||||
@@ -90,7 +90,6 @@ class MiniMaxTTSSettings(TTSSettings):
|
||||
"""Settings for MiniMaxHttpTTSService.
|
||||
|
||||
Parameters:
|
||||
stream: Whether to use streaming mode.
|
||||
speed: Speech speed (range: 0.5 to 2.0).
|
||||
volume: Speech volume (range: 0 to 10).
|
||||
pitch: Pitch adjustment (range: -12 to 12).
|
||||
@@ -101,7 +100,6 @@ class MiniMaxTTSSettings(TTSSettings):
|
||||
language_boost: Language boost string for multilingual support.
|
||||
"""
|
||||
|
||||
stream: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
speed: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
volume: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
pitch: int | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
@@ -189,6 +187,7 @@ class MiniMaxHttpTTSService(TTSService):
|
||||
voice_id: Optional[str] = None,
|
||||
aiohttp_session: aiohttp.ClientSession,
|
||||
sample_rate: Optional[int] = None,
|
||||
stream: bool = True,
|
||||
params: Optional[InputParams] = None,
|
||||
settings: Optional[MiniMaxTTSSettings] = None,
|
||||
**kwargs,
|
||||
@@ -217,6 +216,7 @@ class MiniMaxHttpTTSService(TTSService):
|
||||
|
||||
aiohttp_session: aiohttp.ClientSession for API communication.
|
||||
sample_rate: Output audio sample rate in Hz. If None, uses pipeline default.
|
||||
stream: Whether to use streaming mode. Defaults to True.
|
||||
params: Additional configuration parameters.
|
||||
|
||||
.. deprecated:: 0.0.105
|
||||
@@ -231,7 +231,6 @@ class MiniMaxHttpTTSService(TTSService):
|
||||
model="speech-02-turbo",
|
||||
voice="Calm_Woman",
|
||||
language=None,
|
||||
stream=True,
|
||||
speed=1.0,
|
||||
volume=1.0,
|
||||
pitch=0,
|
||||
@@ -311,6 +310,7 @@ class MiniMaxHttpTTSService(TTSService):
|
||||
|
||||
self._api_key = api_key
|
||||
self._group_id = group_id
|
||||
self._stream = stream
|
||||
self._base_url = f"{base_url}?GroupId={group_id}"
|
||||
self._session = aiohttp_session
|
||||
|
||||
@@ -392,7 +392,7 @@ class MiniMaxHttpTTSService(TTSService):
|
||||
|
||||
# Create payload from settings
|
||||
payload = {
|
||||
"stream": self._settings.stream,
|
||||
"stream": self._stream,
|
||||
"voice_setting": voice_setting,
|
||||
"audio_setting": audio_setting,
|
||||
"model": self._settings.model,
|
||||
|
||||
@@ -26,12 +26,10 @@ from pipecat.frames.frames import (
|
||||
EndFrame,
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
InterruptionFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
StartFrame,
|
||||
TTSAudioRawFrame,
|
||||
TTSSpeakFrame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
@@ -487,7 +485,7 @@ class NeuphonicHttpTTSService(TTSService):
|
||||
default_settings = NeuphonicTTSSettings(
|
||||
model=None,
|
||||
voice=None,
|
||||
language=self.language_to_service_language(Language.EN) or "en",
|
||||
language=self.language_to_service_language(Language.EN),
|
||||
speed=1.0,
|
||||
)
|
||||
|
||||
@@ -501,9 +499,7 @@ class NeuphonicHttpTTSService(TTSService):
|
||||
_warn_deprecated_param("params", NeuphonicTTSSettings)
|
||||
if not settings:
|
||||
if params.language is not None:
|
||||
default_settings.language = (
|
||||
self.language_to_service_language(params.language) or "en"
|
||||
)
|
||||
default_settings.language = self.language_to_service_language(params.language)
|
||||
if params.speed is not None:
|
||||
default_settings.speed = params.speed
|
||||
|
||||
|
||||
@@ -53,11 +53,9 @@ from pipecat.frames.frames import (
|
||||
EndFrame,
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
InterruptionFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
StartFrame,
|
||||
TTSAudioRawFrame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
@@ -230,16 +228,27 @@ def language_to_sarvam_language(language: Language) -> Optional[str]:
|
||||
"""
|
||||
LANGUAGE_MAP = {
|
||||
Language.BN: "bn-IN", # Bengali
|
||||
Language.BN_IN: "bn-IN",
|
||||
Language.EN: "en-IN", # English (India)
|
||||
Language.EN_IN: "en-IN",
|
||||
Language.GU: "gu-IN", # Gujarati
|
||||
Language.GU_IN: "gu-IN",
|
||||
Language.HI: "hi-IN", # Hindi
|
||||
Language.HI_IN: "hi-IN",
|
||||
Language.KN: "kn-IN", # Kannada
|
||||
Language.KN_IN: "kn-IN",
|
||||
Language.ML: "ml-IN", # Malayalam
|
||||
Language.ML_IN: "ml-IN",
|
||||
Language.MR: "mr-IN", # Marathi
|
||||
Language.MR_IN: "mr-IN",
|
||||
Language.OR: "od-IN", # Odia
|
||||
Language.OR_IN: "od-IN",
|
||||
Language.PA: "pa-IN", # Punjabi
|
||||
Language.PA_IN: "pa-IN",
|
||||
Language.TA: "ta-IN", # Tamil
|
||||
Language.TA_IN: "ta-IN",
|
||||
Language.TE: "te-IN", # Telugu
|
||||
Language.TE_IN: "te-IN",
|
||||
}
|
||||
|
||||
return resolve_language(language, LANGUAGE_MAP, use_base_code=False)
|
||||
@@ -481,6 +490,10 @@ class SarvamHttpTTSService(TTSService):
|
||||
if settings is not None:
|
||||
default_settings.apply_update(settings)
|
||||
|
||||
# Convert Language enum to service-specific string
|
||||
if isinstance(default_settings.language, Language):
|
||||
default_settings.language = self.language_to_service_language(default_settings.language)
|
||||
|
||||
# Get model configuration (validates model exists)
|
||||
resolved_model = default_settings.model
|
||||
if resolved_model not in TTS_MODEL_CONFIGS:
|
||||
@@ -900,6 +913,10 @@ class SarvamTTSService(InterruptibleTTSService):
|
||||
if settings is not None:
|
||||
default_settings.apply_update(settings)
|
||||
|
||||
# Convert Language enum to service-specific string
|
||||
if isinstance(default_settings.language, Language):
|
||||
default_settings.language = self.language_to_service_language(default_settings.language)
|
||||
|
||||
# Get model configuration (validates model exists)
|
||||
resolved_model = default_settings.model
|
||||
if resolved_model not in TTS_MODEL_CONFIGS:
|
||||
|
||||
@@ -70,7 +70,7 @@ def language_to_xtts_language(language: Language) -> Optional[str]:
|
||||
|
||||
@dataclass
|
||||
class XTTSTTSSettings(TTSSettings):
|
||||
"""Settings for XTTSTTSService."""
|
||||
"""Settings for XTTSService."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
Reference in New Issue
Block a user