Merge pull request #3941 from pipecat-ai/mb/stt-settings-updates
STT services: settings and examples fixes
This commit is contained in:
@@ -233,14 +233,14 @@ def can_generate_metrics(self) -> bool:
|
||||
|
||||
### Service Settings
|
||||
|
||||
Every STT, LLM, TTS, and image-generation service exposes a **Settings dataclass** that serves two roles:
|
||||
Every AI service (STT, LLM, TTS, image generation, etc.) exposes a **Settings dataclass** that serves two roles:
|
||||
|
||||
1. **Store mode** — the service's `self._settings` holds the current value of every runtime-updatable field.
|
||||
2. **Delta mode** — an update frame carries only the fields that changed; unset fields remain `NOT_GIVEN`.
|
||||
2. **Delta mode** — an update frame (e.g. `TTSUpdateSettingsFrame`) specifies only the fields that should change; unspecified fields remain `NOT_GIVEN`.
|
||||
|
||||
#### Defining your Settings class
|
||||
|
||||
Extend `STTSettings`, `TTSSettings`, `LLMSettings`, or `ImageGenSettings`. The base classes already provide common fields (e.g. `model`, `voice`, `language`). You only need to add **service-specific knobs that should be runtime-updatable**:
|
||||
Extend `STTSettings`, `TTSSettings`, `LLMSettings`, or `ImageGenSettings` (or, if your service directly subclasses `AIService`, `ServiceSettings`). The base classes already provide common fields (e.g. `model`, `voice`, `language`). You only need to add **service-specific knobs that should be runtime-updatable**:
|
||||
|
||||
```python
|
||||
from dataclasses import dataclass, field
|
||||
@@ -320,7 +320,7 @@ svc = MyTTSService(
|
||||
|
||||
#### Reacting to runtime changes
|
||||
|
||||
STT, LLM, and TTS services support runtime configuration changes via `*UpdateSettingsFrame`s (e.g. `STTUpdateSettingsFrame`, `TTSUpdateSettingsFrame`, `LLMUpdateSettingsFrame`).
|
||||
AI services support runtime configuration changes via `*UpdateSettingsFrame`s (e.g. `STTUpdateSettingsFrame`, `TTSUpdateSettingsFrame`, `LLMUpdateSettingsFrame`).
|
||||
|
||||
To react to runtime setting changes, override `_update_settings`. The base implementation applies the delta to `self._settings` and returns a `dict` mapping each changed field name to its **pre-update** value. Your override should call `super()` first, then act on the changed fields. A common implementation might look like:
|
||||
|
||||
|
||||
@@ -65,8 +65,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
llm = AzureLLMService(
|
||||
api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
|
||||
endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
|
||||
model=os.getenv("AZURE_CHATGPT_MODEL"),
|
||||
settings=AzureLLMSettings(
|
||||
model=os.getenv("AZURE_CHATGPT_MODEL"),
|
||||
system_instruction="You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
|
||||
),
|
||||
)
|
||||
|
||||
@@ -65,8 +65,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
llm = AzureLLMService(
|
||||
api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
|
||||
endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
|
||||
model=os.getenv("AZURE_CHATGPT_MODEL"),
|
||||
settings=AzureLLMSettings(
|
||||
model=os.getenv("AZURE_CHATGPT_MODEL"),
|
||||
system_instruction="You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
|
||||
),
|
||||
)
|
||||
|
||||
@@ -63,9 +63,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
llm = AWSBedrockLLMService(
|
||||
aws_region="us-west-2",
|
||||
model="us.anthropic.claude-haiku-4-5-20251001-v1:0",
|
||||
params=AWSBedrockLLMService.InputParams(temperature=0.8),
|
||||
settings=AWSBedrockLLMSettings(
|
||||
model="us.anthropic.claude-haiku-4-5-20251001-v1:0",
|
||||
temperature=0.8,
|
||||
system_instruction="You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
|
||||
),
|
||||
)
|
||||
|
||||
@@ -55,8 +55,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
stt = GoogleSTTService(
|
||||
settings=GoogleSTTSettings(
|
||||
languages=Language.EN_US,
|
||||
model="chirp_3",
|
||||
languages=[Language.EN_US],
|
||||
# Uncomment to select a specific recognition model
|
||||
# model="chirp_3",
|
||||
),
|
||||
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
|
||||
location="us",
|
||||
|
||||
@@ -94,7 +94,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
api_key=os.getenv("ASSEMBLYAI_API_KEY"),
|
||||
vad_force_turn_endpoint=False, # Use AssemblyAI's built-in turn detection
|
||||
settings=AssemblyAISTTSettings(
|
||||
speech_model="u3-rt-pro",
|
||||
model="u3-rt-pro",
|
||||
# Optional: Tune turn detection timing (defaults shown below)
|
||||
# min_turn_silence=100, # Default
|
||||
# max_turn_silence=1000, # Default
|
||||
|
||||
@@ -51,13 +51,13 @@ transport_params = {
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
stt = (
|
||||
SonioxSTTService(
|
||||
api_key=os.getenv("SONIOX_API_KEY"),
|
||||
settings=SonioxSTTSettings(
|
||||
language_hints=[Language.EN],
|
||||
language_hints_strict=True,
|
||||
),
|
||||
stt = SonioxSTTService(
|
||||
api_key=os.getenv("SONIOX_API_KEY"),
|
||||
settings=SonioxSTTSettings(
|
||||
# Add language hints to use a specific language
|
||||
# Add strict mode to enforce the language hints
|
||||
language_hints=[Language.EN],
|
||||
language_hints_strict=True,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -99,7 +99,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
llm = AWSBedrockLLMService(
|
||||
aws_region="us-west-2",
|
||||
settings=AWSBedrockLLMSettings(
|
||||
model="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
|
||||
model="us.anthropic.claude-sonnet-4-6",
|
||||
# Note: usually, prefer providing latency="optimized" param.
|
||||
# Here we can't because AWS Bedrock doesn't support it for the model above,
|
||||
# which we need for image input.
|
||||
@@ -170,7 +170,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
context.add_message(
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"Please introduce yourself to the user. Use '{client_id}' as the user ID during function calls.",
|
||||
"content": f"Please introduce yourself to the user briefly; don't mention the camera. Use '{client_id}' as the user ID during function calls.",
|
||||
}
|
||||
)
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@@ -99,7 +99,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(transport, client):
|
||||
logger.info(f"Client connected")
|
||||
context.add_message({"user": "system", "content": "Please introduce yourself to the user."})
|
||||
context.add_message({"role": "user", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
await asyncio.sleep(10)
|
||||
|
||||
@@ -255,7 +255,7 @@ class LLMContext:
|
||||
this method, which is part of the public API of OpenAILLMContext but
|
||||
doesn't need to be for LLMContext.
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.92
|
||||
Use `get_messages()` instead.
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -27,7 +27,7 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
class UserIdleProcessor(FrameProcessor):
|
||||
"""Monitors user inactivity and triggers callbacks after timeout periods.
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.100
|
||||
UserIdleProcessor is deprecated in 0.0.100 and will be removed in a future version.
|
||||
Use LLMUserAggregator with user_idle_timeout parameter instead.
|
||||
|
||||
|
||||
@@ -170,7 +170,7 @@ class AnthropicLLMService(LLMService):
|
||||
class InputParams(BaseModel):
|
||||
"""Input parameters for Anthropic model inference.
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``AnthropicLLMSettings`` instead. Pass settings directly via the
|
||||
``settings`` parameter of :class:`AnthropicLLMService`.
|
||||
|
||||
@@ -231,12 +231,12 @@ class AnthropicLLMService(LLMService):
|
||||
api_key: Anthropic API key for authentication.
|
||||
model: Model name to use.
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=AnthropicLLMSettings(model=...)`` instead.
|
||||
|
||||
params: Optional model parameters for inference.
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=AnthropicLLMSettings(...)`` instead.
|
||||
|
||||
settings: Runtime-updatable settings for this service. When both
|
||||
|
||||
@@ -81,7 +81,7 @@ def map_language_from_assemblyai(language_code: str) -> Language:
|
||||
|
||||
@dataclass
|
||||
class AssemblyAISTTSettings(STTSettings):
|
||||
"""Settings for the AssemblyAI STT service.
|
||||
"""Settings for AssemblyAISTTService.
|
||||
|
||||
Parameters:
|
||||
formatted_finals: Whether to enable transcript formatting.
|
||||
@@ -99,6 +99,8 @@ class AssemblyAISTTSettings(STTSettings):
|
||||
language_detection: Enable automatic language detection.
|
||||
format_turns: Whether to format transcript turns.
|
||||
speaker_labels: Enable speaker diarization.
|
||||
vad_threshold: VAD confidence threshold (0.0–1.0) for classifying
|
||||
audio frames as silence. Only applicable to u3-rt-pro.
|
||||
"""
|
||||
|
||||
formatted_finals: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
@@ -115,6 +117,7 @@ class AssemblyAISTTSettings(STTSettings):
|
||||
language_detection: bool | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
format_turns: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
speaker_labels: bool | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
vad_threshold: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
|
||||
|
||||
class AssemblyAISTTService(WebsocketSTTService):
|
||||
@@ -199,6 +202,7 @@ class AssemblyAISTTService(WebsocketSTTService):
|
||||
language_detection=None,
|
||||
format_turns=True,
|
||||
speaker_labels=None,
|
||||
vad_threshold=None,
|
||||
)
|
||||
|
||||
# 2. Apply direct init arg overrides (deprecated)
|
||||
@@ -227,6 +231,7 @@ class AssemblyAISTTService(WebsocketSTTService):
|
||||
default_settings.language_detection = connection_params.language_detection
|
||||
default_settings.format_turns = connection_params.format_turns
|
||||
default_settings.speaker_labels = connection_params.speaker_labels
|
||||
default_settings.vad_threshold = connection_params.vad_threshold
|
||||
|
||||
# 4. Apply settings delta (canonical API, always wins)
|
||||
if settings is not None:
|
||||
@@ -463,6 +468,7 @@ class AssemblyAISTTService(WebsocketSTTService):
|
||||
"language_detection": s.language_detection,
|
||||
"format_turns": s.format_turns,
|
||||
"speaker_labels": s.speaker_labels,
|
||||
"vad_threshold": s.vad_threshold,
|
||||
}
|
||||
|
||||
for k, v in optional_fields.items():
|
||||
@@ -651,7 +657,7 @@ class AssemblyAISTTService(WebsocketSTTService):
|
||||
await self.start_processing_metrics()
|
||||
await self.broadcast_frame(UserStartedSpeakingFrame)
|
||||
if self._should_interrupt:
|
||||
await self.push_interruption_task_frame_and_wait()
|
||||
await self.broadcast_interruption()
|
||||
self._user_speaking = True
|
||||
|
||||
async def _handle_termination(self, message: TerminationMessage):
|
||||
|
||||
@@ -754,7 +754,7 @@ class AWSBedrockLLMService(LLMService):
|
||||
class InputParams(BaseModel):
|
||||
"""Input parameters for AWS Bedrock LLM service.
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``AWSBedrockLLMSettings`` instead. Pass settings directly via the
|
||||
``settings`` parameter of :class:`AWSBedrockLLMService`.
|
||||
|
||||
@@ -795,7 +795,7 @@ class AWSBedrockLLMService(LLMService):
|
||||
Args:
|
||||
model: The AWS Bedrock model identifier to use.
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=AWSBedrockLLMSettings(model=...)`` instead.
|
||||
|
||||
aws_access_key: AWS access key ID. If None, uses default credentials.
|
||||
@@ -804,7 +804,7 @@ class AWSBedrockLLMService(LLMService):
|
||||
aws_region: AWS region for the Bedrock service.
|
||||
params: Model parameters and configuration.
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=AWSBedrockLLMSettings(...)`` instead.
|
||||
|
||||
settings: Runtime-updatable settings for this service. When both
|
||||
|
||||
@@ -280,7 +280,7 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
- Nova Sonic (the older model): "us-east-1", "ap-northeast-1"
|
||||
model: Model identifier. Defaults to "amazon.nova-2-sonic-v1:0".
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=AWSNovaSonicLLMSettings(model=...)`` instead.
|
||||
|
||||
voice_id: Voice ID for speech synthesis.
|
||||
@@ -289,7 +289,7 @@ class AWSNovaSonicLLMService(LLMService):
|
||||
- Nova 2 Sonic (the default model): see https://docs.aws.amazon.com/nova/latest/nova2-userguide/sonic-language-support.html
|
||||
- Nova Sonic (the older model): see https://docs.aws.amazon.com/nova/latest/userguide/available-voices.html.
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=AWSNovaSonicLLMSettings(voice=...)`` instead.
|
||||
|
||||
params: Model parameters for audio configuration and inference.
|
||||
|
||||
@@ -47,7 +47,7 @@ except ModuleNotFoundError as e:
|
||||
|
||||
@dataclass
|
||||
class AWSTranscribeSTTSettings(STTSettings):
|
||||
"""Settings for the AWS Transcribe STT service."""
|
||||
"""Settings for AWSTranscribeSTTService."""
|
||||
|
||||
pass
|
||||
|
||||
@@ -99,13 +99,13 @@ class AWSTranscribeSTTService(WebsocketSTTService):
|
||||
# 1. Initialize default_settings with hardcoded defaults
|
||||
default_settings = AWSTranscribeSTTSettings(
|
||||
model=None,
|
||||
language=self.language_to_service_language(Language.EN) or "en-US",
|
||||
language=self.language_to_service_language(Language.EN),
|
||||
)
|
||||
|
||||
# 2. Apply direct init arg overrides (deprecated)
|
||||
if language is not None:
|
||||
_warn_deprecated_param("language", AWSTranscribeSTTSettings, "language")
|
||||
default_settings.language = self.language_to_service_language(language) or "en-US"
|
||||
default_settings.language = self.language_to_service_language(language)
|
||||
|
||||
# 3. No params to apply
|
||||
|
||||
|
||||
@@ -53,7 +53,7 @@ except ModuleNotFoundError as e:
|
||||
|
||||
@dataclass
|
||||
class AzureSTTSettings(STTSettings):
|
||||
"""Settings for the Azure STT service."""
|
||||
"""Settings for AzureSTTService."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
@@ -46,7 +46,7 @@ except ModuleNotFoundError as e:
|
||||
|
||||
@dataclass
|
||||
class CartesiaSTTSettings(STTSettings):
|
||||
"""Settings for the Cartesia STT service."""
|
||||
"""Settings for CartesiaSTTService."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
@@ -71,7 +71,7 @@ class FluxEventType(str, Enum):
|
||||
|
||||
@dataclass
|
||||
class DeepgramFluxSTTSettings(STTSettings):
|
||||
"""Settings for the Deepgram Flux STT service.
|
||||
"""Settings for DeepgramFluxSTTService.
|
||||
|
||||
Parameters:
|
||||
eager_eot_threshold: EagerEndOfTurn/TurnResumed threshold. Off by default.
|
||||
@@ -81,7 +81,6 @@ class DeepgramFluxSTTSettings(STTSettings):
|
||||
eot_timeout_ms: Time in ms after speech to finish a turn regardless of EOT
|
||||
confidence (default 5000).
|
||||
keyterm: Keyterms to boost recognition accuracy for specialized terminology.
|
||||
tag: Tags to label requests for identification during usage reporting.
|
||||
min_confidence: Minimum confidence required to create a TranscriptionFrame.
|
||||
"""
|
||||
|
||||
@@ -89,7 +88,6 @@ class DeepgramFluxSTTSettings(STTSettings):
|
||||
eot_threshold: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
eot_timeout_ms: int | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
keyterm: list | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
tag: list | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
min_confidence: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
|
||||
|
||||
@@ -157,6 +155,7 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
mip_opt_out: Optional[bool] = None,
|
||||
model: Optional[str] = None,
|
||||
flux_encoding: str = "linear16",
|
||||
tag: Optional[list] = None,
|
||||
params: Optional[InputParams] = None,
|
||||
should_interrupt: bool = True,
|
||||
settings: Optional[DeepgramFluxSTTSettings] = None,
|
||||
@@ -177,6 +176,7 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
|
||||
flux_encoding: Audio encoding format required by Flux API. Must be "linear16".
|
||||
Raw signed little-endian 16-bit PCM encoding.
|
||||
tag: Tags to label requests for identification during usage reporting.
|
||||
params: InputParams instance containing detailed API configuration options.
|
||||
|
||||
.. deprecated:: 0.0.105
|
||||
@@ -224,7 +224,6 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
eot_threshold=None,
|
||||
eot_timeout_ms=None,
|
||||
keyterm=[],
|
||||
tag=[],
|
||||
min_confidence=None,
|
||||
)
|
||||
|
||||
@@ -241,7 +240,8 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
default_settings.eot_threshold = params.eot_threshold
|
||||
default_settings.eot_timeout_ms = params.eot_timeout_ms
|
||||
default_settings.keyterm = params.keyterm or []
|
||||
default_settings.tag = params.tag or []
|
||||
if params.tag and tag is None:
|
||||
tag = params.tag
|
||||
default_settings.min_confidence = params.min_confidence
|
||||
if params.mip_opt_out is not None:
|
||||
mip_opt_out = params.mip_opt_out
|
||||
@@ -261,6 +261,7 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
self._should_interrupt = should_interrupt
|
||||
self._encoding = flux_encoding
|
||||
self._mip_opt_out = mip_opt_out
|
||||
self._tag = tag or []
|
||||
self._websocket_url = None
|
||||
self._receive_task = None
|
||||
|
||||
@@ -469,7 +470,7 @@ class DeepgramFluxSTTService(WebsocketSTTService):
|
||||
url_params.append(urlencode({"keyterm": keyterm}))
|
||||
|
||||
# Add tag parameters (can have multiple)
|
||||
for tag_value in self._settings.tag:
|
||||
for tag_value in self._tag:
|
||||
url_params.append(urlencode({"tag": tag_value}))
|
||||
|
||||
self._websocket_url = f"{self._url}?{'&'.join(url_params)}"
|
||||
|
||||
@@ -177,7 +177,7 @@ class LiveOptions:
|
||||
|
||||
@dataclass
|
||||
class DeepgramSTTSettings(STTSettings):
|
||||
"""Settings for Deepgram STT services.
|
||||
"""Settings for DeepgramSTTService.
|
||||
|
||||
``model`` and ``language`` are inherited from ``STTSettings`` /
|
||||
``ServiceSettings``. Additional Deepgram connection params may
|
||||
|
||||
@@ -179,19 +179,19 @@ class CommitStrategy(str, Enum):
|
||||
|
||||
@dataclass
|
||||
class ElevenLabsSTTSettings(STTSettings):
|
||||
"""Settings for the ElevenLabs file-based STT service.
|
||||
"""Settings for ElevenLabsSTTService.
|
||||
|
||||
Parameters:
|
||||
tag_audio_events: Whether to include audio events like (laughter),
|
||||
(coughing) in the transcription.
|
||||
"""
|
||||
|
||||
tag_audio_events: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
tag_audio_events: bool | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ElevenLabsRealtimeSTTSettings(STTSettings):
|
||||
"""Settings for the ElevenLabs Realtime STT service.
|
||||
"""Settings for ElevenLabsRealtimeSTTService.
|
||||
|
||||
See ``ElevenLabsRealtimeSTTService.InputParams`` for detailed descriptions.
|
||||
|
||||
@@ -277,8 +277,8 @@ class ElevenLabsSTTService(SegmentedSTTService):
|
||||
# 1. Initialize default_settings with hardcoded defaults
|
||||
default_settings = ElevenLabsSTTSettings(
|
||||
model="scribe_v2",
|
||||
language="eng",
|
||||
tag_audio_events=True,
|
||||
language=language_to_elevenlabs_language(Language.EN),
|
||||
tag_audio_events=None,
|
||||
)
|
||||
|
||||
# 2. Apply direct init arg overrides (deprecated)
|
||||
@@ -291,9 +291,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
|
||||
_warn_deprecated_param("params", ElevenLabsSTTSettings)
|
||||
if not settings:
|
||||
if params.language is not None:
|
||||
default_settings.language = (
|
||||
self.language_to_service_language(params.language) or "eng"
|
||||
)
|
||||
default_settings.language = language_to_elevenlabs_language(params.language)
|
||||
default_settings.tag_audio_events = params.tag_audio_events
|
||||
|
||||
# 4. Apply settings delta (canonical API, always wins)
|
||||
@@ -354,10 +352,11 @@ class ElevenLabsSTTService(SegmentedSTTService):
|
||||
content_type="audio/x-wav",
|
||||
)
|
||||
|
||||
# Add required model_id, language_code, and tag_audio_events
|
||||
# Add required model_id and language_code
|
||||
data.add_field("model_id", self._settings.model)
|
||||
data.add_field("language_code", self._settings.language)
|
||||
data.add_field("tag_audio_events", str(self._settings.tag_audio_events).lower())
|
||||
if self._settings.tag_audio_events is not None:
|
||||
data.add_field("tag_audio_events", str(self._settings.tag_audio_events).lower())
|
||||
|
||||
async with self._session.post(url, data=data, headers=headers) as response:
|
||||
if response.status != 200:
|
||||
|
||||
@@ -143,7 +143,7 @@ def language_to_fal_language(language: Language) -> Optional[str]:
|
||||
|
||||
@dataclass
|
||||
class FalSTTSettings(STTSettings):
|
||||
"""Settings for the Fal Wizper STT service."""
|
||||
"""Settings for FalSTTService."""
|
||||
|
||||
pass
|
||||
|
||||
@@ -215,7 +215,7 @@ class FalSTTService(SegmentedSTTService):
|
||||
# 1. Initialize default_settings with hardcoded defaults
|
||||
default_settings = FalSTTSettings(
|
||||
model=None,
|
||||
language=language_to_fal_language(Language.EN) or "en",
|
||||
language=language_to_fal_language(Language.EN),
|
||||
)
|
||||
|
||||
# 2. (no deprecated direct args for this service)
|
||||
@@ -224,9 +224,8 @@ class FalSTTService(SegmentedSTTService):
|
||||
if params is not None:
|
||||
_warn_deprecated_param("params", FalSTTSettings)
|
||||
if not settings:
|
||||
default_settings.language = (
|
||||
language_to_fal_language(params.language) if params.language else "en"
|
||||
)
|
||||
if params.language is not None:
|
||||
default_settings.language = language_to_fal_language(params.language)
|
||||
if params.task != "transcribe":
|
||||
task = params.task
|
||||
if params.chunk_level != "segment":
|
||||
|
||||
@@ -188,7 +188,7 @@ class _InputParamsDescriptor:
|
||||
|
||||
@dataclass
|
||||
class GladiaSTTSettings(STTSettings):
|
||||
"""Settings for Gladia STT service.
|
||||
"""Settings for GladiaSTTService.
|
||||
|
||||
Parameters:
|
||||
language_config: Language detection and handling configuration.
|
||||
|
||||
@@ -552,7 +552,7 @@ class ContextWindowCompressionParams(BaseModel):
|
||||
class InputParams(BaseModel):
|
||||
"""Input parameters for Gemini Live generation.
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``GeminiLiveLLMSettings`` instead.
|
||||
|
||||
Parameters:
|
||||
@@ -678,7 +678,7 @@ class GeminiLiveLLMService(LLMService):
|
||||
|
||||
model: Model identifier to use.
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=GeminiLiveLLMSettings(model=...)`` instead.
|
||||
|
||||
voice_id: TTS voice identifier. Defaults to "Charon".
|
||||
@@ -691,7 +691,7 @@ class GeminiLiveLLMService(LLMService):
|
||||
tools: Tools/functions available to the model. Defaults to None.
|
||||
params: Configuration parameters for the model.
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=GeminiLiveLLMSettings(...)`` instead.
|
||||
|
||||
settings: Gemini Live LLM settings. If provided together with deprecated
|
||||
|
||||
@@ -88,7 +88,7 @@ class GeminiLiveVertexLLMService(GeminiLiveLLMService):
|
||||
project_id: Google Cloud project ID.
|
||||
model: Model identifier to use.
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=GeminiLiveLLMSettings(model=...)`` instead.
|
||||
|
||||
voice_id: TTS voice identifier. Defaults to "Charon".
|
||||
@@ -102,7 +102,7 @@ class GeminiLiveVertexLLMService(GeminiLiveLLMService):
|
||||
params: Configuration parameters for the model along with Vertex AI
|
||||
location and project ID.
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=GeminiLiveLLMSettings(...)`` instead.
|
||||
|
||||
settings: Gemini Live LLM settings. If provided together with deprecated
|
||||
|
||||
@@ -754,7 +754,7 @@ class GoogleLLMService(LLMService):
|
||||
class InputParams(BaseModel):
|
||||
"""Input parameters for Google AI models.
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=GoogleLLMSettings(...)`` instead.
|
||||
|
||||
Parameters:
|
||||
@@ -797,12 +797,12 @@ class GoogleLLMService(LLMService):
|
||||
api_key: Google AI API key for authentication.
|
||||
model: Model name to use.
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=GoogleLLMSettings(model=...)`` instead.
|
||||
|
||||
params: Optional model parameters for inference.
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=GoogleLLMSettings(...)`` instead.
|
||||
|
||||
settings: Runtime-updatable settings for this service. When both
|
||||
|
||||
@@ -128,14 +128,14 @@ class GoogleVertexLLMService(GoogleLLMService):
|
||||
credentials_path: Path to the service account JSON file.
|
||||
model: Model identifier (e.g., "gemini-2.5-flash").
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=GoogleLLMSettings(model=...)`` instead.
|
||||
|
||||
location: GCP region for Vertex AI endpoint (e.g., "us-east4").
|
||||
project_id: Google Cloud project ID.
|
||||
params: Input parameters for the model.
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=GoogleLLMSettings(...)`` instead.
|
||||
|
||||
settings: Runtime-updatable settings for this service. When both
|
||||
|
||||
@@ -360,7 +360,7 @@ def language_to_google_stt_language(language: Language) -> Optional[str]:
|
||||
|
||||
@dataclass
|
||||
class GoogleSTTSettings(STTSettings):
|
||||
"""Settings for Google Cloud Speech-to-Text V2.
|
||||
"""Settings for GoogleSTTService.
|
||||
|
||||
Parameters:
|
||||
languages: List of ``Language`` enums for recognition
|
||||
@@ -653,7 +653,7 @@ class GoogleSTTService(STTService):
|
||||
async def set_languages(self, languages: List[Language]):
|
||||
"""Update the service's recognition languages.
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.104
|
||||
Use ``STTUpdateSettingsFrame`` with ``GoogleSTTSettings(languages=...)``
|
||||
instead.
|
||||
|
||||
|
||||
@@ -482,7 +482,7 @@ def language_to_gemini_tts_language(language: Language) -> Optional[str]:
|
||||
|
||||
@dataclass
|
||||
class GoogleHttpTTSSettings(TTSSettings):
|
||||
"""Settings for Google HTTP TTS service.
|
||||
"""Settings for GoogleHttpTTSService.
|
||||
|
||||
Parameters:
|
||||
pitch: Voice pitch adjustment (e.g., "+2st", "-50%").
|
||||
@@ -512,8 +512,8 @@ class GoogleHttpTTSSettings(TTSSettings):
|
||||
|
||||
|
||||
@dataclass
|
||||
class GoogleStreamTTSSettings(TTSSettings):
|
||||
"""Settings for Google streaming TTS service.
|
||||
class GoogleTTSSettings(TTSSettings):
|
||||
"""Settings for GoogleTTSService.
|
||||
|
||||
Parameters:
|
||||
speaking_rate: The speaking rate, in the range [0.25, 2.0].
|
||||
@@ -522,9 +522,14 @@ class GoogleStreamTTSSettings(TTSSettings):
|
||||
speaking_rate: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
|
||||
|
||||
#: .. deprecated:: 0.0.105
|
||||
#: Use ``GoogleTTSSettings`` instead.
|
||||
GoogleStreamTTSSettings = GoogleTTSSettings
|
||||
|
||||
|
||||
@dataclass
|
||||
class GeminiTTSSettings(TTSSettings):
|
||||
"""Settings for Gemini TTS service.
|
||||
"""Settings for GeminiTTSService.
|
||||
|
||||
Parameters:
|
||||
prompt: Optional style instructions for how to synthesize the content.
|
||||
@@ -619,6 +624,13 @@ class GoogleHttpTTSService(TTSService):
|
||||
model=None,
|
||||
voice="en-US-Chirp3-HD-Charon",
|
||||
language="en-US",
|
||||
pitch=None,
|
||||
rate=None,
|
||||
speaking_rate=None,
|
||||
volume=None,
|
||||
emphasis=None,
|
||||
gender=None,
|
||||
google_style=None,
|
||||
)
|
||||
|
||||
# 2. Apply direct init arg overrides (deprecated)
|
||||
@@ -1008,13 +1020,13 @@ class GoogleTTSService(GoogleBaseTTSService):
|
||||
)
|
||||
"""
|
||||
|
||||
_settings: GoogleStreamTTSSettings
|
||||
_settings: GoogleTTSSettings
|
||||
|
||||
class InputParams(BaseModel):
|
||||
"""Input parameters for Google streaming TTS configuration.
|
||||
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``GoogleStreamTTSSettings`` directly via the ``settings`` parameter instead.
|
||||
Use ``GoogleTTSSettings`` directly via the ``settings`` parameter instead.
|
||||
|
||||
Parameters:
|
||||
language: Language for synthesis. Defaults to English.
|
||||
@@ -1034,7 +1046,7 @@ class GoogleTTSService(GoogleBaseTTSService):
|
||||
voice_cloning_key: Optional[str] = None,
|
||||
sample_rate: Optional[int] = None,
|
||||
params: Optional[InputParams] = None,
|
||||
settings: Optional[GoogleStreamTTSSettings] = None,
|
||||
settings: Optional[GoogleTTSSettings] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initializes the Google streaming TTS service.
|
||||
@@ -1046,34 +1058,35 @@ class GoogleTTSService(GoogleBaseTTSService):
|
||||
voice_id: Google TTS voice identifier (e.g., "en-US-Chirp3-HD-Charon").
|
||||
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=GoogleStreamTTSSettings(voice=...)`` instead.
|
||||
Use ``settings=GoogleTTSSettings(voice=...)`` instead.
|
||||
|
||||
voice_cloning_key: The voice cloning key for Chirp 3 custom voices.
|
||||
sample_rate: Audio sample rate in Hz. If None, uses default.
|
||||
params: Language configuration parameters.
|
||||
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=GoogleStreamTTSSettings(...)`` instead.
|
||||
Use ``settings=GoogleTTSSettings(...)`` instead.
|
||||
|
||||
settings: Runtime-updatable settings. When provided alongside deprecated
|
||||
parameters, ``settings`` values take precedence.
|
||||
**kwargs: Additional arguments passed to parent TTSService.
|
||||
"""
|
||||
# 1. Initialize default_settings with hardcoded defaults
|
||||
default_settings = GoogleStreamTTSSettings(
|
||||
default_settings = GoogleTTSSettings(
|
||||
model=None,
|
||||
voice="en-US-Chirp3-HD-Charon",
|
||||
language="en-US",
|
||||
speaking_rate=None,
|
||||
)
|
||||
|
||||
# 2. Apply direct init arg overrides (deprecated)
|
||||
if voice_id is not None:
|
||||
_warn_deprecated_param("voice_id", GoogleStreamTTSSettings, "voice")
|
||||
_warn_deprecated_param("voice_id", GoogleTTSSettings, "voice")
|
||||
default_settings.voice = voice_id
|
||||
|
||||
# 3. Apply params overrides — only if settings not provided
|
||||
if params is not None:
|
||||
_warn_deprecated_param("params", GoogleStreamTTSSettings)
|
||||
_warn_deprecated_param("params", GoogleTTSSettings)
|
||||
if not settings:
|
||||
if params.language is not None:
|
||||
default_settings.language = self.language_to_service_language(params.language)
|
||||
@@ -1104,7 +1117,7 @@ class GoogleTTSService(GoogleBaseTTSService):
|
||||
Args:
|
||||
delta: Settings delta. Can include 'speaking_rate' (float).
|
||||
"""
|
||||
if isinstance(delta, GoogleStreamTTSSettings) and is_given(delta.speaking_rate):
|
||||
if isinstance(delta, GoogleTTSSettings) and is_given(delta.speaking_rate):
|
||||
rate_value = float(delta.speaking_rate)
|
||||
if not (0.25 <= rate_value <= 2.0):
|
||||
logger.warning(
|
||||
@@ -1308,6 +1321,9 @@ class GeminiTTSService(GoogleBaseTTSService):
|
||||
model="gemini-2.5-flash-tts",
|
||||
voice="Kore",
|
||||
language="en-US",
|
||||
prompt=None,
|
||||
multi_speaker=False,
|
||||
speaker_configs=None,
|
||||
)
|
||||
|
||||
# 2. Apply direct init arg overrides (deprecated)
|
||||
|
||||
@@ -68,7 +68,7 @@ def language_to_gradium_language(language: Language) -> Optional[str]:
|
||||
|
||||
@dataclass
|
||||
class GradiumSTTSettings(STTSettings):
|
||||
"""Settings for the Gradium STT service."""
|
||||
"""Settings for GradiumSTTService."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
@@ -93,14 +93,14 @@ def language_to_nvidia_riva_language(language: Language) -> Optional[str]:
|
||||
|
||||
@dataclass
|
||||
class NvidiaSTTSettings(STTSettings):
|
||||
"""Settings for the NVIDIA Riva streaming STT service."""
|
||||
"""Settings for NvidiaSTTService."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class NvidiaSegmentedSTTSettings(STTSettings):
|
||||
"""Settings for the NVIDIA Riva segmented STT service.
|
||||
"""Settings for NvidiaSegmentedSTTService.
|
||||
|
||||
Parameters:
|
||||
profanity_filter: Whether to filter profanity from results.
|
||||
|
||||
@@ -182,7 +182,7 @@ _OPENAI_SAMPLE_RATE = 24000
|
||||
|
||||
@dataclass
|
||||
class OpenAIRealtimeSTTSettings(STTSettings):
|
||||
"""Settings for the OpenAI Realtime STT service.
|
||||
"""Settings for OpenAIRealtimeSTTService.
|
||||
|
||||
Parameters:
|
||||
prompt: Optional prompt text to guide transcription style.
|
||||
|
||||
@@ -134,7 +134,7 @@ class OpenAIRealtimeBetaLLMService(LLMService):
|
||||
api_key: OpenAI API key for authentication.
|
||||
model: OpenAI model name.
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=OpenAIRealtimeBetaLLMSettings(model=...)`` instead.
|
||||
|
||||
base_url: WebSocket base URL for the realtime API.
|
||||
|
||||
@@ -139,7 +139,7 @@ MODEL_CONFIGS: Dict[str, ModelConfig] = {
|
||||
|
||||
@dataclass
|
||||
class SarvamSTTSettings(STTSettings):
|
||||
"""Settings for the Sarvam STT service.
|
||||
"""Settings for SarvamSTTService.
|
||||
|
||||
Parameters:
|
||||
prompt: Optional prompt to guide the style and context of transcription/translation.
|
||||
@@ -414,7 +414,7 @@ class SarvamSTTService(STTService):
|
||||
async def set_prompt(self, prompt: Optional[str]):
|
||||
"""Set the transcription/translation prompt and reconnect.
|
||||
|
||||
.. deprecated::
|
||||
.. deprecated:: 0.0.104
|
||||
Use ``STTUpdateSettingsFrame(SarvamSTTSettings(prompt=...))`` instead.
|
||||
|
||||
Args:
|
||||
|
||||
@@ -141,7 +141,7 @@ def _prepare_language_hints(
|
||||
|
||||
@dataclass
|
||||
class SonioxSTTSettings(STTSettings):
|
||||
"""Settings for Soniox STT service.
|
||||
"""Settings for SonioxSTTService.
|
||||
|
||||
Parameters:
|
||||
language_hints: List of language hints to use for transcription.
|
||||
|
||||
@@ -85,12 +85,11 @@ class TurnDetectionMode(str, Enum):
|
||||
|
||||
@dataclass
|
||||
class SpeechmaticsSTTSettings(STTSettings):
|
||||
"""Settings for Speechmatics STT service.
|
||||
"""Settings for SpeechmaticsSTTService.
|
||||
|
||||
See ``SpeechmaticsSTTService.InputParams`` for detailed descriptions of each field.
|
||||
|
||||
Parameters:
|
||||
model: The operating point / model name.
|
||||
domain: Domain for Speechmatics API.
|
||||
turn_detection_mode: Endpoint handling mode.
|
||||
speaker_active_format: Formatter for active speaker ID.
|
||||
@@ -490,16 +489,16 @@ class SpeechmaticsSTTService(STTService):
|
||||
default_settings.prefer_current_speaker = _params.prefer_current_speaker
|
||||
default_settings.extra_params = _params.extra_params
|
||||
|
||||
# Build SDK config from settings, then resolve model from operating_point
|
||||
# --- 4. Settings delta (canonical API, always wins) ---
|
||||
if settings is not None:
|
||||
default_settings.apply_update(settings)
|
||||
|
||||
# Build SDK config from settings, then set the model name before calling super().__init__()
|
||||
self._client: VoiceAgentClient | None = None
|
||||
self._audio_encoding = encoding
|
||||
self._config: VoiceAgentConfig = self._build_config(default_settings)
|
||||
default_settings.model = self._config.operating_point.value
|
||||
|
||||
# --- 4. Settings delta (canonical API, always wins) ---
|
||||
if settings is not None:
|
||||
default_settings.apply_update(settings)
|
||||
|
||||
super().__init__(
|
||||
sample_rate=sample_rate,
|
||||
ttfs_p99_latency=ttfs_p99_latency,
|
||||
|
||||
@@ -28,7 +28,7 @@ from pipecat.utils.tracing.service_decorators import traced_stt
|
||||
|
||||
@dataclass
|
||||
class BaseWhisperSTTSettings(STTSettings):
|
||||
"""Settings for Whisper API-based STT services.
|
||||
"""Settings for BaseWhisperSTTService.
|
||||
|
||||
Parameters:
|
||||
prompt: Optional text to guide the model's style or continue
|
||||
|
||||
@@ -176,7 +176,7 @@ def language_to_whisper_language(language: Language) -> Optional[str]:
|
||||
|
||||
@dataclass
|
||||
class WhisperSTTSettings(STTSettings):
|
||||
"""Settings for the local Whisper (Faster Whisper) STT service.
|
||||
"""Settings for WhisperSTTService.
|
||||
|
||||
Parameters:
|
||||
no_speech_prob: Probability threshold for filtering non-speech segments.
|
||||
@@ -187,7 +187,7 @@ class WhisperSTTSettings(STTSettings):
|
||||
|
||||
@dataclass
|
||||
class WhisperMLXSTTSettings(STTSettings):
|
||||
"""Settings for the MLX Whisper STT service.
|
||||
"""Settings for WhisperMLXSTTService.
|
||||
|
||||
Parameters:
|
||||
no_speech_prob: Probability threshold for filtering non-speech segments.
|
||||
|
||||
Reference in New Issue
Block a user