diff --git a/COMMUNITY_INTEGRATIONS.md b/COMMUNITY_INTEGRATIONS.md index 5dd9e5764..e17c72bc7 100644 --- a/COMMUNITY_INTEGRATIONS.md +++ b/COMMUNITY_INTEGRATIONS.md @@ -233,14 +233,14 @@ def can_generate_metrics(self) -> bool: ### Service Settings -Every STT, LLM, TTS, and image-generation service exposes a **Settings dataclass** that serves two roles: +Every AI service (STT, LLM, TTS, image generation, etc.) exposes a **Settings dataclass** that serves two roles: 1. **Store mode** — the service's `self._settings` holds the current value of every runtime-updatable field. -2. **Delta mode** — an update frame carries only the fields that changed; unset fields remain `NOT_GIVEN`. +2. **Delta mode** — an update frame (e.g. `TTSUpdateSettingsFrame`) specifies only the fields that should change; unspecified fields remain `NOT_GIVEN`. #### Defining your Settings class -Extend `STTSettings`, `TTSSettings`, `LLMSettings`, or `ImageGenSettings`. The base classes already provide common fields (e.g. `model`, `voice`, `language`). You only need to add **service-specific knobs that should be runtime-updatable**: +Extend `STTSettings`, `TTSSettings`, `LLMSettings`, or `ImageGenSettings` (or, if your service directly subclasses `AIService`, `ServiceSettings`). The base classes already provide common fields (e.g. `model`, `voice`, `language`). You only need to add **service-specific knobs that should be runtime-updatable**: ```python from dataclasses import dataclass, field @@ -320,7 +320,7 @@ svc = MyTTSService( #### Reacting to runtime changes -STT, LLM, and TTS services support runtime configuration changes via `*UpdateSettingsFrame`s (e.g. `STTUpdateSettingsFrame`, `TTSUpdateSettingsFrame`, `LLMUpdateSettingsFrame`). +AI services support runtime configuration changes via `*UpdateSettingsFrame`s (e.g. `STTUpdateSettingsFrame`, `TTSUpdateSettingsFrame`, `LLMUpdateSettingsFrame`). To react to runtime setting changes, override `_update_settings`. The base implementation applies the delta to `self._settings` and returns a `dict` mapping each changed field name to its **pre-update** value. Your override should call `super()` first, then act on the changed fields. A common implementation might look like: diff --git a/examples/foundational/07f-interruptible-azure-http.py b/examples/foundational/07f-interruptible-azure-http.py index 407022f75..ec5068b25 100644 --- a/examples/foundational/07f-interruptible-azure-http.py +++ b/examples/foundational/07f-interruptible-azure-http.py @@ -65,8 +65,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): llm = AzureLLMService( api_key=os.getenv("AZURE_CHATGPT_API_KEY"), endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"), - model=os.getenv("AZURE_CHATGPT_MODEL"), settings=AzureLLMSettings( + model=os.getenv("AZURE_CHATGPT_MODEL"), system_instruction="You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.", ), ) diff --git a/examples/foundational/07f-interruptible-azure.py b/examples/foundational/07f-interruptible-azure.py index 7e47e1c3e..d905d59bb 100644 --- a/examples/foundational/07f-interruptible-azure.py +++ b/examples/foundational/07f-interruptible-azure.py @@ -65,8 +65,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): llm = AzureLLMService( api_key=os.getenv("AZURE_CHATGPT_API_KEY"), endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"), - model=os.getenv("AZURE_CHATGPT_MODEL"), settings=AzureLLMSettings( + model=os.getenv("AZURE_CHATGPT_MODEL"), system_instruction="You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.", ), ) diff --git a/examples/foundational/07m-interruptible-aws.py b/examples/foundational/07m-interruptible-aws.py index 8b7c0c29e..9cea445c9 100644 --- a/examples/foundational/07m-interruptible-aws.py +++ b/examples/foundational/07m-interruptible-aws.py @@ -63,9 +63,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): llm = AWSBedrockLLMService( aws_region="us-west-2", - model="us.anthropic.claude-haiku-4-5-20251001-v1:0", - params=AWSBedrockLLMService.InputParams(temperature=0.8), settings=AWSBedrockLLMSettings( + model="us.anthropic.claude-haiku-4-5-20251001-v1:0", + temperature=0.8, system_instruction="You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.", ), ) diff --git a/examples/foundational/07n-interruptible-google.py b/examples/foundational/07n-interruptible-google.py index 522d168e1..4f5409085 100644 --- a/examples/foundational/07n-interruptible-google.py +++ b/examples/foundational/07n-interruptible-google.py @@ -55,8 +55,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): stt = GoogleSTTService( settings=GoogleSTTSettings( - languages=Language.EN_US, - model="chirp_3", + languages=[Language.EN_US], + # Add model to use a specific model + # model="chirp_3", ), credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"), location="us", diff --git a/examples/foundational/07o-interruptible-assemblyai-turn-detection.py b/examples/foundational/07o-interruptible-assemblyai-turn-detection.py index 114c6f263..2c2ca5419 100644 --- a/examples/foundational/07o-interruptible-assemblyai-turn-detection.py +++ b/examples/foundational/07o-interruptible-assemblyai-turn-detection.py @@ -94,7 +94,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): api_key=os.getenv("ASSEMBLYAI_API_KEY"), vad_force_turn_endpoint=False, # Use AssemblyAI's built-in turn detection settings=AssemblyAISTTSettings( - speech_model="u3-rt-pro", + model="u3-rt-pro", # Optional: Tune turn detection timing (defaults shown below) # min_turn_silence=100, # Default # max_turn_silence=1000, # Default diff --git a/examples/foundational/07za-interruptible-soniox.py b/examples/foundational/07za-interruptible-soniox.py index 78e3ac873..71011c5d9 100644 --- a/examples/foundational/07za-interruptible-soniox.py +++ b/examples/foundational/07za-interruptible-soniox.py @@ -51,13 +51,13 @@ transport_params = { async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): logger.info(f"Starting bot") - stt = ( - SonioxSTTService( - api_key=os.getenv("SONIOX_API_KEY"), - settings=SonioxSTTSettings( - language_hints=[Language.EN], - language_hints_strict=True, - ), + stt = SonioxSTTService( + api_key=os.getenv("SONIOX_API_KEY"), + settings=SonioxSTTSettings( + # Add language hints to use a specific language + # Add strict mode to enforce the language hints + language_hints=[Language.EN], + language_hints_strict=True, ), ) diff --git a/examples/foundational/14d-function-calling-aws-video.py b/examples/foundational/14d-function-calling-aws-video.py index 358c6f7f0..75235bc01 100644 --- a/examples/foundational/14d-function-calling-aws-video.py +++ b/examples/foundational/14d-function-calling-aws-video.py @@ -99,7 +99,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): llm = AWSBedrockLLMService( aws_region="us-west-2", settings=AWSBedrockLLMSettings( - model="us.anthropic.claude-3-7-sonnet-20250219-v1:0", + model="us.anthropic.claude-sonnet-4-6", # Note: usually, prefer providing latency="optimized" param. # Here we can't because AWS Bedrock doesn't support it for Claude 3.7, # which we need for image input. @@ -170,7 +170,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): context.add_message( { "role": "user", - "content": f"Please introduce yourself to the user. Use '{client_id}' as the user ID during function calls.", + "content": f"Please introduce yourself to the user briefly; don't mention the camera. Use '{client_id}' as the user ID during function calls.", } ) await task.queue_frames([LLMRunFrame()]) diff --git a/examples/foundational/55zzn-update-settings-groq-stt.py b/examples/foundational/55zzn-update-settings-groq-stt.py index f3f3ffa01..dc7ba6d23 100644 --- a/examples/foundational/55zzn-update-settings-groq-stt.py +++ b/examples/foundational/55zzn-update-settings-groq-stt.py @@ -99,7 +99,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): @transport.event_handler("on_client_connected") async def on_client_connected(transport, client): logger.info(f"Client connected") - context.add_message({"user": "system", "content": "Please introduce yourself to the user."}) + context.add_message({"role": "user", "content": "Please introduce yourself to the user."}) await task.queue_frames([LLMRunFrame()]) await asyncio.sleep(10) diff --git a/src/pipecat/processors/aggregators/llm_context.py b/src/pipecat/processors/aggregators/llm_context.py index 4b0e95aa7..1375b8297 100644 --- a/src/pipecat/processors/aggregators/llm_context.py +++ b/src/pipecat/processors/aggregators/llm_context.py @@ -255,7 +255,7 @@ class LLMContext: this method, which is part of the public API of OpenAILLMContext but doesn't need to be for LLMContext. - .. deprecated:: + .. deprecated:: 0.0.92 Use `get_messages()` instead. Returns: diff --git a/src/pipecat/processors/user_idle_processor.py b/src/pipecat/processors/user_idle_processor.py index 67c41ab13..f7ea48599 100644 --- a/src/pipecat/processors/user_idle_processor.py +++ b/src/pipecat/processors/user_idle_processor.py @@ -27,7 +27,7 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor class UserIdleProcessor(FrameProcessor): """Monitors user inactivity and triggers callbacks after timeout periods. - .. deprecated:: + .. deprecated:: 0.0.100 UserIdleProcessor is deprecated in 0.0.100 and will be removed in a future version. Use LLMUserAggregator with user_idle_timeout parameter instead. diff --git a/src/pipecat/services/anthropic/llm.py b/src/pipecat/services/anthropic/llm.py index 369c2a4ed..49f7f58b4 100644 --- a/src/pipecat/services/anthropic/llm.py +++ b/src/pipecat/services/anthropic/llm.py @@ -170,7 +170,7 @@ class AnthropicLLMService(LLMService): class InputParams(BaseModel): """Input parameters for Anthropic model inference. - .. deprecated:: + .. deprecated:: 0.0.105 Use ``AnthropicLLMSettings`` instead. Pass settings directly via the ``settings`` parameter of :class:`AnthropicLLMService`. @@ -231,12 +231,12 @@ class AnthropicLLMService(LLMService): api_key: Anthropic API key for authentication. model: Model name to use. - .. deprecated:: + .. deprecated:: 0.0.105 Use ``settings=AnthropicLLMSettings(model=...)`` instead. params: Optional model parameters for inference. - .. deprecated:: + .. deprecated:: 0.0.105 Use ``settings=AnthropicLLMSettings(...)`` instead. settings: Runtime-updatable settings for this service. When both diff --git a/src/pipecat/services/assemblyai/stt.py b/src/pipecat/services/assemblyai/stt.py index ec4130ea5..3d10c970f 100644 --- a/src/pipecat/services/assemblyai/stt.py +++ b/src/pipecat/services/assemblyai/stt.py @@ -81,7 +81,7 @@ def map_language_from_assemblyai(language_code: str) -> Language: @dataclass class AssemblyAISTTSettings(STTSettings): - """Settings for the AssemblyAI STT service. + """Settings for AssemblyAISTTService. Parameters: formatted_finals: Whether to enable transcript formatting. @@ -99,6 +99,8 @@ class AssemblyAISTTSettings(STTSettings): language_detection: Enable automatic language detection. format_turns: Whether to format transcript turns. speaker_labels: Enable speaker diarization. + vad_threshold: VAD confidence threshold (0.0–1.0) for classifying + audio frames as silence. Only applicable to u3-rt-pro. """ formatted_finals: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN) @@ -115,6 +117,7 @@ class AssemblyAISTTSettings(STTSettings): language_detection: bool | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN) format_turns: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN) speaker_labels: bool | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN) + vad_threshold: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN) class AssemblyAISTTService(WebsocketSTTService): @@ -199,6 +202,7 @@ class AssemblyAISTTService(WebsocketSTTService): language_detection=None, format_turns=True, speaker_labels=None, + vad_threshold=None, ) # 2. Apply direct init arg overrides (deprecated) @@ -227,6 +231,7 @@ class AssemblyAISTTService(WebsocketSTTService): default_settings.language_detection = connection_params.language_detection default_settings.format_turns = connection_params.format_turns default_settings.speaker_labels = connection_params.speaker_labels + default_settings.vad_threshold = connection_params.vad_threshold # 4. Apply settings delta (canonical API, always wins) if settings is not None: @@ -463,6 +468,7 @@ class AssemblyAISTTService(WebsocketSTTService): "language_detection": s.language_detection, "format_turns": s.format_turns, "speaker_labels": s.speaker_labels, + "vad_threshold": s.vad_threshold, } for k, v in optional_fields.items(): @@ -651,7 +657,7 @@ class AssemblyAISTTService(WebsocketSTTService): await self.start_processing_metrics() await self.broadcast_frame(UserStartedSpeakingFrame) if self._should_interrupt: - await self.push_interruption_task_frame_and_wait() + await self.broadcast_interruption() self._user_speaking = True async def _handle_termination(self, message: TerminationMessage): diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py index 34a0dd780..5b8c80996 100644 --- a/src/pipecat/services/aws/llm.py +++ b/src/pipecat/services/aws/llm.py @@ -754,7 +754,7 @@ class AWSBedrockLLMService(LLMService): class InputParams(BaseModel): """Input parameters for AWS Bedrock LLM service. - .. deprecated:: + .. deprecated:: 0.0.105 Use ``AWSBedrockLLMSettings`` instead. Pass settings directly via the ``settings`` parameter of :class:`AWSBedrockLLMService`. @@ -795,7 +795,7 @@ class AWSBedrockLLMService(LLMService): Args: model: The AWS Bedrock model identifier to use. - .. deprecated:: + .. deprecated:: 0.0.105 Use ``settings=AWSBedrockLLMSettings(model=...)`` instead. aws_access_key: AWS access key ID. If None, uses default credentials. @@ -804,7 +804,7 @@ class AWSBedrockLLMService(LLMService): aws_region: AWS region for the Bedrock service. params: Model parameters and configuration. - .. deprecated:: + .. deprecated:: 0.0.105 Use ``settings=AWSBedrockLLMSettings(...)`` instead. settings: Runtime-updatable settings for this service. When both diff --git a/src/pipecat/services/aws/nova_sonic/llm.py b/src/pipecat/services/aws/nova_sonic/llm.py index 3948ae1eb..3acc1d0fc 100644 --- a/src/pipecat/services/aws/nova_sonic/llm.py +++ b/src/pipecat/services/aws/nova_sonic/llm.py @@ -280,7 +280,7 @@ class AWSNovaSonicLLMService(LLMService): - Nova Sonic (the older model): "us-east-1", "ap-northeast-1" model: Model identifier. Defaults to "amazon.nova-2-sonic-v1:0". - .. deprecated:: + .. deprecated:: 0.0.105 Use ``settings=AWSNovaSonicLLMSettings(model=...)`` instead. voice_id: Voice ID for speech synthesis. @@ -289,7 +289,7 @@ class AWSNovaSonicLLMService(LLMService): - Nova 2 Sonic (the default model): see https://docs.aws.amazon.com/nova/latest/nova2-userguide/sonic-language-support.html - Nova Sonic (the older model): see https://docs.aws.amazon.com/nova/latest/userguide/available-voices.html. - .. deprecated:: + .. deprecated:: 0.0.105 Use ``settings=AWSNovaSonicLLMSettings(voice=...)`` instead. params: Model parameters for audio configuration and inference. diff --git a/src/pipecat/services/aws/stt.py b/src/pipecat/services/aws/stt.py index f46f5259c..879fa99ab 100644 --- a/src/pipecat/services/aws/stt.py +++ b/src/pipecat/services/aws/stt.py @@ -47,7 +47,7 @@ except ModuleNotFoundError as e: @dataclass class AWSTranscribeSTTSettings(STTSettings): - """Settings for the AWS Transcribe STT service.""" + """Settings for AWSTranscribeSTTService.""" pass @@ -99,13 +99,13 @@ class AWSTranscribeSTTService(WebsocketSTTService): # 1. Initialize default_settings with hardcoded defaults default_settings = AWSTranscribeSTTSettings( model=None, - language=self.language_to_service_language(Language.EN) or "en-US", + language=self.language_to_service_language(Language.EN), ) # 2. Apply direct init arg overrides (deprecated) if language is not None: _warn_deprecated_param("language", AWSTranscribeSTTSettings, "language") - default_settings.language = self.language_to_service_language(language) or "en-US" + default_settings.language = self.language_to_service_language(language) # 3. No params to apply diff --git a/src/pipecat/services/azure/stt.py b/src/pipecat/services/azure/stt.py index 8e6204c5e..c1db76b82 100644 --- a/src/pipecat/services/azure/stt.py +++ b/src/pipecat/services/azure/stt.py @@ -53,7 +53,7 @@ except ModuleNotFoundError as e: @dataclass class AzureSTTSettings(STTSettings): - """Settings for the Azure STT service.""" + """Settings for AzureSTTService.""" pass diff --git a/src/pipecat/services/cartesia/stt.py b/src/pipecat/services/cartesia/stt.py index 67416016e..cdf46d50a 100644 --- a/src/pipecat/services/cartesia/stt.py +++ b/src/pipecat/services/cartesia/stt.py @@ -46,7 +46,7 @@ except ModuleNotFoundError as e: @dataclass class CartesiaSTTSettings(STTSettings): - """Settings for the Cartesia STT service.""" + """Settings for CartesiaSTTService.""" pass diff --git a/src/pipecat/services/deepgram/flux/stt.py b/src/pipecat/services/deepgram/flux/stt.py index 1e7f135b2..1ed28a749 100644 --- a/src/pipecat/services/deepgram/flux/stt.py +++ b/src/pipecat/services/deepgram/flux/stt.py @@ -71,7 +71,7 @@ class FluxEventType(str, Enum): @dataclass class DeepgramFluxSTTSettings(STTSettings): - """Settings for the Deepgram Flux STT service. + """Settings for DeepgramFluxSTTService. Parameters: eager_eot_threshold: EagerEndOfTurn/TurnResumed threshold. Off by default. @@ -81,7 +81,6 @@ class DeepgramFluxSTTSettings(STTSettings): eot_timeout_ms: Time in ms after speech to finish a turn regardless of EOT confidence (default 5000). keyterm: Keyterms to boost recognition accuracy for specialized terminology. - tag: Tags to label requests for identification during usage reporting. min_confidence: Minimum confidence required to create a TranscriptionFrame. """ @@ -89,7 +88,6 @@ class DeepgramFluxSTTSettings(STTSettings): eot_threshold: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN) eot_timeout_ms: int | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN) keyterm: list | _NotGiven = field(default_factory=lambda: NOT_GIVEN) - tag: list | _NotGiven = field(default_factory=lambda: NOT_GIVEN) min_confidence: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN) @@ -157,6 +155,7 @@ class DeepgramFluxSTTService(WebsocketSTTService): mip_opt_out: Optional[bool] = None, model: Optional[str] = None, flux_encoding: str = "linear16", + tag: Optional[list] = None, params: Optional[InputParams] = None, should_interrupt: bool = True, settings: Optional[DeepgramFluxSTTSettings] = None, @@ -177,6 +176,7 @@ class DeepgramFluxSTTService(WebsocketSTTService): flux_encoding: Audio encoding format required by Flux API. Must be "linear16". Raw signed little-endian 16-bit PCM encoding. + tag: Tags to label requests for identification during usage reporting. params: InputParams instance containing detailed API configuration options. .. deprecated:: 0.0.105 @@ -224,7 +224,6 @@ class DeepgramFluxSTTService(WebsocketSTTService): eot_threshold=None, eot_timeout_ms=None, keyterm=[], - tag=[], min_confidence=None, ) @@ -241,7 +240,8 @@ class DeepgramFluxSTTService(WebsocketSTTService): default_settings.eot_threshold = params.eot_threshold default_settings.eot_timeout_ms = params.eot_timeout_ms default_settings.keyterm = params.keyterm or [] - default_settings.tag = params.tag or [] + if params.tag and tag is None: + tag = params.tag default_settings.min_confidence = params.min_confidence if params.mip_opt_out is not None: mip_opt_out = params.mip_opt_out @@ -261,6 +261,7 @@ class DeepgramFluxSTTService(WebsocketSTTService): self._should_interrupt = should_interrupt self._encoding = flux_encoding self._mip_opt_out = mip_opt_out + self._tag = tag or [] self._websocket_url = None self._receive_task = None @@ -469,7 +470,7 @@ class DeepgramFluxSTTService(WebsocketSTTService): url_params.append(urlencode({"keyterm": keyterm})) # Add tag parameters (can have multiple) - for tag_value in self._settings.tag: + for tag_value in self._tag: url_params.append(urlencode({"tag": tag_value})) self._websocket_url = f"{self._url}?{'&'.join(url_params)}" diff --git a/src/pipecat/services/deepgram/stt.py b/src/pipecat/services/deepgram/stt.py index d377fe69a..caa6233b3 100644 --- a/src/pipecat/services/deepgram/stt.py +++ b/src/pipecat/services/deepgram/stt.py @@ -177,7 +177,7 @@ class LiveOptions: @dataclass class DeepgramSTTSettings(STTSettings): - """Settings for Deepgram STT services. + """Settings for DeepgramSTTService. ``model`` and ``language`` are inherited from ``STTSettings`` / ``ServiceSettings``. Additional Deepgram connection params may diff --git a/src/pipecat/services/elevenlabs/stt.py b/src/pipecat/services/elevenlabs/stt.py index f9899f7f1..230e8a368 100644 --- a/src/pipecat/services/elevenlabs/stt.py +++ b/src/pipecat/services/elevenlabs/stt.py @@ -179,19 +179,19 @@ class CommitStrategy(str, Enum): @dataclass class ElevenLabsSTTSettings(STTSettings): - """Settings for the ElevenLabs file-based STT service. + """Settings for ElevenLabsSTTService. Parameters: tag_audio_events: Whether to include audio events like (laughter), (coughing) in the transcription. """ - tag_audio_events: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN) + tag_audio_events: bool | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN) @dataclass class ElevenLabsRealtimeSTTSettings(STTSettings): - """Settings for the ElevenLabs Realtime STT service. + """Settings for ElevenLabsRealtimeSTTService. See ``ElevenLabsRealtimeSTTService.InputParams`` for detailed descriptions. @@ -277,8 +277,8 @@ class ElevenLabsSTTService(SegmentedSTTService): # 1. Initialize default_settings with hardcoded defaults default_settings = ElevenLabsSTTSettings( model="scribe_v2", - language="eng", - tag_audio_events=True, + language=language_to_elevenlabs_language(Language.EN), + tag_audio_events=None, ) # 2. Apply direct init arg overrides (deprecated) @@ -291,9 +291,7 @@ class ElevenLabsSTTService(SegmentedSTTService): _warn_deprecated_param("params", ElevenLabsSTTSettings) if not settings: if params.language is not None: - default_settings.language = ( - self.language_to_service_language(params.language) or "eng" - ) + default_settings.language = language_to_elevenlabs_language(params.language) default_settings.tag_audio_events = params.tag_audio_events # 4. Apply settings delta (canonical API, always wins) @@ -354,10 +352,11 @@ class ElevenLabsSTTService(SegmentedSTTService): content_type="audio/x-wav", ) - # Add required model_id, language_code, and tag_audio_events + # Add required model_id and language_code data.add_field("model_id", self._settings.model) data.add_field("language_code", self._settings.language) - data.add_field("tag_audio_events", str(self._settings.tag_audio_events).lower()) + if self._settings.tag_audio_events is not None: + data.add_field("tag_audio_events", str(self._settings.tag_audio_events).lower()) async with self._session.post(url, data=data, headers=headers) as response: if response.status != 200: diff --git a/src/pipecat/services/fal/stt.py b/src/pipecat/services/fal/stt.py index 18b8ed85c..92e97d381 100644 --- a/src/pipecat/services/fal/stt.py +++ b/src/pipecat/services/fal/stt.py @@ -143,7 +143,7 @@ def language_to_fal_language(language: Language) -> Optional[str]: @dataclass class FalSTTSettings(STTSettings): - """Settings for the Fal Wizper STT service.""" + """Settings for FalSTTService.""" pass @@ -215,7 +215,7 @@ class FalSTTService(SegmentedSTTService): # 1. Initialize default_settings with hardcoded defaults default_settings = FalSTTSettings( model=None, - language=language_to_fal_language(Language.EN) or "en", + language=language_to_fal_language(Language.EN), ) # 2. (no deprecated direct args for this service) @@ -224,9 +224,8 @@ class FalSTTService(SegmentedSTTService): if params is not None: _warn_deprecated_param("params", FalSTTSettings) if not settings: - default_settings.language = ( - language_to_fal_language(params.language) if params.language else "en" - ) + if params.language is not None: + default_settings.language = language_to_fal_language(params.language) if params.task != "transcribe": task = params.task if params.chunk_level != "segment": diff --git a/src/pipecat/services/gladia/stt.py b/src/pipecat/services/gladia/stt.py index f1eca2dc2..144f37fb2 100644 --- a/src/pipecat/services/gladia/stt.py +++ b/src/pipecat/services/gladia/stt.py @@ -188,7 +188,7 @@ class _InputParamsDescriptor: @dataclass class GladiaSTTSettings(STTSettings): - """Settings for Gladia STT service. + """Settings for GladiaSTTService. Parameters: language_config: Language detection and handling configuration. diff --git a/src/pipecat/services/google/gemini_live/llm.py b/src/pipecat/services/google/gemini_live/llm.py index 146a0fcc4..9f65f57a1 100644 --- a/src/pipecat/services/google/gemini_live/llm.py +++ b/src/pipecat/services/google/gemini_live/llm.py @@ -552,7 +552,7 @@ class ContextWindowCompressionParams(BaseModel): class InputParams(BaseModel): """Input parameters for Gemini Live generation. - .. deprecated:: + .. deprecated:: 0.0.105 Use ``GeminiLiveLLMSettings`` instead. Parameters: @@ -678,7 +678,7 @@ class GeminiLiveLLMService(LLMService): model: Model identifier to use. - .. deprecated:: + .. deprecated:: 0.0.105 Use ``settings=GeminiLiveLLMSettings(model=...)`` instead. voice_id: TTS voice identifier. Defaults to "Charon". @@ -691,7 +691,7 @@ class GeminiLiveLLMService(LLMService): tools: Tools/functions available to the model. Defaults to None. params: Configuration parameters for the model. - .. deprecated:: + .. deprecated:: 0.0.105 Use ``settings=GeminiLiveLLMSettings(...)`` instead. settings: Gemini Live LLM settings. If provided together with deprecated diff --git a/src/pipecat/services/google/gemini_live/llm_vertex.py b/src/pipecat/services/google/gemini_live/llm_vertex.py index 6264ea285..9edd73e61 100644 --- a/src/pipecat/services/google/gemini_live/llm_vertex.py +++ b/src/pipecat/services/google/gemini_live/llm_vertex.py @@ -88,7 +88,7 @@ class GeminiLiveVertexLLMService(GeminiLiveLLMService): project_id: Google Cloud project ID. model: Model identifier to use. - .. deprecated:: + .. deprecated:: 0.0.105 Use ``settings=GeminiLiveLLMSettings(model=...)`` instead. voice_id: TTS voice identifier. Defaults to "Charon". @@ -102,7 +102,7 @@ class GeminiLiveVertexLLMService(GeminiLiveLLMService): params: Configuration parameters for the model along with Vertex AI location and project ID. - .. deprecated:: + .. deprecated:: 0.0.105 Use ``settings=GeminiLiveLLMSettings(...)`` instead. settings: Gemini Live LLM settings. If provided together with deprecated diff --git a/src/pipecat/services/google/llm.py b/src/pipecat/services/google/llm.py index 980d2d576..d5f025bcd 100644 --- a/src/pipecat/services/google/llm.py +++ b/src/pipecat/services/google/llm.py @@ -754,7 +754,7 @@ class GoogleLLMService(LLMService): class InputParams(BaseModel): """Input parameters for Google AI models. - .. deprecated:: + .. deprecated:: 0.0.105 Use ``settings=GoogleLLMSettings(...)`` instead. Parameters: @@ -797,12 +797,12 @@ class GoogleLLMService(LLMService): api_key: Google AI API key for authentication. model: Model name to use. - .. deprecated:: + .. deprecated:: 0.0.105 Use ``settings=GoogleLLMSettings(model=...)`` instead. params: Optional model parameters for inference. - .. deprecated:: + .. deprecated:: 0.0.105 Use ``settings=GoogleLLMSettings(...)`` instead. settings: Runtime-updatable settings for this service. When both diff --git a/src/pipecat/services/google/llm_vertex.py b/src/pipecat/services/google/llm_vertex.py index 42d96333c..cbad30c48 100644 --- a/src/pipecat/services/google/llm_vertex.py +++ b/src/pipecat/services/google/llm_vertex.py @@ -128,14 +128,14 @@ class GoogleVertexLLMService(GoogleLLMService): credentials_path: Path to the service account JSON file. model: Model identifier (e.g., "gemini-2.5-flash"). - .. deprecated:: + .. deprecated:: 0.0.105 Use ``settings=GoogleLLMSettings(model=...)`` instead. location: GCP region for Vertex AI endpoint (e.g., "us-east4"). project_id: Google Cloud project ID. params: Input parameters for the model. - .. deprecated:: + .. deprecated:: 0.0.105 Use ``settings=GoogleLLMSettings(...)`` instead. settings: Runtime-updatable settings for this service. When both diff --git a/src/pipecat/services/google/stt.py b/src/pipecat/services/google/stt.py index 376e74a6d..131d0e9d1 100644 --- a/src/pipecat/services/google/stt.py +++ b/src/pipecat/services/google/stt.py @@ -360,7 +360,7 @@ def language_to_google_stt_language(language: Language) -> Optional[str]: @dataclass class GoogleSTTSettings(STTSettings): - """Settings for Google Cloud Speech-to-Text V2. + """Settings for GoogleSTTService. Parameters: languages: List of ``Language`` enums for recognition @@ -653,7 +653,7 @@ class GoogleSTTService(STTService): async def set_languages(self, languages: List[Language]): """Update the service's recognition languages. - .. deprecated:: + .. deprecated:: 0.0.104 Use ``STTUpdateSettingsFrame`` with ``GoogleSTTSettings(languages=...)`` instead. diff --git a/src/pipecat/services/google/tts.py b/src/pipecat/services/google/tts.py index 071d731b1..984989fa5 100644 --- a/src/pipecat/services/google/tts.py +++ b/src/pipecat/services/google/tts.py @@ -482,7 +482,7 @@ def language_to_gemini_tts_language(language: Language) -> Optional[str]: @dataclass class GoogleHttpTTSSettings(TTSSettings): - """Settings for Google HTTP TTS service. + """Settings for GoogleHttpTTSService. Parameters: pitch: Voice pitch adjustment (e.g., "+2st", "-50%"). @@ -512,8 +512,8 @@ class GoogleHttpTTSSettings(TTSSettings): @dataclass -class GoogleStreamTTSSettings(TTSSettings): - """Settings for Google streaming TTS service. +class GoogleTTSSettings(TTSSettings): + """Settings for GoogleTTSService. Parameters: speaking_rate: The speaking rate, in the range [0.25, 2.0]. @@ -522,9 +522,14 @@ class GoogleStreamTTSSettings(TTSSettings): speaking_rate: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN) +#: .. deprecated:: 0.0.105 +#: Use ``GoogleTTSSettings`` instead. +GoogleStreamTTSSettings = GoogleTTSSettings + + @dataclass class GeminiTTSSettings(TTSSettings): - """Settings for Gemini TTS service. + """Settings for GeminiTTSService. Parameters: prompt: Optional style instructions for how to synthesize the content. @@ -619,6 +624,13 @@ class GoogleHttpTTSService(TTSService): model=None, voice="en-US-Chirp3-HD-Charon", language="en-US", + pitch=None, + rate=None, + speaking_rate=None, + volume=None, + emphasis=None, + gender=None, + google_style=None, ) # 2. Apply direct init arg overrides (deprecated) @@ -1008,13 +1020,13 @@ class GoogleTTSService(GoogleBaseTTSService): ) """ - _settings: GoogleStreamTTSSettings + _settings: GoogleTTSSettings class InputParams(BaseModel): """Input parameters for Google streaming TTS configuration. .. deprecated:: 0.0.105 - Use ``GoogleStreamTTSSettings`` directly via the ``settings`` parameter instead. + Use ``GoogleTTSSettings`` directly via the ``settings`` parameter instead. Parameters: language: Language for synthesis. Defaults to English. @@ -1034,7 +1046,7 @@ class GoogleTTSService(GoogleBaseTTSService): voice_cloning_key: Optional[str] = None, sample_rate: Optional[int] = None, params: Optional[InputParams] = None, - settings: Optional[GoogleStreamTTSSettings] = None, + settings: Optional[GoogleTTSSettings] = None, **kwargs, ): """Initializes the Google streaming TTS service. @@ -1046,34 +1058,35 @@ class GoogleTTSService(GoogleBaseTTSService): voice_id: Google TTS voice identifier (e.g., "en-US-Chirp3-HD-Charon"). .. deprecated:: 0.0.105 - Use ``settings=GoogleStreamTTSSettings(voice=...)`` instead. + Use ``settings=GoogleTTSSettings(voice=...)`` instead. voice_cloning_key: The voice cloning key for Chirp 3 custom voices. sample_rate: Audio sample rate in Hz. If None, uses default. params: Language configuration parameters. .. deprecated:: 0.0.105 - Use ``settings=GoogleStreamTTSSettings(...)`` instead. + Use ``settings=GoogleTTSSettings(...)`` instead. settings: Runtime-updatable settings. When provided alongside deprecated parameters, ``settings`` values take precedence. **kwargs: Additional arguments passed to parent TTSService. """ # 1. Initialize default_settings with hardcoded defaults - default_settings = GoogleStreamTTSSettings( + default_settings = GoogleTTSSettings( model=None, voice="en-US-Chirp3-HD-Charon", language="en-US", + speaking_rate=None, ) # 2. Apply direct init arg overrides (deprecated) if voice_id is not None: - _warn_deprecated_param("voice_id", GoogleStreamTTSSettings, "voice") + _warn_deprecated_param("voice_id", GoogleTTSSettings, "voice") default_settings.voice = voice_id # 3. Apply params overrides — only if settings not provided if params is not None: - _warn_deprecated_param("params", GoogleStreamTTSSettings) + _warn_deprecated_param("params", GoogleTTSSettings) if not settings: if params.language is not None: default_settings.language = self.language_to_service_language(params.language) @@ -1104,7 +1117,7 @@ class GoogleTTSService(GoogleBaseTTSService): Args: delta: Settings delta. Can include 'speaking_rate' (float). """ - if isinstance(delta, GoogleStreamTTSSettings) and is_given(delta.speaking_rate): + if isinstance(delta, GoogleTTSSettings) and is_given(delta.speaking_rate): rate_value = float(delta.speaking_rate) if not (0.25 <= rate_value <= 2.0): logger.warning( @@ -1308,6 +1321,9 @@ class GeminiTTSService(GoogleBaseTTSService): model="gemini-2.5-flash-tts", voice="Kore", language="en-US", + prompt=None, + multi_speaker=False, + speaker_configs=None, ) # 2. Apply direct init arg overrides (deprecated) diff --git a/src/pipecat/services/gradium/stt.py b/src/pipecat/services/gradium/stt.py index e8ab072d0..2a912c355 100644 --- a/src/pipecat/services/gradium/stt.py +++ b/src/pipecat/services/gradium/stt.py @@ -68,7 +68,7 @@ def language_to_gradium_language(language: Language) -> Optional[str]: @dataclass class GradiumSTTSettings(STTSettings): - """Settings for the Gradium STT service.""" + """Settings for GradiumSTTService.""" pass diff --git a/src/pipecat/services/nvidia/stt.py b/src/pipecat/services/nvidia/stt.py index 6823965a9..3d6d2391e 100644 --- a/src/pipecat/services/nvidia/stt.py +++ b/src/pipecat/services/nvidia/stt.py @@ -93,14 +93,14 @@ def language_to_nvidia_riva_language(language: Language) -> Optional[str]: @dataclass class NvidiaSTTSettings(STTSettings): - """Settings for the NVIDIA Riva streaming STT service.""" + """Settings for NvidiaSTTService.""" pass @dataclass class NvidiaSegmentedSTTSettings(STTSettings): - """Settings for the NVIDIA Riva segmented STT service. + """Settings for NvidiaSegmentedSTTService. Parameters: profanity_filter: Whether to filter profanity from results. diff --git a/src/pipecat/services/openai/stt.py b/src/pipecat/services/openai/stt.py index 0e1e4f594..a6d7beb0c 100644 --- a/src/pipecat/services/openai/stt.py +++ b/src/pipecat/services/openai/stt.py @@ -182,7 +182,7 @@ _OPENAI_SAMPLE_RATE = 24000 @dataclass class OpenAIRealtimeSTTSettings(STTSettings): - """Settings for the OpenAI Realtime STT service. + """Settings for OpenAIRealtimeSTTService. Parameters: prompt: Optional prompt text to guide transcription style. diff --git a/src/pipecat/services/openai_realtime_beta/openai.py b/src/pipecat/services/openai_realtime_beta/openai.py index a3f8e47fc..bd9dc29b0 100644 --- a/src/pipecat/services/openai_realtime_beta/openai.py +++ b/src/pipecat/services/openai_realtime_beta/openai.py @@ -134,7 +134,7 @@ class OpenAIRealtimeBetaLLMService(LLMService): api_key: OpenAI API key for authentication. model: OpenAI model name. - .. deprecated:: + .. deprecated:: 0.0.105 Use ``settings=OpenAIRealtimeBetaLLMSettings(model=...)`` instead. base_url: WebSocket base URL for the realtime API. diff --git a/src/pipecat/services/sarvam/stt.py b/src/pipecat/services/sarvam/stt.py index d8659a6e7..3e4136c41 100644 --- a/src/pipecat/services/sarvam/stt.py +++ b/src/pipecat/services/sarvam/stt.py @@ -139,7 +139,7 @@ MODEL_CONFIGS: Dict[str, ModelConfig] = { @dataclass class SarvamSTTSettings(STTSettings): - """Settings for the Sarvam STT service. + """Settings for SarvamSTTService. Parameters: prompt: Optional prompt to guide transcription/translation style/context. @@ -414,7 +414,7 @@ class SarvamSTTService(STTService): async def set_prompt(self, prompt: Optional[str]): """Set the transcription/translation prompt and reconnect. - .. deprecated:: + .. deprecated:: 0.0.104 Use ``STTUpdateSettingsFrame(SarvamSTTSettings(prompt=...))`` instead. Args: diff --git a/src/pipecat/services/soniox/stt.py b/src/pipecat/services/soniox/stt.py index 483cad062..85277a41b 100644 --- a/src/pipecat/services/soniox/stt.py +++ b/src/pipecat/services/soniox/stt.py @@ -141,7 +141,7 @@ def _prepare_language_hints( @dataclass class SonioxSTTSettings(STTSettings): - """Settings for Soniox STT service. + """Settings for SonioxSTTService. Parameters: language_hints: List of language hints to use for transcription. diff --git a/src/pipecat/services/speechmatics/stt.py b/src/pipecat/services/speechmatics/stt.py index b3ded9255..e3100a2ad 100644 --- a/src/pipecat/services/speechmatics/stt.py +++ b/src/pipecat/services/speechmatics/stt.py @@ -85,12 +85,11 @@ class TurnDetectionMode(str, Enum): @dataclass class SpeechmaticsSTTSettings(STTSettings): - """Settings for Speechmatics STT service. + """Settings for SpeechmaticsSTTService. See ``SpeechmaticsSTTService.InputParams`` for detailed descriptions of each field. Parameters: - model: The operating point / model name. domain: Domain for Speechmatics API. turn_detection_mode: Endpoint handling mode. speaker_active_format: Formatter for active speaker ID. @@ -490,16 +489,16 @@ class SpeechmaticsSTTService(STTService): default_settings.prefer_current_speaker = _params.prefer_current_speaker default_settings.extra_params = _params.extra_params - # Build SDK config from settings, then resolve model from operating_point + # --- 4. Settings delta (canonical API, always wins) --- + if settings is not None: + default_settings.apply_update(settings) + + # Build SDK config from settings, set model name before calling super self._client: VoiceAgentClient | None = None self._audio_encoding = encoding self._config: VoiceAgentConfig = self._build_config(default_settings) default_settings.model = self._config.operating_point.value - # --- 4. Settings delta (canonical API, always wins) --- - if settings is not None: - default_settings.apply_update(settings) - super().__init__( sample_rate=sample_rate, ttfs_p99_latency=ttfs_p99_latency, diff --git a/src/pipecat/services/whisper/base_stt.py b/src/pipecat/services/whisper/base_stt.py index 13de0f251..a3bb98548 100644 --- a/src/pipecat/services/whisper/base_stt.py +++ b/src/pipecat/services/whisper/base_stt.py @@ -28,7 +28,7 @@ from pipecat.utils.tracing.service_decorators import traced_stt @dataclass class BaseWhisperSTTSettings(STTSettings): - """Settings for Whisper API-based STT services. + """Settings for BaseWhisperSTTService. Parameters: prompt: Optional text to guide the model's style or continue diff --git a/src/pipecat/services/whisper/stt.py b/src/pipecat/services/whisper/stt.py index af96f92c0..ab5354e2c 100644 --- a/src/pipecat/services/whisper/stt.py +++ b/src/pipecat/services/whisper/stt.py @@ -176,7 +176,7 @@ def language_to_whisper_language(language: Language) -> Optional[str]: @dataclass class WhisperSTTSettings(STTSettings): - """Settings for the local Whisper (Faster Whisper) STT service. + """Settings for WhisperSTTService. Parameters: no_speech_prob: Probability threshold for filtering non-speech segments. @@ -187,7 +187,7 @@ class WhisperSTTSettings(STTSettings): @dataclass class WhisperMLXSTTSettings(STTSettings): - """Settings for the MLX Whisper STT service. + """Settings for WhisperMLXSTTService. Parameters: no_speech_prob: Probability threshold for filtering non-speech segments.