diff --git a/examples/foundational/07j-interruptible-gladia.py b/examples/foundational/07j-interruptible-gladia.py index 7dcd44a7a..2c7d86a79 100644 --- a/examples/foundational/07j-interruptible-gladia.py +++ b/examples/foundational/07j-interruptible-gladia.py @@ -18,9 +18,11 @@ from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext -from pipecat.services.cartesia import CartesiaTTSService -from pipecat.services.gladia import GladiaSTTService -from pipecat.services.openai import OpenAILLMService +from pipecat.services.cartesia.tts import CartesiaTTSService +from pipecat.services.gladia.config import GladiaInputParams, LanguageConfig +from pipecat.services.gladia.stt import GladiaSTTService +from pipecat.services.openai.llm import OpenAILLMService +from pipecat.transcriptions.language import Language from pipecat.transports.services.daily import DailyParams, DailyTransport load_dotenv(override=True) @@ -47,6 +49,11 @@ async def main(): stt = GladiaSTTService( api_key=os.getenv("GLADIA_API_KEY"), + params=GladiaInputParams( + language_config=LanguageConfig( + languages=[Language.EN], + ) + ), ) tts = CartesiaTTSService( diff --git a/src/pipecat/services/gladia/__init__.py b/src/pipecat/services/gladia/__init__.py new file mode 100644 index 000000000..916988e42 --- /dev/null +++ b/src/pipecat/services/gladia/__init__.py @@ -0,0 +1,13 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import sys + +from pipecat.services import DeprecatedModuleProxy + +from .stt import * + +sys.modules[__name__] = DeprecatedModuleProxy(globals(), "gladia", "gladia.stt") diff --git a/src/pipecat/services/gladia/config.py b/src/pipecat/services/gladia/config.py new file mode 100644 index 000000000..6014dd576 --- /dev/null +++ b/src/pipecat/services/gladia/config.py @@ -0,0 +1,165 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +from typing import Any, Dict, List, Optional, Union + +from pydantic import BaseModel + +from pipecat.transcriptions.language import Language + + +class LanguageConfig(BaseModel): + """Configuration for language detection and handling. + + Attributes: + languages: List of language codes to use for transcription + code_switching: Whether to auto-detect language changes during transcription + """ + + languages: Optional[List[str]] = None + code_switching: Optional[bool] = None + + +class PreProcessingConfig(BaseModel): + """Configuration for audio pre-processing options. + + Attributes: + audio_enhancer: Whether to apply audio enhancement + speech_threshold: Sensitivity for speech detection (0-1) + """ + + audio_enhancer: Optional[bool] = None + speech_threshold: Optional[float] = None + + +class CustomVocabularyItem(BaseModel): + """Represents a custom vocabulary item with an intensity value. + + Attributes: + value: The vocabulary word or phrase + intensity: The bias intensity for this vocabulary item (0-1) + """ + + value: str + intensity: float + + +class CustomVocabularyConfig(BaseModel): + """Configuration for custom vocabulary. + + Attributes: + vocabulary: List of words/phrases or CustomVocabularyItem objects + default_intensity: Default intensity for simple string vocabulary items + """ + + vocabulary: Optional[List[Union[str, CustomVocabularyItem]]] = None + default_intensity: Optional[float] = None + + +class CustomSpellingConfig(BaseModel): + """Configuration for custom spelling rules. + + Attributes: + spelling_dictionary: Mapping of correct spellings to phonetic variations + """ + + spelling_dictionary: Optional[Dict[str, List[str]]] = None + + +class TranslationConfig(BaseModel): + """Configuration for real-time translation. + + Attributes: + target_languages: List of target language codes for translation + model: Translation model to use ("base" or "enhanced") + match_original_utterances: Whether to align translations with original utterances + """ + + target_languages: Optional[List[str]] = None + model: Optional[str] = None + match_original_utterances: Optional[bool] = None + + +class RealtimeProcessingConfig(BaseModel): + """Configuration for real-time processing features. + + Attributes: + words_accurate_timestamps: Whether to provide per-word timestamps + custom_vocabulary: Whether to enable custom vocabulary + custom_vocabulary_config: Custom vocabulary configuration + custom_spelling: Whether to enable custom spelling + custom_spelling_config: Custom spelling configuration + translation: Whether to enable translation + translation_config: Translation configuration + named_entity_recognition: Whether to enable named entity recognition + sentiment_analysis: Whether to enable sentiment analysis + """ + + words_accurate_timestamps: Optional[bool] = None + custom_vocabulary: Optional[bool] = None + custom_vocabulary_config: Optional[CustomVocabularyConfig] = None + custom_spelling: Optional[bool] = None + custom_spelling_config: Optional[CustomSpellingConfig] = None + translation: Optional[bool] = None + translation_config: Optional[TranslationConfig] = None + named_entity_recognition: Optional[bool] = None + sentiment_analysis: Optional[bool] = None + + +class MessagesConfig(BaseModel): + """Configuration for controlling which message types are sent via WebSocket. + + Attributes: + receive_partial_transcripts: Whether to receive intermediate transcription results + receive_final_transcripts: Whether to receive final transcription results + receive_speech_events: Whether to receive speech begin/end events + receive_pre_processing_events: Whether to receive pre-processing events + receive_realtime_processing_events: Whether to receive real-time processing events + receive_post_processing_events: Whether to receive post-processing events + receive_acknowledgments: Whether to receive acknowledgment messages + receive_errors: Whether to receive error messages + receive_lifecycle_events: Whether to receive lifecycle events + """ + + receive_partial_transcripts: Optional[bool] = None + receive_final_transcripts: Optional[bool] = None + receive_speech_events: Optional[bool] = None + receive_pre_processing_events: Optional[bool] = None + receive_realtime_processing_events: Optional[bool] = None + receive_post_processing_events: Optional[bool] = None + receive_acknowledgments: Optional[bool] = None + receive_errors: Optional[bool] = None + receive_lifecycle_events: Optional[bool] = None + + +class GladiaInputParams(BaseModel): + """Configuration parameters for the Gladia STT service. + + Attributes: + encoding: Audio encoding format + bit_depth: Audio bit depth + channels: Number of audio channels + custom_metadata: Additional metadata to include with requests + endpointing: Silence duration in seconds to mark end of speech + maximum_duration_without_endpointing: Maximum utterance duration without silence + language: DEPRECATED - Use language_config instead + language_config: Detailed language configuration + pre_processing: Audio pre-processing options + realtime_processing: Real-time processing features + messages_config: WebSocket message filtering options + """ + + encoding: Optional[str] = "wav/pcm" + bit_depth: Optional[int] = 16 + channels: Optional[int] = 1 + custom_metadata: Optional[Dict[str, Any]] = None + endpointing: Optional[float] = None + maximum_duration_without_endpointing: Optional[int] = 10 + language: Optional[Language] = None # Deprecated + language_config: Optional[LanguageConfig] = None + pre_processing: Optional[PreProcessingConfig] = None + realtime_processing: Optional[RealtimeProcessingConfig] = None + messages_config: Optional[MessagesConfig] = None diff --git a/src/pipecat/services/gladia.py b/src/pipecat/services/gladia/stt.py similarity index 56% rename from src/pipecat/services/gladia.py rename to src/pipecat/services/gladia/stt.py index 73c36b74b..87c1c649d 100644 --- a/src/pipecat/services/gladia.py +++ b/src/pipecat/services/gladia/stt.py @@ -7,11 +7,10 @@ import base64 import json import warnings -from typing import Any, AsyncGenerator, Dict, List, Optional, Union +from typing import Any, AsyncGenerator, Dict, Optional import aiohttp from loguru import logger -from pydantic import BaseModel from pipecat.frames.frames import ( CancelFrame, @@ -22,6 +21,7 @@ from pipecat.frames.frames import ( TranscriptionFrame, ) from pipecat.services.ai_services import STTService +from pipecat.services.gladia.config import GladiaInputParams from pipecat.transcriptions.language import Language from pipecat.utils.time import time_now_iso8601 @@ -29,9 +29,7 @@ try: import websockets except ModuleNotFoundError as e: logger.error(f"Exception: {e}") - logger.error( - "In order to use Gladia, you need to `pip install pipecat-ai[gladia]`. Also, set `GLADIA_API_KEY` environment variable." - ) + logger.error("In order to use Gladia, you need to `pip install pipecat-ai[gladia]`.") raise Exception(f"Missing module: {e}") @@ -138,133 +136,18 @@ def language_to_gladia_language(language: Language) -> Optional[str]: return result -# Configurations supported by Gladia -# Refer to the docs for more information: -# https://docs.gladia.io/api-reference/v2/live/init +# Deprecation warning for nested InputParams +class _InputParamsDescriptor: + """Descriptor for backward compatibility with deprecation warning.""" - -class LanguageConfig(BaseModel): - """Configuration for language detection and handling. - - Attributes: - languages: List of language codes to use for transcription - code_switching: Whether to auto-detect language changes during transcription - """ - - languages: Optional[List[str]] = None - code_switching: Optional[bool] = None - - -class PreProcessingConfig(BaseModel): - """Configuration for audio pre-processing options. - - Attributes: - audio_enhancer: Whether to apply audio enhancement - speech_threshold: Sensitivity for speech detection (0-1) - """ - - audio_enhancer: Optional[bool] = None - speech_threshold: Optional[float] = None - - -class CustomVocabularyItem(BaseModel): - """Represents a custom vocabulary item with an intensity value. - - Attributes: - value: The vocabulary word or phrase - intensity: The bias intensity for this vocabulary item (0-1) - """ - - value: str - intensity: float - - -class CustomVocabularyConfig(BaseModel): - """Configuration for custom vocabulary. - - Attributes: - vocabulary: List of words/phrases or CustomVocabularyItem objects - default_intensity: Default intensity for simple string vocabulary items - """ - - vocabulary: Optional[List[Union[str, CustomVocabularyItem]]] = None - default_intensity: Optional[float] = None - - -class CustomSpellingConfig(BaseModel): - """Configuration for custom spelling rules. - - Attributes: - spelling_dictionary: Mapping of correct spellings to phonetic variations - """ - - spelling_dictionary: Optional[Dict[str, List[str]]] = None - - -class TranslationConfig(BaseModel): - """Configuration for real-time translation. - - Attributes: - target_languages: List of target language codes for translation - model: Translation model to use ("base" or "enhanced") - match_original_utterances: Whether to align translations with original utterances - """ - - target_languages: Optional[List[str]] = None - model: Optional[str] = None - match_original_utterances: Optional[bool] = None - - -class RealtimeProcessingConfig(BaseModel): - """Configuration for real-time processing features. - - Attributes: - words_accurate_timestamps: Whether to provide per-word timestamps - custom_vocabulary: Whether to enable custom vocabulary - custom_vocabulary_config: Custom vocabulary configuration - custom_spelling: Whether to enable custom spelling - custom_spelling_config: Custom spelling configuration - translation: Whether to enable translation - translation_config: Translation configuration - named_entity_recognition: Whether to enable named entity recognition - sentiment_analysis: Whether to enable sentiment analysis - """ - - words_accurate_timestamps: Optional[bool] = None - custom_vocabulary: Optional[bool] = None - custom_vocabulary_config: Optional[CustomVocabularyConfig] = None - custom_spelling: Optional[bool] = None - custom_spelling_config: Optional[CustomSpellingConfig] = None - translation: Optional[bool] = None - translation_config: Optional[TranslationConfig] = None - named_entity_recognition: Optional[bool] = None - sentiment_analysis: Optional[bool] = None - - -class MessagesConfig(BaseModel): - """Configuration for controlling which message types are sent via WebSocket. - - Attributes: - receive_partial_transcripts: Whether to receive intermediate transcription results - receive_final_transcripts: Whether to receive final transcription results - receive_speech_events: Whether to receive speech begin/end events - receive_pre_processing_events: Whether to receive pre-processing events - receive_realtime_processing_events: Whether to receive real-time processing events - receive_post_processing_events: Whether to receive post-processing events - receive_acknowledgments: Whether to receive acknowledgment messages - receive_errors: Whether to receive error messages - receive_lifecycle_events: Whether to receive lifecycle events - """ - - receive_partial_transcripts: Optional[bool] = None - receive_final_transcripts: Optional[bool] = None - receive_speech_events: Optional[bool] = None - receive_pre_processing_events: Optional[bool] = None - receive_realtime_processing_events: Optional[bool] = None - receive_post_processing_events: Optional[bool] = None - receive_acknowledgments: Optional[bool] = None - receive_errors: Optional[bool] = None - receive_lifecycle_events: Optional[bool] = None + def __get__(self, obj, objtype=None): + warnings.warn( + "GladiaSTTService.InputParams is deprecated and will be removed in a future version. " + "Import and use GladiaInputParams directly instead.", + DeprecationWarning, + stacklevel=2, + ) + return GladiaInputParams class GladiaSTTService(STTService): @@ -276,34 +159,8 @@ class GladiaSTTService(STTService): For complete API documentation, see: https://docs.gladia.io/api-reference/v2/live/init """ - class InputParams(BaseModel): - """Configuration parameters for the Gladia STT service. - - Attributes: - encoding: Audio encoding format - bit_depth: Audio bit depth - channels: Number of audio channels - custom_metadata: Additional metadata to include with requests - endpointing: Silence duration in seconds to mark end of speech - maximum_duration_without_endpointing: Maximum utterance duration without silence - language: DEPRECATED - Use language_config instead - language_config: Detailed language configuration - pre_processing: Audio pre-processing options - realtime_processing: Real-time processing features - messages_config: WebSocket message filtering options - """ - - encoding: Optional[str] = "wav/pcm" - bit_depth: Optional[int] = 16 - channels: Optional[int] = 1 - custom_metadata: Optional[Dict[str, Any]] = None - endpointing: Optional[float] = None - maximum_duration_without_endpointing: Optional[int] = None - language: Optional[Language] = None # Deprecated - language_config: Optional[LanguageConfig] = None - pre_processing: Optional[PreProcessingConfig] = None - realtime_processing: Optional[RealtimeProcessingConfig] = None - messages_config: Optional[MessagesConfig] = None + # Maintain backward compatibility + InputParams = _InputParamsDescriptor() def __init__( self, @@ -313,7 +170,7 @@ class GladiaSTTService(STTService): confidence: float = 0.5, sample_rate: Optional[int] = None, model: str = "fast", - params: InputParams = InputParams(), + params: GladiaInputParams = GladiaInputParams(), **kwargs, ): """Initialize the Gladia STT service. @@ -373,7 +230,7 @@ class GladiaSTTService(STTService): # Add language configuration (prioritize language_config over deprecated language) if self._params.language_config: - settings["language_config"] = self._params.language_config.dict(exclude_none=True) + settings["language_config"] = self._params.language_config.model_dump(exclude_none=True) elif self._params.language: # Backward compatibility for deprecated parameter language_code = self.language_to_service_language(self._params.language) if language_code: @@ -384,17 +241,17 @@ class GladiaSTTService(STTService): # Add pre_processing configuration if provided if self._params.pre_processing: - settings["pre_processing"] = self._params.pre_processing.dict(exclude_none=True) + settings["pre_processing"] = self._params.pre_processing.model_dump(exclude_none=True) # Add realtime_processing configuration if provided if self._params.realtime_processing: - settings["realtime_processing"] = self._params.realtime_processing.dict( + settings["realtime_processing"] = self._params.realtime_processing.model_dump( exclude_none=True ) # Add messages_config if provided if self._params.messages_config: - settings["messages_config"] = self._params.messages_config.dict(exclude_none=True) + settings["messages_config"] = self._params.messages_config.model_dump(exclude_none=True) return settings @@ -445,10 +302,13 @@ class GladiaSTTService(STTService): if response.ok: return await response.json() else: + error_text = await response.text() logger.error( - f"Gladia error: {response.status}: {response.text or response.reason}" + f"Gladia error: {response.status}: {error_text or response.reason}" + ) + raise Exception( + f"Failed to initialize Gladia session: {response.status} - {error_text}" ) - raise Exception(f"Failed to initialize Gladia session: {response.status}") async def _send_audio(self, audio: bytes): data = base64.b64encode(audio).decode("utf-8") @@ -460,18 +320,24 @@ class GladiaSTTService(STTService): await self._websocket.send(json.dumps({"type": "stop_recording"})) async def _receive_task_handler(self): - async for message in self._websocket: - content = json.loads(message) - if content["type"] == "transcript": - utterance = content["data"]["utterance"] - confidence = utterance.get("confidence", 0) - transcript = utterance["text"] - if confidence >= self._confidence: - if content["data"]["is_final"]: - await self.push_frame( - TranscriptionFrame(transcript, "", time_now_iso8601()) - ) - else: - await self.push_frame( - InterimTranscriptionFrame(transcript, "", time_now_iso8601()) - ) + try: + async for message in self._websocket: + content = json.loads(message) + if content["type"] == "transcript": + utterance = content["data"]["utterance"] + confidence = utterance.get("confidence", 0) + transcript = utterance["text"] + if confidence >= self._confidence: + if content["data"]["is_final"]: + await self.push_frame( + TranscriptionFrame(transcript, "", time_now_iso8601()) + ) + else: + await self.push_frame( + InterimTranscriptionFrame(transcript, "", time_now_iso8601()) + ) + except websockets.exceptions.ConnectionClosed: + # Expected when closing the connection + pass + except Exception as e: + logger.error(f"Error in Gladia WebSocket handler: {e}")