Reorganize into a directory

2025-03-28 13:33:15 -04:00
parent 05d53bc66f
commit 8a12470efd
4 changed files with 235 additions and 184 deletions
--- a/examples/foundational/07j-interruptible-gladia.py
+++ b/examples/foundational/07j-interruptible-gladia.py
@@ -18,9 +18,11 @@ from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
-from pipecat.services.cartesia import CartesiaTTSService
-from pipecat.services.gladia import GladiaSTTService
-from pipecat.services.openai import OpenAILLMService
+from pipecat.services.cartesia.tts import CartesiaTTSService
+from pipecat.services.gladia.config import GladiaInputParams, LanguageConfig
+from pipecat.services.gladia.stt import GladiaSTTService
+from pipecat.services.openai.llm import OpenAILLMService
+from pipecat.transcriptions.language import Language
 from pipecat.transports.services.daily import DailyParams, DailyTransport

 load_dotenv(override=True)
@@ -47,6 +49,11 @@ async def main():

        stt = GladiaSTTService(
            api_key=os.getenv("GLADIA_API_KEY"),
+            params=GladiaInputParams(
+                language_config=LanguageConfig(
+                    languages=[Language.EN],
+                )
+            ),
        )

        tts = CartesiaTTSService(
--- a/src/pipecat/services/gladia/init.py
+++ b/src/pipecat/services/gladia/init.py
@@ -0,0 +1,13 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import sys
+
+from pipecat.services import DeprecatedModuleProxy
+
+from .stt import *
+
+sys.modules[__name__] = DeprecatedModuleProxy(globals(), "gladia", "gladia.stt")
--- a/src/pipecat/services/gladia/config.py
+++ b/src/pipecat/services/gladia/config.py
@@ -0,0 +1,165 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+from typing import Any, Dict, List, Optional, Union
+
+from pydantic import BaseModel
+
+from pipecat.transcriptions.language import Language
+
+
+class LanguageConfig(BaseModel):
+    """Configuration for language detection and handling.
+
+    Attributes:
+        languages: List of language codes to use for transcription
+        code_switching: Whether to auto-detect language changes during transcription
+    """
+
+    languages: Optional[List[str]] = None
+    code_switching: Optional[bool] = None
+
+
+class PreProcessingConfig(BaseModel):
+    """Configuration for audio pre-processing options.
+
+    Attributes:
+        audio_enhancer: Whether to apply audio enhancement
+        speech_threshold: Sensitivity for speech detection (0-1)
+    """
+
+    audio_enhancer: Optional[bool] = None
+    speech_threshold: Optional[float] = None
+
+
+class CustomVocabularyItem(BaseModel):
+    """Represents a custom vocabulary item with an intensity value.
+
+    Attributes:
+        value: The vocabulary word or phrase
+        intensity: The bias intensity for this vocabulary item (0-1)
+    """
+
+    value: str
+    intensity: float
+
+
+class CustomVocabularyConfig(BaseModel):
+    """Configuration for custom vocabulary.
+
+    Attributes:
+        vocabulary: List of words/phrases or CustomVocabularyItem objects
+        default_intensity: Default intensity for simple string vocabulary items
+    """
+
+    vocabulary: Optional[List[Union[str, CustomVocabularyItem]]] = None
+    default_intensity: Optional[float] = None
+
+
+class CustomSpellingConfig(BaseModel):
+    """Configuration for custom spelling rules.
+
+    Attributes:
+        spelling_dictionary: Mapping of correct spellings to phonetic variations
+    """
+
+    spelling_dictionary: Optional[Dict[str, List[str]]] = None
+
+
+class TranslationConfig(BaseModel):
+    """Configuration for real-time translation.
+
+    Attributes:
+        target_languages: List of target language codes for translation
+        model: Translation model to use ("base" or "enhanced")
+        match_original_utterances: Whether to align translations with original utterances
+    """
+
+    target_languages: Optional[List[str]] = None
+    model: Optional[str] = None
+    match_original_utterances: Optional[bool] = None
+
+
+class RealtimeProcessingConfig(BaseModel):
+    """Configuration for real-time processing features.
+
+    Attributes:
+        words_accurate_timestamps: Whether to provide per-word timestamps
+        custom_vocabulary: Whether to enable custom vocabulary
+        custom_vocabulary_config: Custom vocabulary configuration
+        custom_spelling: Whether to enable custom spelling
+        custom_spelling_config: Custom spelling configuration
+        translation: Whether to enable translation
+        translation_config: Translation configuration
+        named_entity_recognition: Whether to enable named entity recognition
+        sentiment_analysis: Whether to enable sentiment analysis
+    """
+
+    words_accurate_timestamps: Optional[bool] = None
+    custom_vocabulary: Optional[bool] = None
+    custom_vocabulary_config: Optional[CustomVocabularyConfig] = None
+    custom_spelling: Optional[bool] = None
+    custom_spelling_config: Optional[CustomSpellingConfig] = None
+    translation: Optional[bool] = None
+    translation_config: Optional[TranslationConfig] = None
+    named_entity_recognition: Optional[bool] = None
+    sentiment_analysis: Optional[bool] = None
+
+
+class MessagesConfig(BaseModel):
+    """Configuration for controlling which message types are sent via WebSocket.
+
+    Attributes:
+        receive_partial_transcripts: Whether to receive intermediate transcription results
+        receive_final_transcripts: Whether to receive final transcription results
+        receive_speech_events: Whether to receive speech begin/end events
+        receive_pre_processing_events: Whether to receive pre-processing events
+        receive_realtime_processing_events: Whether to receive real-time processing events
+        receive_post_processing_events: Whether to receive post-processing events
+        receive_acknowledgments: Whether to receive acknowledgment messages
+        receive_errors: Whether to receive error messages
+        receive_lifecycle_events: Whether to receive lifecycle events
+    """
+
+    receive_partial_transcripts: Optional[bool] = None
+    receive_final_transcripts: Optional[bool] = None
+    receive_speech_events: Optional[bool] = None
+    receive_pre_processing_events: Optional[bool] = None
+    receive_realtime_processing_events: Optional[bool] = None
+    receive_post_processing_events: Optional[bool] = None
+    receive_acknowledgments: Optional[bool] = None
+    receive_errors: Optional[bool] = None
+    receive_lifecycle_events: Optional[bool] = None
+
+
+class GladiaInputParams(BaseModel):
+    """Configuration parameters for the Gladia STT service.
+
+    Attributes:
+        encoding: Audio encoding format
+        bit_depth: Audio bit depth
+        channels: Number of audio channels
+        custom_metadata: Additional metadata to include with requests
+        endpointing: Silence duration in seconds to mark end of speech
+        maximum_duration_without_endpointing: Maximum utterance duration without silence
+        language: DEPRECATED - Use language_config instead
+        language_config: Detailed language configuration
+        pre_processing: Audio pre-processing options
+        realtime_processing: Real-time processing features
+        messages_config: WebSocket message filtering options
+    """
+
+    encoding: Optional[str] = "wav/pcm"
+    bit_depth: Optional[int] = 16
+    channels: Optional[int] = 1
+    custom_metadata: Optional[Dict[str, Any]] = None
+    endpointing: Optional[float] = None
+    maximum_duration_without_endpointing: Optional[int] = 10
+    language: Optional[Language] = None  # Deprecated
+    language_config: Optional[LanguageConfig] = None
+    pre_processing: Optional[PreProcessingConfig] = None
+    realtime_processing: Optional[RealtimeProcessingConfig] = None
+    messages_config: Optional[MessagesConfig] = None
--- a/src/pipecat/services/gladia/stt.py
+++ b/src/pipecat/services/gladia/stt.py
@@ -7,11 +7,10 @@
 import base64
 import json
 import warnings
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import Any, AsyncGenerator, Dict, Optional

 import aiohttp
 from loguru import logger
-from pydantic import BaseModel

 from pipecat.frames.frames import (
    CancelFrame,
@@ -22,6 +21,7 @@ from pipecat.frames.frames import (
    TranscriptionFrame,
 )
 from pipecat.services.ai_services import STTService
+from pipecat.services.gladia.config import GladiaInputParams
 from pipecat.transcriptions.language import Language
 from pipecat.utils.time import time_now_iso8601

@@ -29,9 +29,7 @@ try:
    import websockets
 except ModuleNotFoundError as e:
    logger.error(f"Exception: {e}")
-    logger.error(
-        "In order to use Gladia, you need to `pip install pipecat-ai[gladia]`. Also, set `GLADIA_API_KEY` environment variable."
-    )
+    logger.error("In order to use Gladia, you need to `pip install pipecat-ai[gladia]`.")
    raise Exception(f"Missing module: {e}")


@@ -138,133 +136,18 @@ def language_to_gladia_language(language: Language) -> Optional[str]:
    return result


-# Configurations supported by Gladia
-# Refer to the docs for more information:
-# https://docs.gladia.io/api-reference/v2/live/init
+# Deprecation warning for nested InputParams
+class _InputParamsDescriptor:
+    """Descriptor for backward compatibility with deprecation warning."""

-
-class LanguageConfig(BaseModel):
-    """Configuration for language detection and handling.
-
-    Attributes:
-        languages: List of language codes to use for transcription
-        code_switching: Whether to auto-detect language changes during transcription
-    """
-
-    languages: Optional[List[str]] = None
-    code_switching: Optional[bool] = None
-
-
-class PreProcessingConfig(BaseModel):
-    """Configuration for audio pre-processing options.
-
-    Attributes:
-        audio_enhancer: Whether to apply audio enhancement
-        speech_threshold: Sensitivity for speech detection (0-1)
-    """
-
-    audio_enhancer: Optional[bool] = None
-    speech_threshold: Optional[float] = None
-
-
-class CustomVocabularyItem(BaseModel):
-    """Represents a custom vocabulary item with an intensity value.
-
-    Attributes:
-        value: The vocabulary word or phrase
-        intensity: The bias intensity for this vocabulary item (0-1)
-    """
-
-    value: str
-    intensity: float
-
-
-class CustomVocabularyConfig(BaseModel):
-    """Configuration for custom vocabulary.
-
-    Attributes:
-        vocabulary: List of words/phrases or CustomVocabularyItem objects
-        default_intensity: Default intensity for simple string vocabulary items
-    """
-
-    vocabulary: Optional[List[Union[str, CustomVocabularyItem]]] = None
-    default_intensity: Optional[float] = None
-
-
-class CustomSpellingConfig(BaseModel):
-    """Configuration for custom spelling rules.
-
-    Attributes:
-        spelling_dictionary: Mapping of correct spellings to phonetic variations
-    """
-
-    spelling_dictionary: Optional[Dict[str, List[str]]] = None
-
-
-class TranslationConfig(BaseModel):
-    """Configuration for real-time translation.
-
-    Attributes:
-        target_languages: List of target language codes for translation
-        model: Translation model to use ("base" or "enhanced")
-        match_original_utterances: Whether to align translations with original utterances
-    """
-
-    target_languages: Optional[List[str]] = None
-    model: Optional[str] = None
-    match_original_utterances: Optional[bool] = None
-
-
-class RealtimeProcessingConfig(BaseModel):
-    """Configuration for real-time processing features.
-
-    Attributes:
-        words_accurate_timestamps: Whether to provide per-word timestamps
-        custom_vocabulary: Whether to enable custom vocabulary
-        custom_vocabulary_config: Custom vocabulary configuration
-        custom_spelling: Whether to enable custom spelling
-        custom_spelling_config: Custom spelling configuration
-        translation: Whether to enable translation
-        translation_config: Translation configuration
-        named_entity_recognition: Whether to enable named entity recognition
-        sentiment_analysis: Whether to enable sentiment analysis
-    """
-
-    words_accurate_timestamps: Optional[bool] = None
-    custom_vocabulary: Optional[bool] = None
-    custom_vocabulary_config: Optional[CustomVocabularyConfig] = None
-    custom_spelling: Optional[bool] = None
-    custom_spelling_config: Optional[CustomSpellingConfig] = None
-    translation: Optional[bool] = None
-    translation_config: Optional[TranslationConfig] = None
-    named_entity_recognition: Optional[bool] = None
-    sentiment_analysis: Optional[bool] = None
-
-
-class MessagesConfig(BaseModel):
-    """Configuration for controlling which message types are sent via WebSocket.
-
-    Attributes:
-        receive_partial_transcripts: Whether to receive intermediate transcription results
-        receive_final_transcripts: Whether to receive final transcription results
-        receive_speech_events: Whether to receive speech begin/end events
-        receive_pre_processing_events: Whether to receive pre-processing events
-        receive_realtime_processing_events: Whether to receive real-time processing events
-        receive_post_processing_events: Whether to receive post-processing events
-        receive_acknowledgments: Whether to receive acknowledgment messages
-        receive_errors: Whether to receive error messages
-        receive_lifecycle_events: Whether to receive lifecycle events
-    """
-
-    receive_partial_transcripts: Optional[bool] = None
-    receive_final_transcripts: Optional[bool] = None
-    receive_speech_events: Optional[bool] = None
-    receive_pre_processing_events: Optional[bool] = None
-    receive_realtime_processing_events: Optional[bool] = None
-    receive_post_processing_events: Optional[bool] = None
-    receive_acknowledgments: Optional[bool] = None
-    receive_errors: Optional[bool] = None
-    receive_lifecycle_events: Optional[bool] = None
+    def __get__(self, obj, objtype=None):
+        warnings.warn(
+            "GladiaSTTService.InputParams is deprecated and will be removed in a future version. "
+            "Import and use GladiaInputParams directly instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        return GladiaInputParams


 class GladiaSTTService(STTService):
@@ -276,34 +159,8 @@ class GladiaSTTService(STTService):
    For complete API documentation, see: https://docs.gladia.io/api-reference/v2/live/init
    """

-    class InputParams(BaseModel):
-        """Configuration parameters for the Gladia STT service.
-
-        Attributes:
-            encoding: Audio encoding format
-            bit_depth: Audio bit depth
-            channels: Number of audio channels
-            custom_metadata: Additional metadata to include with requests
-            endpointing: Silence duration in seconds to mark end of speech
-            maximum_duration_without_endpointing: Maximum utterance duration without silence
-            language: DEPRECATED - Use language_config instead
-            language_config: Detailed language configuration
-            pre_processing: Audio pre-processing options
-            realtime_processing: Real-time processing features
-            messages_config: WebSocket message filtering options
-        """
-
-        encoding: Optional[str] = "wav/pcm"
-        bit_depth: Optional[int] = 16
-        channels: Optional[int] = 1
-        custom_metadata: Optional[Dict[str, Any]] = None
-        endpointing: Optional[float] = None
-        maximum_duration_without_endpointing: Optional[int] = None
-        language: Optional[Language] = None  # Deprecated
-        language_config: Optional[LanguageConfig] = None
-        pre_processing: Optional[PreProcessingConfig] = None
-        realtime_processing: Optional[RealtimeProcessingConfig] = None
-        messages_config: Optional[MessagesConfig] = None
+    # Maintain backward compatibility
+    InputParams = _InputParamsDescriptor()

    def __init__(
        self,
@@ -313,7 +170,7 @@ class GladiaSTTService(STTService):
        confidence: float = 0.5,
        sample_rate: Optional[int] = None,
        model: str = "fast",
-        params: InputParams = InputParams(),
+        params: GladiaInputParams = GladiaInputParams(),
        **kwargs,
    ):
        """Initialize the Gladia STT service.
@@ -373,7 +230,7 @@ class GladiaSTTService(STTService):

        # Add language configuration (prioritize language_config over deprecated language)
        if self._params.language_config:
-            settings["language_config"] = self._params.language_config.dict(exclude_none=True)
+            settings["language_config"] = self._params.language_config.model_dump(exclude_none=True)
        elif self._params.language:  # Backward compatibility for deprecated parameter
            language_code = self.language_to_service_language(self._params.language)
            if language_code:
@@ -384,17 +241,17 @@ class GladiaSTTService(STTService):

        # Add pre_processing configuration if provided
        if self._params.pre_processing:
-            settings["pre_processing"] = self._params.pre_processing.dict(exclude_none=True)
+            settings["pre_processing"] = self._params.pre_processing.model_dump(exclude_none=True)

        # Add realtime_processing configuration if provided
        if self._params.realtime_processing:
-            settings["realtime_processing"] = self._params.realtime_processing.dict(
+            settings["realtime_processing"] = self._params.realtime_processing.model_dump(
                exclude_none=True
            )

        # Add messages_config if provided
        if self._params.messages_config:
-            settings["messages_config"] = self._params.messages_config.dict(exclude_none=True)
+            settings["messages_config"] = self._params.messages_config.model_dump(exclude_none=True)

        return settings

@@ -445,10 +302,13 @@ class GladiaSTTService(STTService):
                if response.ok:
                    return await response.json()
                else:
+                    error_text = await response.text()
                    logger.error(
-                        f"Gladia error: {response.status}: {response.text or response.reason}"
+                        f"Gladia error: {response.status}: {error_text or response.reason}"
+                    )
+                    raise Exception(
+                        f"Failed to initialize Gladia session: {response.status} - {error_text}"
                    )
-                    raise Exception(f"Failed to initialize Gladia session: {response.status}")

    async def _send_audio(self, audio: bytes):
        data = base64.b64encode(audio).decode("utf-8")
@@ -460,18 +320,24 @@ class GladiaSTTService(STTService):
            await self._websocket.send(json.dumps({"type": "stop_recording"}))

    async def _receive_task_handler(self):
-        async for message in self._websocket:
-            content = json.loads(message)
-            if content["type"] == "transcript":
-                utterance = content["data"]["utterance"]
-                confidence = utterance.get("confidence", 0)
-                transcript = utterance["text"]
-                if confidence >= self._confidence:
-                    if content["data"]["is_final"]:
-                        await self.push_frame(
-                            TranscriptionFrame(transcript, "", time_now_iso8601())
-                        )
-                    else:
-                        await self.push_frame(
-                            InterimTranscriptionFrame(transcript, "", time_now_iso8601())
-                        )
+        try:
+            async for message in self._websocket:
+                content = json.loads(message)
+                if content["type"] == "transcript":
+                    utterance = content["data"]["utterance"]
+                    confidence = utterance.get("confidence", 0)
+                    transcript = utterance["text"]
+                    if confidence >= self._confidence:
+                        if content["data"]["is_final"]:
+                            await self.push_frame(
+                                TranscriptionFrame(transcript, "", time_now_iso8601())
+                            )
+                        else:
+                            await self.push_frame(
+                                InterimTranscriptionFrame(transcript, "", time_now_iso8601())
+                            )
+        except websockets.exceptions.ConnectionClosed:
+            # Expected when closing the connection
+            pass
+        except Exception as e:
+            logger.error(f"Error in Gladia WebSocket handler: {e}")