Reorganize into a directory

This commit is contained in:
Mark Backman
2025-03-28 13:33:15 -04:00
parent 05d53bc66f
commit 8a12470efd
4 changed files with 235 additions and 184 deletions

View File

@@ -18,9 +18,11 @@ from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.gladia import GladiaSTTService
from pipecat.services.openai import OpenAILLMService
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.gladia.config import GladiaInputParams, LanguageConfig
from pipecat.services.gladia.stt import GladiaSTTService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transcriptions.language import Language
from pipecat.transports.services.daily import DailyParams, DailyTransport
load_dotenv(override=True)
@@ -47,6 +49,11 @@ async def main():
stt = GladiaSTTService(
api_key=os.getenv("GLADIA_API_KEY"),
params=GladiaInputParams(
language_config=LanguageConfig(
languages=[Language.EN],
)
),
)
tts = CartesiaTTSService(

View File

@@ -0,0 +1,13 @@
#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import sys
from pipecat.services import DeprecatedModuleProxy
from .stt import *
# Replace this package's entry in sys.modules with a DeprecatedModuleProxy that
# is handed this module's globals (including the names star-imported from .stt).
# NOTE(review): presumably the proxy emits a deprecation warning steering users
# from "gladia" to "gladia.stt" on attribute access — confirm against
# DeprecatedModuleProxy in pipecat.services.
sys.modules[__name__] = DeprecatedModuleProxy(globals(), "gladia", "gladia.stt")

View File

@@ -0,0 +1,165 @@
#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
from typing import Any, Dict, List, Optional, Union
from pydantic import BaseModel
from pipecat.transcriptions.language import Language
class LanguageConfig(BaseModel):
    """Language selection settings for a transcription session.

    Both fields are optional and default to ``None`` (unset).

    Attributes:
        languages: Language codes the transcription should use.
        code_switching: When set, whether language changes are auto-detected
            while transcribing.
    """

    languages: Optional[List[str]] = None
    code_switching: Optional[bool] = None
class PreProcessingConfig(BaseModel):
    """Audio pre-processing settings.

    Both fields are optional and default to ``None`` (unset).

    Attributes:
        audio_enhancer: Whether audio enhancement is applied.
        speech_threshold: Speech-detection sensitivity, expressed in the
            range 0-1.
    """

    audio_enhancer: Optional[bool] = None
    speech_threshold: Optional[float] = None
class CustomVocabularyItem(BaseModel):
    """A single custom-vocabulary entry paired with its bias intensity.

    Both fields are required.

    Attributes:
        value: The word or phrase to add to the vocabulary.
        intensity: Bias intensity applied to this entry, expressed in the
            range 0-1.
    """

    value: str
    intensity: float
class CustomVocabularyConfig(BaseModel):
    """Custom-vocabulary settings.

    Both fields are optional and default to ``None`` (unset).

    Attributes:
        vocabulary: Entries given either as plain strings or as
            ``CustomVocabularyItem`` objects.
        default_intensity: Intensity used for entries supplied as plain
            strings.
    """

    vocabulary: Optional[List[Union[str, CustomVocabularyItem]]] = None
    default_intensity: Optional[float] = None
class CustomSpellingConfig(BaseModel):
    """Custom-spelling rules.

    Attributes:
        spelling_dictionary: Maps each correct spelling to the list of
            phonetic variations it should replace. Defaults to ``None``.
    """

    spelling_dictionary: Optional[Dict[str, List[str]]] = None
class TranslationConfig(BaseModel):
    """Real-time translation settings.

    All fields are optional and default to ``None`` (unset).

    Attributes:
        target_languages: Language codes to translate into.
        model: Name of the translation model ("base" or "enhanced").
        match_original_utterances: Whether translations are aligned with the
            original utterances.
    """

    target_languages: Optional[List[str]] = None
    model: Optional[str] = None
    match_original_utterances: Optional[bool] = None
class RealtimeProcessingConfig(BaseModel):
    """Feature switches and sub-configurations for real-time processing.

    All fields are optional and default to ``None`` (unset). Each feature
    that takes extra options has an on/off flag plus a companion ``*_config``
    field.

    Attributes:
        words_accurate_timestamps: Whether per-word timestamps are provided.
        custom_vocabulary: Whether custom vocabulary is enabled.
        custom_vocabulary_config: Options for custom vocabulary.
        custom_spelling: Whether custom spelling is enabled.
        custom_spelling_config: Options for custom spelling.
        translation: Whether translation is enabled.
        translation_config: Options for translation.
        named_entity_recognition: Whether named entity recognition is enabled.
        sentiment_analysis: Whether sentiment analysis is enabled.
    """

    words_accurate_timestamps: Optional[bool] = None

    custom_vocabulary: Optional[bool] = None
    custom_vocabulary_config: Optional[CustomVocabularyConfig] = None

    custom_spelling: Optional[bool] = None
    custom_spelling_config: Optional[CustomSpellingConfig] = None

    translation: Optional[bool] = None
    translation_config: Optional[TranslationConfig] = None

    named_entity_recognition: Optional[bool] = None
    sentiment_analysis: Optional[bool] = None
class MessagesConfig(BaseModel):
    """Selects which message types are delivered over the WebSocket.

    All fields are optional and default to ``None`` (unset).

    Attributes:
        receive_partial_transcripts: Receive intermediate transcription results.
        receive_final_transcripts: Receive final transcription results.
        receive_speech_events: Receive speech begin/end events.
        receive_pre_processing_events: Receive pre-processing events.
        receive_realtime_processing_events: Receive real-time processing events.
        receive_post_processing_events: Receive post-processing events.
        receive_acknowledgments: Receive acknowledgment messages.
        receive_errors: Receive error messages.
        receive_lifecycle_events: Receive lifecycle events.
    """

    # Transcription results
    receive_partial_transcripts: Optional[bool] = None
    receive_final_transcripts: Optional[bool] = None

    # Event streams
    receive_speech_events: Optional[bool] = None
    receive_pre_processing_events: Optional[bool] = None
    receive_realtime_processing_events: Optional[bool] = None
    receive_post_processing_events: Optional[bool] = None

    # Protocol-level messages
    receive_acknowledgments: Optional[bool] = None
    receive_errors: Optional[bool] = None
    receive_lifecycle_events: Optional[bool] = None
class GladiaInputParams(BaseModel):
    """Input parameters accepted by the Gladia STT service.

    Every field is optional. Audio-format fields and
    ``maximum_duration_without_endpointing`` carry concrete defaults; the
    remaining fields default to ``None`` (unset).

    Attributes:
        encoding: Audio encoding format (default ``"wav/pcm"``).
        bit_depth: Audio bit depth (default ``16``).
        channels: Number of audio channels (default ``1``).
        custom_metadata: Extra metadata to send along with requests.
        endpointing: Seconds of silence that mark the end of speech.
        maximum_duration_without_endpointing: Longest utterance duration
            allowed without silence (default ``10``).
        language: DEPRECATED - use ``language_config`` instead.
        language_config: Detailed language configuration.
        pre_processing: Audio pre-processing options.
        realtime_processing: Real-time processing features.
        messages_config: WebSocket message filtering options.
    """

    # Audio format
    encoding: Optional[str] = "wav/pcm"
    bit_depth: Optional[int] = 16
    channels: Optional[int] = 1

    custom_metadata: Optional[Dict[str, Any]] = None

    # Utterance segmentation
    endpointing: Optional[float] = None
    maximum_duration_without_endpointing: Optional[int] = 10

    # Language selection
    language: Optional[Language] = None  # Deprecated
    language_config: Optional[LanguageConfig] = None

    # Optional feature blocks
    pre_processing: Optional[PreProcessingConfig] = None
    realtime_processing: Optional[RealtimeProcessingConfig] = None
    messages_config: Optional[MessagesConfig] = None

View File

@@ -7,11 +7,10 @@
import base64
import json
import warnings
from typing import Any, AsyncGenerator, Dict, List, Optional, Union
from typing import Any, AsyncGenerator, Dict, Optional
import aiohttp
from loguru import logger
from pydantic import BaseModel
from pipecat.frames.frames import (
CancelFrame,
@@ -22,6 +21,7 @@ from pipecat.frames.frames import (
TranscriptionFrame,
)
from pipecat.services.ai_services import STTService
from pipecat.services.gladia.config import GladiaInputParams
from pipecat.transcriptions.language import Language
from pipecat.utils.time import time_now_iso8601
@@ -29,9 +29,7 @@ try:
import websockets
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error(
"In order to use Gladia, you need to `pip install pipecat-ai[gladia]`. Also, set `GLADIA_API_KEY` environment variable."
)
logger.error("In order to use Gladia, you need to `pip install pipecat-ai[gladia]`.")
raise Exception(f"Missing module: {e}")
@@ -138,133 +136,18 @@ def language_to_gladia_language(language: Language) -> Optional[str]:
return result
# Configurations supported by Gladia
# Refer to the docs for more information:
# https://docs.gladia.io/api-reference/v2/live/init
# Deprecation warning for nested InputParams
class _InputParamsDescriptor:
"""Descriptor for backward compatibility with deprecation warning."""
class LanguageConfig(BaseModel):
"""Configuration for language detection and handling.
Attributes:
languages: List of language codes to use for transcription
code_switching: Whether to auto-detect language changes during transcription
"""
languages: Optional[List[str]] = None
code_switching: Optional[bool] = None
class PreProcessingConfig(BaseModel):
"""Configuration for audio pre-processing options.
Attributes:
audio_enhancer: Whether to apply audio enhancement
speech_threshold: Sensitivity for speech detection (0-1)
"""
audio_enhancer: Optional[bool] = None
speech_threshold: Optional[float] = None
class CustomVocabularyItem(BaseModel):
"""Represents a custom vocabulary item with an intensity value.
Attributes:
value: The vocabulary word or phrase
intensity: The bias intensity for this vocabulary item (0-1)
"""
value: str
intensity: float
class CustomVocabularyConfig(BaseModel):
"""Configuration for custom vocabulary.
Attributes:
vocabulary: List of words/phrases or CustomVocabularyItem objects
default_intensity: Default intensity for simple string vocabulary items
"""
vocabulary: Optional[List[Union[str, CustomVocabularyItem]]] = None
default_intensity: Optional[float] = None
class CustomSpellingConfig(BaseModel):
"""Configuration for custom spelling rules.
Attributes:
spelling_dictionary: Mapping of correct spellings to phonetic variations
"""
spelling_dictionary: Optional[Dict[str, List[str]]] = None
class TranslationConfig(BaseModel):
"""Configuration for real-time translation.
Attributes:
target_languages: List of target language codes for translation
model: Translation model to use ("base" or "enhanced")
match_original_utterances: Whether to align translations with original utterances
"""
target_languages: Optional[List[str]] = None
model: Optional[str] = None
match_original_utterances: Optional[bool] = None
class RealtimeProcessingConfig(BaseModel):
"""Configuration for real-time processing features.
Attributes:
words_accurate_timestamps: Whether to provide per-word timestamps
custom_vocabulary: Whether to enable custom vocabulary
custom_vocabulary_config: Custom vocabulary configuration
custom_spelling: Whether to enable custom spelling
custom_spelling_config: Custom spelling configuration
translation: Whether to enable translation
translation_config: Translation configuration
named_entity_recognition: Whether to enable named entity recognition
sentiment_analysis: Whether to enable sentiment analysis
"""
words_accurate_timestamps: Optional[bool] = None
custom_vocabulary: Optional[bool] = None
custom_vocabulary_config: Optional[CustomVocabularyConfig] = None
custom_spelling: Optional[bool] = None
custom_spelling_config: Optional[CustomSpellingConfig] = None
translation: Optional[bool] = None
translation_config: Optional[TranslationConfig] = None
named_entity_recognition: Optional[bool] = None
sentiment_analysis: Optional[bool] = None
class MessagesConfig(BaseModel):
"""Configuration for controlling which message types are sent via WebSocket.
Attributes:
receive_partial_transcripts: Whether to receive intermediate transcription results
receive_final_transcripts: Whether to receive final transcription results
receive_speech_events: Whether to receive speech begin/end events
receive_pre_processing_events: Whether to receive pre-processing events
receive_realtime_processing_events: Whether to receive real-time processing events
receive_post_processing_events: Whether to receive post-processing events
receive_acknowledgments: Whether to receive acknowledgment messages
receive_errors: Whether to receive error messages
receive_lifecycle_events: Whether to receive lifecycle events
"""
receive_partial_transcripts: Optional[bool] = None
receive_final_transcripts: Optional[bool] = None
receive_speech_events: Optional[bool] = None
receive_pre_processing_events: Optional[bool] = None
receive_realtime_processing_events: Optional[bool] = None
receive_post_processing_events: Optional[bool] = None
receive_acknowledgments: Optional[bool] = None
receive_errors: Optional[bool] = None
receive_lifecycle_events: Optional[bool] = None
def __get__(self, obj, objtype=None):
warnings.warn(
"GladiaSTTService.InputParams is deprecated and will be removed in a future version. "
"Import and use GladiaInputParams directly instead.",
DeprecationWarning,
stacklevel=2,
)
return GladiaInputParams
class GladiaSTTService(STTService):
@@ -276,34 +159,8 @@ class GladiaSTTService(STTService):
For complete API documentation, see: https://docs.gladia.io/api-reference/v2/live/init
"""
class InputParams(BaseModel):
"""Configuration parameters for the Gladia STT service.
Attributes:
encoding: Audio encoding format
bit_depth: Audio bit depth
channels: Number of audio channels
custom_metadata: Additional metadata to include with requests
endpointing: Silence duration in seconds to mark end of speech
maximum_duration_without_endpointing: Maximum utterance duration without silence
language: DEPRECATED - Use language_config instead
language_config: Detailed language configuration
pre_processing: Audio pre-processing options
realtime_processing: Real-time processing features
messages_config: WebSocket message filtering options
"""
encoding: Optional[str] = "wav/pcm"
bit_depth: Optional[int] = 16
channels: Optional[int] = 1
custom_metadata: Optional[Dict[str, Any]] = None
endpointing: Optional[float] = None
maximum_duration_without_endpointing: Optional[int] = None
language: Optional[Language] = None # Deprecated
language_config: Optional[LanguageConfig] = None
pre_processing: Optional[PreProcessingConfig] = None
realtime_processing: Optional[RealtimeProcessingConfig] = None
messages_config: Optional[MessagesConfig] = None
# Maintain backward compatibility
InputParams = _InputParamsDescriptor()
def __init__(
self,
@@ -313,7 +170,7 @@ class GladiaSTTService(STTService):
confidence: float = 0.5,
sample_rate: Optional[int] = None,
model: str = "fast",
params: InputParams = InputParams(),
params: GladiaInputParams = GladiaInputParams(),
**kwargs,
):
"""Initialize the Gladia STT service.
@@ -373,7 +230,7 @@ class GladiaSTTService(STTService):
# Add language configuration (prioritize language_config over deprecated language)
if self._params.language_config:
settings["language_config"] = self._params.language_config.dict(exclude_none=True)
settings["language_config"] = self._params.language_config.model_dump(exclude_none=True)
elif self._params.language: # Backward compatibility for deprecated parameter
language_code = self.language_to_service_language(self._params.language)
if language_code:
@@ -384,17 +241,17 @@ class GladiaSTTService(STTService):
# Add pre_processing configuration if provided
if self._params.pre_processing:
settings["pre_processing"] = self._params.pre_processing.dict(exclude_none=True)
settings["pre_processing"] = self._params.pre_processing.model_dump(exclude_none=True)
# Add realtime_processing configuration if provided
if self._params.realtime_processing:
settings["realtime_processing"] = self._params.realtime_processing.dict(
settings["realtime_processing"] = self._params.realtime_processing.model_dump(
exclude_none=True
)
# Add messages_config if provided
if self._params.messages_config:
settings["messages_config"] = self._params.messages_config.dict(exclude_none=True)
settings["messages_config"] = self._params.messages_config.model_dump(exclude_none=True)
return settings
@@ -445,10 +302,13 @@ class GladiaSTTService(STTService):
if response.ok:
return await response.json()
else:
error_text = await response.text()
logger.error(
f"Gladia error: {response.status}: {response.text or response.reason}"
f"Gladia error: {response.status}: {error_text or response.reason}"
)
raise Exception(
f"Failed to initialize Gladia session: {response.status} - {error_text}"
)
raise Exception(f"Failed to initialize Gladia session: {response.status}")
async def _send_audio(self, audio: bytes):
data = base64.b64encode(audio).decode("utf-8")
@@ -460,18 +320,24 @@ class GladiaSTTService(STTService):
await self._websocket.send(json.dumps({"type": "stop_recording"}))
async def _receive_task_handler(self):
async for message in self._websocket:
content = json.loads(message)
if content["type"] == "transcript":
utterance = content["data"]["utterance"]
confidence = utterance.get("confidence", 0)
transcript = utterance["text"]
if confidence >= self._confidence:
if content["data"]["is_final"]:
await self.push_frame(
TranscriptionFrame(transcript, "", time_now_iso8601())
)
else:
await self.push_frame(
InterimTranscriptionFrame(transcript, "", time_now_iso8601())
)
try:
async for message in self._websocket:
content = json.loads(message)
if content["type"] == "transcript":
utterance = content["data"]["utterance"]
confidence = utterance.get("confidence", 0)
transcript = utterance["text"]
if confidence >= self._confidence:
if content["data"]["is_final"]:
await self.push_frame(
TranscriptionFrame(transcript, "", time_now_iso8601())
)
else:
await self.push_frame(
InterimTranscriptionFrame(transcript, "", time_now_iso8601())
)
except websockets.exceptions.ConnectionClosed:
# Expected when closing the connection
pass
except Exception as e:
logger.error(f"Error in Gladia WebSocket handler: {e}")