Reorganize into a directory
This commit is contained in:
@@ -18,9 +18,11 @@ from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.gladia import GladiaSTTService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.services.cartesia.tts import CartesiaTTSService
|
||||
from pipecat.services.gladia.config import GladiaInputParams, LanguageConfig
|
||||
from pipecat.services.gladia.stt import GladiaSTTService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
|
||||
load_dotenv(override=True)
|
||||
@@ -47,6 +49,11 @@ async def main():
|
||||
|
||||
stt = GladiaSTTService(
|
||||
api_key=os.getenv("GLADIA_API_KEY"),
|
||||
params=GladiaInputParams(
|
||||
language_config=LanguageConfig(
|
||||
languages=[Language.EN],
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
|
||||
13
src/pipecat/services/gladia/__init__.py
Normal file
13
src/pipecat/services/gladia/__init__.py
Normal file
@@ -0,0 +1,13 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import sys
|
||||
|
||||
from pipecat.services import DeprecatedModuleProxy
|
||||
|
||||
from .stt import *
|
||||
|
||||
sys.modules[__name__] = DeprecatedModuleProxy(globals(), "gladia", "gladia.stt")
|
||||
165
src/pipecat/services/gladia/config.py
Normal file
165
src/pipecat/services/gladia/config.py
Normal file
@@ -0,0 +1,165 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.transcriptions.language import Language
|
||||
|
||||
|
||||
class LanguageConfig(BaseModel):
|
||||
"""Configuration for language detection and handling.
|
||||
|
||||
Attributes:
|
||||
languages: List of language codes to use for transcription
|
||||
code_switching: Whether to auto-detect language changes during transcription
|
||||
"""
|
||||
|
||||
languages: Optional[List[str]] = None
|
||||
code_switching: Optional[bool] = None
|
||||
|
||||
|
||||
class PreProcessingConfig(BaseModel):
|
||||
"""Configuration for audio pre-processing options.
|
||||
|
||||
Attributes:
|
||||
audio_enhancer: Whether to apply audio enhancement
|
||||
speech_threshold: Sensitivity for speech detection (0-1)
|
||||
"""
|
||||
|
||||
audio_enhancer: Optional[bool] = None
|
||||
speech_threshold: Optional[float] = None
|
||||
|
||||
|
||||
class CustomVocabularyItem(BaseModel):
|
||||
"""Represents a custom vocabulary item with an intensity value.
|
||||
|
||||
Attributes:
|
||||
value: The vocabulary word or phrase
|
||||
intensity: The bias intensity for this vocabulary item (0-1)
|
||||
"""
|
||||
|
||||
value: str
|
||||
intensity: float
|
||||
|
||||
|
||||
class CustomVocabularyConfig(BaseModel):
|
||||
"""Configuration for custom vocabulary.
|
||||
|
||||
Attributes:
|
||||
vocabulary: List of words/phrases or CustomVocabularyItem objects
|
||||
default_intensity: Default intensity for simple string vocabulary items
|
||||
"""
|
||||
|
||||
vocabulary: Optional[List[Union[str, CustomVocabularyItem]]] = None
|
||||
default_intensity: Optional[float] = None
|
||||
|
||||
|
||||
class CustomSpellingConfig(BaseModel):
|
||||
"""Configuration for custom spelling rules.
|
||||
|
||||
Attributes:
|
||||
spelling_dictionary: Mapping of correct spellings to phonetic variations
|
||||
"""
|
||||
|
||||
spelling_dictionary: Optional[Dict[str, List[str]]] = None
|
||||
|
||||
|
||||
class TranslationConfig(BaseModel):
|
||||
"""Configuration for real-time translation.
|
||||
|
||||
Attributes:
|
||||
target_languages: List of target language codes for translation
|
||||
model: Translation model to use ("base" or "enhanced")
|
||||
match_original_utterances: Whether to align translations with original utterances
|
||||
"""
|
||||
|
||||
target_languages: Optional[List[str]] = None
|
||||
model: Optional[str] = None
|
||||
match_original_utterances: Optional[bool] = None
|
||||
|
||||
|
||||
class RealtimeProcessingConfig(BaseModel):
|
||||
"""Configuration for real-time processing features.
|
||||
|
||||
Attributes:
|
||||
words_accurate_timestamps: Whether to provide per-word timestamps
|
||||
custom_vocabulary: Whether to enable custom vocabulary
|
||||
custom_vocabulary_config: Custom vocabulary configuration
|
||||
custom_spelling: Whether to enable custom spelling
|
||||
custom_spelling_config: Custom spelling configuration
|
||||
translation: Whether to enable translation
|
||||
translation_config: Translation configuration
|
||||
named_entity_recognition: Whether to enable named entity recognition
|
||||
sentiment_analysis: Whether to enable sentiment analysis
|
||||
"""
|
||||
|
||||
words_accurate_timestamps: Optional[bool] = None
|
||||
custom_vocabulary: Optional[bool] = None
|
||||
custom_vocabulary_config: Optional[CustomVocabularyConfig] = None
|
||||
custom_spelling: Optional[bool] = None
|
||||
custom_spelling_config: Optional[CustomSpellingConfig] = None
|
||||
translation: Optional[bool] = None
|
||||
translation_config: Optional[TranslationConfig] = None
|
||||
named_entity_recognition: Optional[bool] = None
|
||||
sentiment_analysis: Optional[bool] = None
|
||||
|
||||
|
||||
class MessagesConfig(BaseModel):
|
||||
"""Configuration for controlling which message types are sent via WebSocket.
|
||||
|
||||
Attributes:
|
||||
receive_partial_transcripts: Whether to receive intermediate transcription results
|
||||
receive_final_transcripts: Whether to receive final transcription results
|
||||
receive_speech_events: Whether to receive speech begin/end events
|
||||
receive_pre_processing_events: Whether to receive pre-processing events
|
||||
receive_realtime_processing_events: Whether to receive real-time processing events
|
||||
receive_post_processing_events: Whether to receive post-processing events
|
||||
receive_acknowledgments: Whether to receive acknowledgment messages
|
||||
receive_errors: Whether to receive error messages
|
||||
receive_lifecycle_events: Whether to receive lifecycle events
|
||||
"""
|
||||
|
||||
receive_partial_transcripts: Optional[bool] = None
|
||||
receive_final_transcripts: Optional[bool] = None
|
||||
receive_speech_events: Optional[bool] = None
|
||||
receive_pre_processing_events: Optional[bool] = None
|
||||
receive_realtime_processing_events: Optional[bool] = None
|
||||
receive_post_processing_events: Optional[bool] = None
|
||||
receive_acknowledgments: Optional[bool] = None
|
||||
receive_errors: Optional[bool] = None
|
||||
receive_lifecycle_events: Optional[bool] = None
|
||||
|
||||
|
||||
class GladiaInputParams(BaseModel):
|
||||
"""Configuration parameters for the Gladia STT service.
|
||||
|
||||
Attributes:
|
||||
encoding: Audio encoding format
|
||||
bit_depth: Audio bit depth
|
||||
channels: Number of audio channels
|
||||
custom_metadata: Additional metadata to include with requests
|
||||
endpointing: Silence duration in seconds to mark end of speech
|
||||
maximum_duration_without_endpointing: Maximum utterance duration without silence
|
||||
language: DEPRECATED - Use language_config instead
|
||||
language_config: Detailed language configuration
|
||||
pre_processing: Audio pre-processing options
|
||||
realtime_processing: Real-time processing features
|
||||
messages_config: WebSocket message filtering options
|
||||
"""
|
||||
|
||||
encoding: Optional[str] = "wav/pcm"
|
||||
bit_depth: Optional[int] = 16
|
||||
channels: Optional[int] = 1
|
||||
custom_metadata: Optional[Dict[str, Any]] = None
|
||||
endpointing: Optional[float] = None
|
||||
maximum_duration_without_endpointing: Optional[int] = 10
|
||||
language: Optional[Language] = None # Deprecated
|
||||
language_config: Optional[LanguageConfig] = None
|
||||
pre_processing: Optional[PreProcessingConfig] = None
|
||||
realtime_processing: Optional[RealtimeProcessingConfig] = None
|
||||
messages_config: Optional[MessagesConfig] = None
|
||||
@@ -7,11 +7,10 @@
|
||||
import base64
|
||||
import json
|
||||
import warnings
|
||||
from typing import Any, AsyncGenerator, Dict, List, Optional, Union
|
||||
from typing import Any, AsyncGenerator, Dict, Optional
|
||||
|
||||
import aiohttp
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
@@ -22,6 +21,7 @@ from pipecat.frames.frames import (
|
||||
TranscriptionFrame,
|
||||
)
|
||||
from pipecat.services.ai_services import STTService
|
||||
from pipecat.services.gladia.config import GladiaInputParams
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
@@ -29,9 +29,7 @@ try:
|
||||
import websockets
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error(
|
||||
"In order to use Gladia, you need to `pip install pipecat-ai[gladia]`. Also, set `GLADIA_API_KEY` environment variable."
|
||||
)
|
||||
logger.error("In order to use Gladia, you need to `pip install pipecat-ai[gladia]`.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
@@ -138,133 +136,18 @@ def language_to_gladia_language(language: Language) -> Optional[str]:
|
||||
return result
|
||||
|
||||
|
||||
# Configurations supported by Gladia
|
||||
# Refer to the docs for more information:
|
||||
# https://docs.gladia.io/api-reference/v2/live/init
|
||||
# Deprecation warning for nested InputParams
|
||||
class _InputParamsDescriptor:
|
||||
"""Descriptor for backward compatibility with deprecation warning."""
|
||||
|
||||
|
||||
class LanguageConfig(BaseModel):
|
||||
"""Configuration for language detection and handling.
|
||||
|
||||
Attributes:
|
||||
languages: List of language codes to use for transcription
|
||||
code_switching: Whether to auto-detect language changes during transcription
|
||||
"""
|
||||
|
||||
languages: Optional[List[str]] = None
|
||||
code_switching: Optional[bool] = None
|
||||
|
||||
|
||||
class PreProcessingConfig(BaseModel):
|
||||
"""Configuration for audio pre-processing options.
|
||||
|
||||
Attributes:
|
||||
audio_enhancer: Whether to apply audio enhancement
|
||||
speech_threshold: Sensitivity for speech detection (0-1)
|
||||
"""
|
||||
|
||||
audio_enhancer: Optional[bool] = None
|
||||
speech_threshold: Optional[float] = None
|
||||
|
||||
|
||||
class CustomVocabularyItem(BaseModel):
|
||||
"""Represents a custom vocabulary item with an intensity value.
|
||||
|
||||
Attributes:
|
||||
value: The vocabulary word or phrase
|
||||
intensity: The bias intensity for this vocabulary item (0-1)
|
||||
"""
|
||||
|
||||
value: str
|
||||
intensity: float
|
||||
|
||||
|
||||
class CustomVocabularyConfig(BaseModel):
|
||||
"""Configuration for custom vocabulary.
|
||||
|
||||
Attributes:
|
||||
vocabulary: List of words/phrases or CustomVocabularyItem objects
|
||||
default_intensity: Default intensity for simple string vocabulary items
|
||||
"""
|
||||
|
||||
vocabulary: Optional[List[Union[str, CustomVocabularyItem]]] = None
|
||||
default_intensity: Optional[float] = None
|
||||
|
||||
|
||||
class CustomSpellingConfig(BaseModel):
|
||||
"""Configuration for custom spelling rules.
|
||||
|
||||
Attributes:
|
||||
spelling_dictionary: Mapping of correct spellings to phonetic variations
|
||||
"""
|
||||
|
||||
spelling_dictionary: Optional[Dict[str, List[str]]] = None
|
||||
|
||||
|
||||
class TranslationConfig(BaseModel):
|
||||
"""Configuration for real-time translation.
|
||||
|
||||
Attributes:
|
||||
target_languages: List of target language codes for translation
|
||||
model: Translation model to use ("base" or "enhanced")
|
||||
match_original_utterances: Whether to align translations with original utterances
|
||||
"""
|
||||
|
||||
target_languages: Optional[List[str]] = None
|
||||
model: Optional[str] = None
|
||||
match_original_utterances: Optional[bool] = None
|
||||
|
||||
|
||||
class RealtimeProcessingConfig(BaseModel):
|
||||
"""Configuration for real-time processing features.
|
||||
|
||||
Attributes:
|
||||
words_accurate_timestamps: Whether to provide per-word timestamps
|
||||
custom_vocabulary: Whether to enable custom vocabulary
|
||||
custom_vocabulary_config: Custom vocabulary configuration
|
||||
custom_spelling: Whether to enable custom spelling
|
||||
custom_spelling_config: Custom spelling configuration
|
||||
translation: Whether to enable translation
|
||||
translation_config: Translation configuration
|
||||
named_entity_recognition: Whether to enable named entity recognition
|
||||
sentiment_analysis: Whether to enable sentiment analysis
|
||||
"""
|
||||
|
||||
words_accurate_timestamps: Optional[bool] = None
|
||||
custom_vocabulary: Optional[bool] = None
|
||||
custom_vocabulary_config: Optional[CustomVocabularyConfig] = None
|
||||
custom_spelling: Optional[bool] = None
|
||||
custom_spelling_config: Optional[CustomSpellingConfig] = None
|
||||
translation: Optional[bool] = None
|
||||
translation_config: Optional[TranslationConfig] = None
|
||||
named_entity_recognition: Optional[bool] = None
|
||||
sentiment_analysis: Optional[bool] = None
|
||||
|
||||
|
||||
class MessagesConfig(BaseModel):
|
||||
"""Configuration for controlling which message types are sent via WebSocket.
|
||||
|
||||
Attributes:
|
||||
receive_partial_transcripts: Whether to receive intermediate transcription results
|
||||
receive_final_transcripts: Whether to receive final transcription results
|
||||
receive_speech_events: Whether to receive speech begin/end events
|
||||
receive_pre_processing_events: Whether to receive pre-processing events
|
||||
receive_realtime_processing_events: Whether to receive real-time processing events
|
||||
receive_post_processing_events: Whether to receive post-processing events
|
||||
receive_acknowledgments: Whether to receive acknowledgment messages
|
||||
receive_errors: Whether to receive error messages
|
||||
receive_lifecycle_events: Whether to receive lifecycle events
|
||||
"""
|
||||
|
||||
receive_partial_transcripts: Optional[bool] = None
|
||||
receive_final_transcripts: Optional[bool] = None
|
||||
receive_speech_events: Optional[bool] = None
|
||||
receive_pre_processing_events: Optional[bool] = None
|
||||
receive_realtime_processing_events: Optional[bool] = None
|
||||
receive_post_processing_events: Optional[bool] = None
|
||||
receive_acknowledgments: Optional[bool] = None
|
||||
receive_errors: Optional[bool] = None
|
||||
receive_lifecycle_events: Optional[bool] = None
|
||||
def __get__(self, obj, objtype=None):
|
||||
warnings.warn(
|
||||
"GladiaSTTService.InputParams is deprecated and will be removed in a future version. "
|
||||
"Import and use GladiaInputParams directly instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
return GladiaInputParams
|
||||
|
||||
|
||||
class GladiaSTTService(STTService):
|
||||
@@ -276,34 +159,8 @@ class GladiaSTTService(STTService):
|
||||
For complete API documentation, see: https://docs.gladia.io/api-reference/v2/live/init
|
||||
"""
|
||||
|
||||
class InputParams(BaseModel):
|
||||
"""Configuration parameters for the Gladia STT service.
|
||||
|
||||
Attributes:
|
||||
encoding: Audio encoding format
|
||||
bit_depth: Audio bit depth
|
||||
channels: Number of audio channels
|
||||
custom_metadata: Additional metadata to include with requests
|
||||
endpointing: Silence duration in seconds to mark end of speech
|
||||
maximum_duration_without_endpointing: Maximum utterance duration without silence
|
||||
language: DEPRECATED - Use language_config instead
|
||||
language_config: Detailed language configuration
|
||||
pre_processing: Audio pre-processing options
|
||||
realtime_processing: Real-time processing features
|
||||
messages_config: WebSocket message filtering options
|
||||
"""
|
||||
|
||||
encoding: Optional[str] = "wav/pcm"
|
||||
bit_depth: Optional[int] = 16
|
||||
channels: Optional[int] = 1
|
||||
custom_metadata: Optional[Dict[str, Any]] = None
|
||||
endpointing: Optional[float] = None
|
||||
maximum_duration_without_endpointing: Optional[int] = None
|
||||
language: Optional[Language] = None # Deprecated
|
||||
language_config: Optional[LanguageConfig] = None
|
||||
pre_processing: Optional[PreProcessingConfig] = None
|
||||
realtime_processing: Optional[RealtimeProcessingConfig] = None
|
||||
messages_config: Optional[MessagesConfig] = None
|
||||
# Maintain backward compatibility
|
||||
InputParams = _InputParamsDescriptor()
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -313,7 +170,7 @@ class GladiaSTTService(STTService):
|
||||
confidence: float = 0.5,
|
||||
sample_rate: Optional[int] = None,
|
||||
model: str = "fast",
|
||||
params: InputParams = InputParams(),
|
||||
params: GladiaInputParams = GladiaInputParams(),
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the Gladia STT service.
|
||||
@@ -373,7 +230,7 @@ class GladiaSTTService(STTService):
|
||||
|
||||
# Add language configuration (prioritize language_config over deprecated language)
|
||||
if self._params.language_config:
|
||||
settings["language_config"] = self._params.language_config.dict(exclude_none=True)
|
||||
settings["language_config"] = self._params.language_config.model_dump(exclude_none=True)
|
||||
elif self._params.language: # Backward compatibility for deprecated parameter
|
||||
language_code = self.language_to_service_language(self._params.language)
|
||||
if language_code:
|
||||
@@ -384,17 +241,17 @@ class GladiaSTTService(STTService):
|
||||
|
||||
# Add pre_processing configuration if provided
|
||||
if self._params.pre_processing:
|
||||
settings["pre_processing"] = self._params.pre_processing.dict(exclude_none=True)
|
||||
settings["pre_processing"] = self._params.pre_processing.model_dump(exclude_none=True)
|
||||
|
||||
# Add realtime_processing configuration if provided
|
||||
if self._params.realtime_processing:
|
||||
settings["realtime_processing"] = self._params.realtime_processing.dict(
|
||||
settings["realtime_processing"] = self._params.realtime_processing.model_dump(
|
||||
exclude_none=True
|
||||
)
|
||||
|
||||
# Add messages_config if provided
|
||||
if self._params.messages_config:
|
||||
settings["messages_config"] = self._params.messages_config.dict(exclude_none=True)
|
||||
settings["messages_config"] = self._params.messages_config.model_dump(exclude_none=True)
|
||||
|
||||
return settings
|
||||
|
||||
@@ -445,10 +302,13 @@ class GladiaSTTService(STTService):
|
||||
if response.ok:
|
||||
return await response.json()
|
||||
else:
|
||||
error_text = await response.text()
|
||||
logger.error(
|
||||
f"Gladia error: {response.status}: {response.text or response.reason}"
|
||||
f"Gladia error: {response.status}: {error_text or response.reason}"
|
||||
)
|
||||
raise Exception(
|
||||
f"Failed to initialize Gladia session: {response.status} - {error_text}"
|
||||
)
|
||||
raise Exception(f"Failed to initialize Gladia session: {response.status}")
|
||||
|
||||
async def _send_audio(self, audio: bytes):
|
||||
data = base64.b64encode(audio).decode("utf-8")
|
||||
@@ -460,18 +320,24 @@ class GladiaSTTService(STTService):
|
||||
await self._websocket.send(json.dumps({"type": "stop_recording"}))
|
||||
|
||||
async def _receive_task_handler(self):
|
||||
async for message in self._websocket:
|
||||
content = json.loads(message)
|
||||
if content["type"] == "transcript":
|
||||
utterance = content["data"]["utterance"]
|
||||
confidence = utterance.get("confidence", 0)
|
||||
transcript = utterance["text"]
|
||||
if confidence >= self._confidence:
|
||||
if content["data"]["is_final"]:
|
||||
await self.push_frame(
|
||||
TranscriptionFrame(transcript, "", time_now_iso8601())
|
||||
)
|
||||
else:
|
||||
await self.push_frame(
|
||||
InterimTranscriptionFrame(transcript, "", time_now_iso8601())
|
||||
)
|
||||
try:
|
||||
async for message in self._websocket:
|
||||
content = json.loads(message)
|
||||
if content["type"] == "transcript":
|
||||
utterance = content["data"]["utterance"]
|
||||
confidence = utterance.get("confidence", 0)
|
||||
transcript = utterance["text"]
|
||||
if confidence >= self._confidence:
|
||||
if content["data"]["is_final"]:
|
||||
await self.push_frame(
|
||||
TranscriptionFrame(transcript, "", time_now_iso8601())
|
||||
)
|
||||
else:
|
||||
await self.push_frame(
|
||||
InterimTranscriptionFrame(transcript, "", time_now_iso8601())
|
||||
)
|
||||
except websockets.exceptions.ConnectionClosed:
|
||||
# Expected when closing the connection
|
||||
pass
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Gladia WebSocket handler: {e}")
|
||||
Reference in New Issue
Block a user