diff --git a/CHANGELOG.md b/CHANGELOG.md
index eff100b2c..156913108 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -49,6 +49,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added 04 foundational examples for client/server transports. Also, renamed
   `29-livekit-audio-chat.py` to `04b-transports-livekit.py`.
 
+- Added foundational example `13c-gladia-translation.py` showing how to use
+  `TranscriptionFrame` and `TranslationFrame`.
+
 ## [0.0.65] - 2025-04-23 "Sant Jordi's release" 🌹📕
 
 https://en.wikipedia.org/wiki/Saint_George%27s_Day_in_Catalonia
@@ -108,6 +111,9 @@ https://en.wikipedia.org/wiki/Saint_George%27s_Day_in_Catalonia
 
 - Added word/timestamp pairs to `ElevenLabsHttpTTSService`.
 
+- Added `TranslationFrame`, a new frame type that contains a translated
+  transcription.
+
 - It is now possible to disable `SoundfileMixer` when created. You can then use
   `MixerEnableFrame` to dynamically enable it when necessary.
 
@@ -129,6 +135,9 @@ https://en.wikipedia.org/wiki/Saint_George%27s_Day_in_Catalonia
 - `OpenAILLMService` and `OpenPipeLLMService` now use `gpt-4.1` as their default
   model.
 
+- Updated `GladiaSTTService` to output a `TranslationFrame` when specifying a
+  `translation` and `translation_config`.
+
 - `SoundfileMixer` constructor arguments need to be keywords.
 
 ### Deprecated
diff --git a/examples/foundational/13c-gladia-translation.py b/examples/foundational/13c-gladia-translation.py
new file mode 100644
index 000000000..8ebd17aa4
--- /dev/null
+++ b/examples/foundational/13c-gladia-translation.py
@@ -0,0 +1,97 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import os
+
+from dotenv import load_dotenv
+from loguru import logger
+
+from pipecat.frames.frames import Frame, TranscriptionFrame, TranslationFrame
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineTask
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+from pipecat.services.gladia.config import (
+    GladiaInputParams,
+    LanguageConfig,
+    RealtimeProcessingConfig,
+    TranslationConfig,
+)
+from pipecat.services.gladia.stt import GladiaSTTService
+from pipecat.transcriptions.language import Language
+from pipecat.transports.base_transport import TransportParams
+from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
+from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
+
+load_dotenv(override=True)
+
+
+class TranscriptionLogger(FrameProcessor):
+    """Print each transcription and translation frame that reaches this processor."""
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, TranscriptionFrame):
+            print(f"Transcription ({frame.language}): {frame.text}")
+        elif isinstance(frame, TranslationFrame):
+            print(f"Translation ({frame.language}): {frame.text}")
+
+
+async def run_bot(webrtc_connection: SmallWebRTCConnection):
+    """Run a pipeline that transcribes English audio and translates it to Spanish.
+
+    Audio from the WebRTC connection is fed to Gladia STT; the resulting
+    transcription and translation frames are printed by `TranscriptionLogger`.
+    """
+    logger.info("Starting bot")
+
+    transport = SmallWebRTCTransport(
+        webrtc_connection=webrtc_connection,
+        params=TransportParams(audio_in_enabled=True),
+    )
+
+    stt = GladiaSTTService(
+        api_key=os.getenv("GLADIA_API_KEY"),
+        params=GladiaInputParams(
+            language_config=LanguageConfig(
+                languages=[Language.EN],  # Input in English
+                code_switching=False,
+            ),
+            realtime_processing=RealtimeProcessingConfig(
+                translation=True,  # Enable translation
+                translation_config=TranslationConfig(
+                    target_languages=[Language.ES],  # Translate to Spanish
+                    model="enhanced",  # Use the enhanced translation model
+                ),
+            ),
+        ),
+    )
+
+    tl = TranscriptionLogger()
+
+    pipeline = Pipeline([transport.input(), stt, tl])
+
+    task = PipelineTask(pipeline)
+
+    @transport.event_handler("on_client_disconnected")
+    async def on_client_disconnected(transport, client):
+        logger.info("Client disconnected")
+
+    @transport.event_handler("on_client_closed")
+    async def on_client_closed(transport, client):
+        logger.info("Client closed connection")
+        await task.cancel()
+
+    runner = PipelineRunner(handle_sigint=False)
+
+    await runner.run(task)
+
+
+if __name__ == "__main__":
+    from run import main
+
+    main()
diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py
index 72acf1a2a..073112a9f 100644
--- a/src/pipecat/frames/frames.py
+++ b/src/pipecat/frames/frames.py
@@ -256,6 +256,22 @@ class InterimTranscriptionFrame(TextFrame):
         return f"{self.name}(user: {self.user_id}, text: [{self.text}], language: {self.language}, timestamp: {self.timestamp})"
 
 
+@dataclass
+class TranslationFrame(TextFrame):
+    """A text frame with translated transcription data.
+
+    Pushed by services that translate transcriptions (e.g. `GladiaSTTService`);
+    `language` holds the language of the translated text.
+    """
+
+    user_id: str
+    timestamp: str
+    language: Optional[Language] = None
+
+    def __str__(self):
+        return f"{self.name}(user: {self.user_id}, text: [{self.text}], language: {self.language}, timestamp: {self.timestamp})"
+
+
 @dataclass
 class OpenAILLMContextAssistantTimestampFrame(DataFrame):
     """Timestamp information for assistant message in LLM context."""
diff --git a/src/pipecat/services/gladia/stt.py b/src/pipecat/services/gladia/stt.py
index 61fffc846..f27664a25 100644
--- a/src/pipecat/services/gladia/stt.py
+++ b/src/pipecat/services/gladia/stt.py
@@ -20,6 +20,7 @@ from pipecat.frames.frames import (
     InterimTranscriptionFrame,
     StartFrame,
     TranscriptionFrame,
+    TranslationFrame,
 )
 from pipecat.services.gladia.config import GladiaInputParams
 from pipecat.services.stt_service import STTService
@@ -405,7 +406,7 @@ class GladiaSTTService(STTService):
                     translation = translated_utterance["text"]
                     if translated_language != original_language and confidence >= self._confidence:
                         await self.push_frame(
-                            TranscriptionFrame(
+                            TranslationFrame(
                                 translation, "", time_now_iso8601(), translated_language
                             )
                         )