diff --git a/CHANGELOG.md b/CHANGELOG.md index ddc78b7e2..4dcefc96b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added `audio_passthrough` parameter to `STTService`. If enabled it allows + audio frames to be pushed downstream in case other processors need them. + - Added input parameter options for `PlayHTTTSService` and `PlayHTHttpTTSService`. diff --git a/examples/foundational/07-interruptible-vad.py b/examples/foundational/07-interruptible-vad.py index 716e3ee03..9ec380bd0 100644 --- a/examples/foundational/07-interruptible-vad.py +++ b/examples/foundational/07-interruptible-vad.py @@ -9,6 +9,7 @@ import aiohttp import os import sys +from pipecat.audio.vad.silero import SileroVAD from pipecat.frames.frames import LLMMessagesFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner @@ -20,7 +21,6 @@ from pipecat.processors.aggregators.llm_response import ( from pipecat.services.cartesia import CartesiaTTSService from pipecat.services.openai import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport -from pipecat.vad.silero import SileroVAD from runner import configure diff --git a/examples/foundational/07c-interruptible-deepgram.py b/examples/foundational/07c-interruptible-deepgram.py index e913005e1..d232ad973 100644 --- a/examples/foundational/07c-interruptible-deepgram.py +++ b/examples/foundational/07c-interruptible-deepgram.py @@ -80,7 +80,6 @@ async def main(): @transport.event_handler("on_first_participant_joined") async def on_first_participant_joined(transport, participant): - transport.capture_participant_transcription(participant["id"]) # Kick off the conversation. 
messages.append({"role": "system", "content": "Please introduce yourself to the user."}) await task.queue_frames([LLMMessagesFrame(messages)]) diff --git a/examples/foundational/19-openai-realtime-beta.py b/examples/foundational/19-openai-realtime-beta.py index e9cb02f23..f258f95e4 100644 --- a/examples/foundational/19-openai-realtime-beta.py +++ b/examples/foundational/19-openai-realtime-beta.py @@ -15,7 +15,7 @@ from loguru import logger from runner import configure from pipecat.audio.vad.silero import SileroVADAnalyzer -from pipecat.vad.vad_analyzer import VADParams +from pipecat.audio.vad.vad_analyzer import VADParams from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py index 8e11ad6ee..22ceffb7b 100644 --- a/src/pipecat/services/ai_services.py +++ b/src/pipecat/services/ai_services.py @@ -451,8 +451,9 @@ class WordTTSService(TTSService): class STTService(AIService): """STTService is a base class for speech-to-text services.""" - def __init__(self, **kwargs): + def __init__(self, audio_passthrough=False, **kwargs): super().__init__(**kwargs) + self._audio_passthrough = audio_passthrough self._settings: Dict[str, Any] = {} @abstractmethod @@ -490,8 +491,11 @@ class STTService(AIService): if isinstance(frame, AudioRawFrame): # In this service we accumulate audio internally and at the end we - # push a TextFrame. We don't really want to push audio frames down. + # push a TextFrame. If audio passthrough is enabled we also push the + # audio downstream in case someone else needs it. await self.process_audio_frame(frame) + if self._audio_passthrough: + await self.push_frame(frame, direction) elif isinstance(frame, STTUpdateSettingsFrame): await self._update_settings(frame.settings) else: