Merge pull request #610 from pipecat-ai/aleix/stt-push-audio

allow STT services to pass through audio frames
2024-10-17 21:02:30 -07:00
parent 71c8c0dcdb c9318ecd5c
commit f3c0767c81
5 changed files with 11 additions and 5 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Added

+- Added `audio_passthrough` parameter to `STTService`. If enabled, it allows
+  audio frames to be pushed downstream in case other processors need them.
+
 - Added input parameter options for `PlayHTTTSService` and
  `PlayHTHttpTTSService`.

--- a/examples/foundational/07-interruptible-vad.py
+++ b/examples/foundational/07-interruptible-vad.py
@@ -9,6 +9,7 @@ import aiohttp
 import os
 import sys

+from pipecat.audio.vad.silero import SileroVAD
 from pipecat.frames.frames import LLMMessagesFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
@@ -20,7 +21,6 @@ from pipecat.processors.aggregators.llm_response import (
 from pipecat.services.cartesia import CartesiaTTSService
 from pipecat.services.openai import OpenAILLMService
 from pipecat.transports.services.daily import DailyParams, DailyTransport
-from pipecat.vad.silero import SileroVAD

 from runner import configure

--- a/examples/foundational/07c-interruptible-deepgram.py
+++ b/examples/foundational/07c-interruptible-deepgram.py
@@ -80,7 +80,6 @@ async def main():

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
-            transport.capture_participant_transcription(participant["id"])
            # Kick off the conversation.
            messages.append({"role": "system", "content": "Please introduce yourself to the user."})
            await task.queue_frames([LLMMessagesFrame(messages)])
--- a/examples/foundational/19-openai-realtime-beta.py
+++ b/examples/foundational/19-openai-realtime-beta.py
@@ -15,7 +15,7 @@ from loguru import logger
 from runner import configure

 from pipecat.audio.vad.silero import SileroVADAnalyzer
-from pipecat.vad.vad_analyzer import VADParams
+from pipecat.audio.vad.vad_analyzer import VADParams
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
--- a/src/pipecat/services/ai_services.py
+++ b/src/pipecat/services/ai_services.py
@@ -451,8 +451,9 @@ class WordTTSService(TTSService):
 class STTService(AIService):
    """STTService is a base class for speech-to-text services."""

-    def __init__(self, **kwargs):
+    def __init__(self, audio_passthrough=False, **kwargs):
        super().__init__(**kwargs)
+        self._audio_passthrough = audio_passthrough
        self._settings: Dict[str, Any] = {}

    @abstractmethod
@@ -490,8 +491,11 @@ class STTService(AIService):

        if isinstance(frame, AudioRawFrame):
            # In this service we accumulate audio internally and at the end we
-            # push a TextFrame. We don't really want to push audio frames down.
+            # push a TextFrame. If audio passthrough is enabled, we also push the
+            # audio frame downstream in case other processors need it.
            await self.process_audio_frame(frame)
+            if self._audio_passthrough:
+                await self.push_frame(frame, direction)
        elif isinstance(frame, STTUpdateSettingsFrame):
            await self._update_settings(frame.settings)
        else: