Merge pull request #610 from pipecat-ai/aleix/stt-push-audio

Allow STT services to pass through audio frames
This commit is contained in:
Aleix Conchillo Flaqué
2024-10-17 21:02:30 -07:00
committed by GitHub
5 changed files with 11 additions and 5 deletions

View File

@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Added `audio_passthrough` parameter to `STTService`. If enabled, it allows
audio frames to be pushed downstream in case other processors need them.
- Added input parameter options for `PlayHTTTSService` and
`PlayHTHttpTTSService`.

View File

@@ -9,6 +9,7 @@ import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVAD
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -20,7 +21,6 @@ from pipecat.processors.aggregators.llm_response import (
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.vad.silero import SileroVAD
from runner import configure

View File

@@ -80,7 +80,6 @@ async def main():
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])

View File

@@ -15,7 +15,7 @@ from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.vad.vad_analyzer import VADParams
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask

View File

@@ -451,8 +451,9 @@ class WordTTSService(TTSService):
class STTService(AIService):
"""STTService is a base class for speech-to-text services."""
def __init__(self, **kwargs):
def __init__(self, audio_passthrough=False, **kwargs):
super().__init__(**kwargs)
self._audio_passthrough = audio_passthrough
self._settings: Dict[str, Any] = {}
@abstractmethod
@@ -490,8 +491,11 @@ class STTService(AIService):
if isinstance(frame, AudioRawFrame):
# In this service we accumulate audio internally and at the end we
# push a TextFrame. We don't really want to push audio frames down.
# push a TextFrame. If audio passthrough is enabled, we also push the
# audio downstream in case someone else needs it.
await self.process_audio_frame(frame)
if self._audio_passthrough:
await self.push_frame(frame, direction)
elif isinstance(frame, STTUpdateSettingsFrame):
await self._update_settings(frame.settings)
else: