from __future__ import annotations

from loguru import logger
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import (
    LLMRunFrame,
    OutputTransportMessageUrgentFrame,
    TTSSpeakFrame,
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import (
    LLMContextAggregatorPair,
    LLMUserAggregatorParams,
    UserTurnStoppedMessage,
)
from pipecat.serializers.protobuf import ProtobufFrameSerializer
from pipecat.serializers.base_serializer import FrameSerializer
from pipecat.transports.websocket.fastapi import (
    FastAPIWebsocketParams,
    FastAPIWebsocketTransport,
)
from pipecat.turns.user_stop.speech_timeout_user_turn_stop_strategy import (
    SpeechTimeoutUserTurnStopStrategy,
)
from pipecat.turns.user_turn_strategies import UserTurnStrategies

from .config import EngineConfig
from .product_protocol import ProductWebsocketSerializer
from .services import create_llm_service, create_stt_service, create_tts_service
from .text_input import ProductTextInputProcessor
from .text_stream import ProductAssistantTurnStoppedMessage, ProductTextStreamProcessor
from .transcript_stream import ProductTranscriptStreamProcessor
from .turn_start import InterruptionGateUserTurnStartStrategy


async def run_voice_pipeline(websocket, config: EngineConfig) -> None:
    """Run the voice pipeline on ``websocket`` using ``ProtobufFrameSerializer``.

    Args:
        websocket: Websocket connection handed to the transport.
        config: Engine configuration forwarded unchanged to the shared runner.
    """
    protobuf_serializer = ProtobufFrameSerializer()
    await run_pipeline_with_serializer(
        websocket,
        config,
        serializer=protobuf_serializer,
        client_label="Pipecat protobuf",
    )


async def run_product_voice_pipeline(websocket, config: EngineConfig) -> None:
    """Run the voice pipeline on ``websocket`` using ``ProductWebsocketSerializer``.

    The serializer is built with the sample rate and channel count taken from
    ``config.audio``.

    Args:
        websocket: Websocket connection handed to the transport.
        config: Engine configuration forwarded unchanged to the shared runner.
    """
    audio = config.audio
    product_serializer = ProductWebsocketSerializer(
        sample_rate=audio.sample_rate_hz,
        channels=audio.channels,
    )
    await run_pipeline_with_serializer(
        websocket,
        config,
        serializer=product_serializer,
        client_label="Product JSON",
    )


def _build_transport(
    websocket,
    config: EngineConfig,
    serializer: FrameSerializer,
) -> FastAPIWebsocketTransport:
    """Create the websocket transport with audio in/out enabled per ``config.audio``."""
    audio = config.audio
    transport_params = FastAPIWebsocketParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        audio_in_sample_rate=audio.sample_rate_hz,
        audio_out_sample_rate=audio.sample_rate_hz,
        audio_in_channels=audio.channels,
        audio_out_channels=audio.channels,
        serializer=serializer,
        # NOTE(review): presumably None means "no transport-level session
        # timeout" — confirm against the transport's documentation.
        session_timeout=None,
    )
    return FastAPIWebsocketTransport(websocket=websocket, params=transport_params)


def _build_context(config: EngineConfig) -> LLMContext:
    """Create the LLM context seeded with the agent's system prompt.

    When a greeting is configured and the greeting mode is ``"generated"``,
    the greeting text is appended as a second system message.
    """
    agent = config.agent
    messages = [{"role": "system", "content": agent.system_prompt}]
    wants_generated_greeting = bool(agent.greeting) and agent.greeting_mode == "generated"
    if wants_generated_greeting:
        messages.append({"role": "system", "content": agent.greeting})
    return LLMContext(messages)


def _build_user_aggregator(config: EngineConfig, context: LLMContext):
    """Create the user-side context aggregator with VAD and turn strategies."""
    turn = config.turn
    vad = turn.vad
    vad_params = VADParams(
        confidence=vad.confidence,
        start_secs=vad.start_secs,
        stop_secs=vad.stop_secs,
        min_volume=vad.min_volume,
    )

    # Replace pipecat's default stop strategy (Smart Turn v3) with a simple
    # silence-timeout strategy. Smart Turn v3 was finalizing every short
    # Chinese phrase as a complete turn, which caused one logical utterance
    # to become several LLM calls and several user bubbles in the UI. The
    # timeout strategy waits for `user_speech_timeout_sec` of silence
    # (re-armed every time the user resumes speaking) before declaring the
    # turn finished, which is what we actually want for streaming ASRs.
    start_strategy = InterruptionGateUserTurnStartStrategy(
        min_chars_when_bot_speaking=turn.interruption_min_chars,
        allowed_short_replies=turn.interruption_short_replies,
        use_interim=turn.interruption_use_interim,
    )
    stop_strategy = SpeechTimeoutUserTurnStopStrategy(
        user_speech_timeout=turn.user_speech_timeout_sec,
    )
    user_turn_strategies = UserTurnStrategies(
        start=[start_strategy],
        stop=[stop_strategy],
    )

    user_params = LLMUserAggregatorParams(
        vad_analyzer=SileroVADAnalyzer(params=vad_params),
        user_turn_strategies=user_turn_strategies,
    )
    # Only the user half of the pair is used by this module.
    user_aggregator, _ = LLMContextAggregatorPair(context, user_params=user_params)
    return user_aggregator


async def run_pipeline_with_serializer(
    websocket,
    config: EngineConfig,
    *,
    serializer: FrameSerializer,
    client_label: str,
) -> None:
    """Assemble the voice pipeline for one websocket and run it to completion.

    The pipeline order is: transport input, text input, STT, transcript
    stream, user aggregator, LLM, text stream, TTS, transport output.

    Args:
        websocket: Websocket connection handed to the transport.
        config: Engine configuration (audio, services, agent, turn, session).
        serializer: Frame serializer installed on the websocket transport.
        client_label: Prefix used in the connection-related log messages.
    """
    transport = _build_transport(websocket, config, serializer)

    audio = config.audio
    service_settings = config.services
    stt = create_stt_service(service_settings.stt, audio)
    llm = create_llm_service(service_settings.llm)
    tts = create_tts_service(service_settings.tts, audio)

    context = _build_context(config)
    user_aggregator = _build_user_aggregator(config, context)
    text_stream = ProductTextStreamProcessor(context)

    processors = [
        transport.input(),
        ProductTextInputProcessor(),
        stt,
        ProductTranscriptStreamProcessor(),
        user_aggregator,
        llm,
        text_stream,
        tts,
        transport.output(),
    ]
    pipeline = Pipeline(processors)

    task_params = PipelineParams(
        audio_in_sample_rate=audio.sample_rate_hz,
        audio_out_sample_rate=audio.sample_rate_hz,
        enable_metrics=True,
        enable_usage_metrics=True,
        enable_heartbeats=True,
    )
    task = PipelineTask(
        pipeline,
        params=task_params,
        idle_timeout_secs=config.session.inactivity_timeout_sec,
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(_transport, _client):
        """Log the connection and queue the configured greeting, if any."""
        logger.info(f"{client_label} websocket client connected")
        agent = config.agent
        mode = agent.greeting_mode
        if mode == "generated":
            # The LLM is run so that it produces the opening turn itself.
            opening_frames = [LLMRunFrame()]
        elif mode == "fixed" and agent.greeting:
            # The configured greeting text is spoken as-is.
            opening_frames = [TTSSpeakFrame(agent.greeting)]
        else:
            return
        await task.queue_frames(opening_frames)

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(_transport, _client):
        """Log the disconnect and cancel the pipeline task."""
        logger.info(f"{client_label} websocket client disconnected")
        await task.cancel()

    # NOTE(review): the transport is created with session_timeout=None, so
    # confirm whether this handler can ever fire.
    @transport.event_handler("on_session_timeout")
    async def on_session_timeout(_transport, _client):
        """Log the session timeout and cancel the pipeline task."""
        logger.info(f"{client_label} websocket session timed out")
        await task.cancel()

    @user_aggregator.event_handler("on_user_turn_stopped")
    async def on_user_turn_stopped(_aggregator, _strategy, message: UserTurnStoppedMessage):
        """Log the finished user turn and send its non-empty text to the client."""
        logger.info(f"User: {message.content}")
        transcript = (message.content or "").strip()
        if transcript:
            payload = {
                "type": "input.transcript.final",
                "text": transcript,
                "user_id": message.user_id,
                "timestamp": message.timestamp,
            }
            await task.queue_frame(OutputTransportMessageUrgentFrame(message=payload))

    @text_stream.event_handler("on_assistant_turn_stopped")
    async def on_assistant_turn_stopped(
        _aggregator, message: ProductAssistantTurnStoppedMessage
    ):
        """Log the finished assistant turn."""
        logger.info(f"Assistant: {message.content}")

    # NOTE(review): handle_sigint=False presumably leaves signal handling to
    # the hosting process — confirm.
    runner = PipelineRunner(handle_sigint=False)
    await runner.run(task)