bot-runner-ified classic-pipelinepy

hacking classic-pipeline.py to be runnable by bot_runner
khk working on fast-bot-runner bot_runner
2024-06-24 01:37:19 +00:00 · 2024-06-23 20:56:21 -04:00 · 2024-06-23 20:28:37 -04:00 · 2024-06-23 19:28:31 -04:00 · 2024-06-23 22:40:58 +00:00 · 2024-06-23 18:13:59 -04:00
8 changed files with 1034 additions and 0 deletions
--- a/examples/fast-bot-metrics/bot.py
+++ b/examples/fast-bot-metrics/bot.py
@@ -0,0 +1 @@
 classic-pipeline.py
--- a/examples/fast-bot-metrics/bot_runner.py
+++ b/examples/fast-bot-metrics/bot_runner.py
@@ -0,0 +1,170 @@
 """
 bot_runner.py
 HTTP service that listens for incoming calls from either Daily or Twilio,
 provisioning a room and starting a Pipecat bot in response.
 Refer to README for more information.
 """
 import os
 import argparse
 import subprocess
 from pydantic import BaseModel, ValidationError
 from typing import Optional
 from pipecat.transports.services.helpers.daily_rest import DailyRESTHelper, DailyRoomObject, DailyRoomProperties, DailyRoomSipParams, DailyRoomParams
 from fastapi import FastAPI, Request, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, PlainTextResponse
 from twilio.twiml.voice_response import VoiceResponse
 from bot import BotSettings
 from dotenv import load_dotenv
 load_dotenv(override=True)
 # ------------ Configuration ------------ #
 MAX_SESSION_TIME = 5 * 60  # 5 minutes
 REQUIRED_ENV_VARS = ['DAILY_API_URL', 'DAILY_API_KEY', 'DEEPGRAM_API_KEY']
 daily_rest_helper = DailyRESTHelper(
    os.getenv("DAILY_API_KEY", ""),
    os.getenv("DAILY_API_URL", 'https://api.daily.co/v1'))
 class RunnerSettings(BaseModel):
    prompt: Optional[str] = "You are a helpful assistant."
    deepgram_voice: Optional[str] = os.getenv("DEEPGRAM_VOICE")
    openai_model: Optional[str] = os.getenv("OPENAI_MODEL")
    openai_api_key: Optional[str] = os.getenv("OPENAI_API_KEY")
    test: Optional[bool] = None
 # ----------------- API ----------------- #
 app = FastAPI()
 app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"]
 )
 # ----------------- Main ----------------- #
@app.post("/start_bot")
 async def start_bot(request: Request) -> JSONResponse:
    runner_settings = RunnerSettings()
    try:
        request_body = await request.body()
        if len(request_body) > 0:
            runner_settings = RunnerSettings.model_validate_json(request_body)
    except ValidationError as e:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid request: {e}")
    except Exception as e:
        # If no data in request, pass
        pass
    # Is this a webhook creation request?
    if runner_settings.test is not None:
        return JSONResponse({"test": True})
    # Use specified room URL, or create a new one if not specified
    room_url = os.getenv("DAILY_SAMPLE_ROOM_URL", "")
    if not room_url:
        params = DailyRoomParams(
            properties=DailyRoomProperties()
        )
        try:
            room: DailyRoomObject = daily_rest_helper.create_room(params=params)
        except Exception as e:
            raise HTTPException(
                status_code=500,
                detail=f"Unable to provision room {e}")
    else:
        # Check passed room URL exists, we should assume that it already has a sip set up
        try:
            room: DailyRoomObject = daily_rest_helper.get_room_from_url(room_url)
        except Exception:
            raise HTTPException(
                status_code=500, detail=f"Room not found: {room_url}")
    # Give the agent a token to join the session
    token = daily_rest_helper.get_token(room.url, MAX_SESSION_TIME)
    if not room or not token:
        raise HTTPException(
            status_code=500, detail=f"Failed to get token for room: {room_url}")
    # Spawn a new agent, and join the user session
    # Note: this is mostly for demonstration purposes (refer to 'deployment' in README)
    try:
        bot_settings = BotSettings(
            room_url=room.url,
            room_token=token,
            prompt=runner_settings.prompt,
            deepgram_voice=runner_settings.deepgram_voice,
            deepgram_api_key=os.getenv("DEEPGRAM_API_KEY"),
            # deepgram_base_url="http://0.0.0.0:8080/v1/speak",
            openai_model=runner_settings.openai_model,
            openai_api_key=runner_settings.openai_api_key,
            # openai_base_url="http://0.0.0.0:8000/v1",
        )
        bot_settings_str = bot_settings.model_dump_json(exclude_none=True)
        subprocess.Popen(
            [f"python3 -m bot -s '{bot_settings_str}'"],
            shell=True,
            bufsize=1,
            cwd=os.path.dirname(os.path.abspath(__file__)))
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Failed to start subprocess: {e}")
    # Grab a token for the user to join with
    user_token = daily_rest_helper.get_token(room.url, MAX_SESSION_TIME)
    return JSONResponse({
        "room_url": room.url,
        "token": user_token,
    })
 if __name__ == "__main__":
    # Check environment variables
    for env_var in REQUIRED_ENV_VARS:
        if env_var not in os.environ:
            raise Exception(f"Missing environment variable: {env_var}.")
    parser = argparse.ArgumentParser(description="Pipecat Bot Runner")
    parser.add_argument("--host", type=str,
                        default=os.getenv("HOST", "0.0.0.0"), help="Host address")
    parser.add_argument("--port", type=int,
                        default=os.getenv("PORT", 7860), help="Port number")
    parser.add_argument("--reload", action="store_true",
                        default=True, help="Reload code on change")
    config = parser.parse_args()
    try:
        import uvicorn
        uvicorn.run(
            "bot_runner:app",
            host=config.host,
            port=config.port,
            reload=config.reload
        )
    except KeyboardInterrupt:
        print("Pipecat runner shutting down...")
--- a/examples/fast-bot-metrics/classic-pipeline.py
+++ b/examples/fast-bot-metrics/classic-pipeline.py
@@ -0,0 +1,199 @@
 #
 # Copyright (c) 2024, Daily
 #
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 from loguru import logger
 from runner import configure
 import argparse
 import asyncio
 import aiohttp
 import os
 import sys
 from typing import List, Optional
 from pydantic import BaseModel, ValidationError
 from pipecat.vad.vad_analyzer import VADParams
 from pipecat.vad.silero import SileroVADAnalyzer
 from pipecat.transports.services.daily import DailyParams, DailyTransport, DailyTransportMessageFrame
 from pipecat.services.openai import OpenAILLMService, OpenAILLMContext
 from pipecat.services.deepgram import DeepgramSTTService
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.processors.logger import FrameLogger
 from pipecat.frames.frames import LLMMessagesFrame, EndFrame
 from pipecat.processors.aggregators.llm_response import (
    LLMAssistantResponseAggregator, LLMUserResponseAggregator
 )
 from fastbothelpers import (
    GreedyLLMAggregator,
    ClearableDeepgramTTSService,
    VADGate,
    AudioVolumeTimer,
    TranscriptionTimingLogger
 )
 from dotenv import load_dotenv
 load_dotenv(override=True)
 logger.remove(0)
 logger.add(sys.stderr, level="DEBUG")
 class BotSettings(BaseModel):
    room_url: str
    room_token: str
    bot_name: str = "Pipecat"
    prompt: Optional[str] = "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Respond to what the user said in a creative and helpful way in a few short sentences."
    deepgram_api_key: Optional[str] = None
    deepgram_voice: Optional[str] = "aura-asteria-en"
    deepgram_base_url: Optional[str] = "https://api.deepgram.com/v1/speak"
    openai_api_key: Optional[str] = None
    openai_model: Optional[str] = "gpt-4o"
    openai_base_url: Optional[str] = None
 async def main(settings: BotSettings):
    async with aiohttp.ClientSession() as session:
        transport = DailyTransport(
            settings.room_url,
            settings.room_token,
            settings.bot_name,
            DailyParams(
                audio_out_enabled=True,
                transcription_enabled=False,
                vad_enabled=True,
                vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.200)),
                vad_audio_passthrough=True
            )
        )
        stt = DeepgramSTTService(
            api_key=os.getenv("DEEPGRAM_API_KEY"),
            **({'url': url} if (url := os.getenv("DEEPGRAM_STT_URL")) else {})
        )
        tts = ClearableDeepgramTTSService(
            name="Voice",
            aiohttp_session=session,
            api_key=os.getenv("DEEPGRAM_API_KEY"),
            voice="aura-asteria-en",
            **({'base_url': url} if (url := os.getenv("DEEPGRAM_TTS_BASE_URL")) else {})
        )
        llm = OpenAILLMService(
            name="LLM",
            # To use OpenAI
            api_key=os.getenv("OPENAI_API_KEY"),
            model=os.getenv("OPENAI_MODEL"),
            base_url=os.getenv("OPENAI_BASE_URL")
        )
        messages = [
            {
                "role": "system",
                "content": """You are a helpful assistant in an audio conversation.
 Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers.
 Respond to what the user said in a creative and helpful way. Be concise in your answers to basic questions. If you are asked to elaborate or tell a story, provide a longer response.
 """,
            },
        ]
        avt = AudioVolumeTimer()
        tl = TranscriptionTimingLogger(avt)
        tma_in = LLMUserResponseAggregator(messages)
        tma_out = LLMAssistantResponseAggregator(messages)
        pipeline = Pipeline([
            transport.input(),   # Transport user input
            avt,
            stt,
            tl,
            tma_in,              # User responses
            llm,                 # LLM
            tts,                 # TTS
            transport.output(),  # Transport bot output
            tma_out,             # Assistant spoken responses
        ])
        task = PipelineTask(
            pipeline,
            PipelineParams(
                allow_interruptions=True,
                enable_metrics=True,
                report_only_initial_ttfb=True
            ))
        # When a participant joins, start transcription for that participant so the
        # bot can "hear" and respond to them.
        # @ transport.event_handler("on_participant_joined")
        # async def on_participant_joined(transport, participant):
        #    transport.capture_participant_transcription(participant["id"])
        # When the participant leaves, we exit the bot.
        @transport.event_handler("on_participant_left")
        async def on_participant_left(transport, participant, reason):
            await task.queue_frame(EndFrame())
        # When the first participant joins, the bot should introduce itself.
        @ transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            messages.append(
                {"role": "system", "content": "Please introduce yourself to the user."})
            await task.queue_frames([LLMMessagesFrame(messages)])
        # Handle "latency-ping" messages. The client will send app messages that look like
        # this:
        #   { "latency-ping": { ts: <client-side timestamp> }}
        #
        # We want to send an immediate pong back to the client from this handler function.
        # Also, we will push a frame into the top of the pipeline and send it after the
        #
        @ transport.event_handler("on_app_message")
        async def on_app_message(transport, message, sender):
            try:
                if "latency-ping" in message:
                    logger.debug(f"Received latency ping app message: {message}")
                    ts = message["latency-ping"]["ts"]
                    # Send immediately
                    transport.output().send_message(DailyTransportMessageFrame(
                        message={"latency-pong-msg-handler": {"ts": ts}},
                        participant_id=sender))
                    # And push to the pipeline for the Daily transport.output to send
                    await tma_in.push_frame(
                        DailyTransportMessageFrame(
                            message={"latency-pong-pipeline-delivery": {"ts": ts}},
                            participant_id=sender))
            except Exception as e:
                logger.debug(f"message handling error: {e} - {message}")
        runner = PipelineRunner()
        await runner.run(task)
 # if __name__ == "__main__":
 #     (url, token) = configure()
 #     asyncio.run(main(url, token))
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Pipecat Bot")
    parser.add_argument("-s", "--settings", type=str, required=True, help="Pipecat bot settings")
    args, unknown = parser.parse_known_args()
    try:
        settings = BotSettings.model_validate_json(args.settings)
        asyncio.run(main(settings))
    except ValidationError as e:
        print(e)
--- a/examples/fast-bot-metrics/env-example
+++ b/examples/fast-bot-metrics/env-example
@@ -0,0 +1,15 @@
 DAILY_SAMPLE_ROOM_URL=...
 DAILY_API_KEY=...
 DEEPGRAM_API_KEY=...
 DEEPGRAM_VOICE=alpha-asteria-en-v2
 # DEEPGRAM_STT_URL=ws://localhost:8080
 # DEEPGRAM_TTS_BASE_URL=http://0.0.0.0:8080/v1/speak
 OPENAI_API_KEY=...
 # OPENAI_API_KEY=
 OPENAI_MODEL=gpt-4o
 # OPENAI_MODEL=meta-llama/Meta-Llama-3-70B-Instruct
 # OPENAI_MODEL=meta-llama/Meta-Llama-3-8B-Instruct
 # OPENAI_MODEL=neuralmagic/Meta-Llama-3-70B-Instruct-FP8
 # OPENAI_BASE_URL=http://0.0.0.0:8000/v1
--- a/examples/fast-bot-metrics/example-bot-runner-bot.py
+++ b/examples/fast-bot-metrics/example-bot-runner-bot.py
@@ -0,0 +1,158 @@
 #
 # Copyright (c) 2024, Daily
 #
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 import argparse
 import asyncio
 import aiohttp
 import sys
 from pydantic import BaseModel, ValidationError
 from typing import Optional
 from pipecat.frames.frames import EndFrame, LLMMessagesFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.llm_response import (
    LLMAssistantResponseAggregator, LLMUserResponseAggregator)
 from pipecat.services.deepgram import DeepgramTTSService
 from pipecat.services.openai import OpenAILLMService
 from pipecat.transports.services.daily import DailyParams, DailyTransport, DailyTransportMessageFrame
 from pipecat.vad.silero import SileroVADAnalyzer
 from loguru import logger
 logger.remove(0)
 logger.add(sys.stderr, level="DEBUG")
 class BotSettings(BaseModel):
    room_url: str
    room_token: str
    bot_name: str = "Pipecat"
    prompt: Optional[str] = "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Respond to what the user said in a creative and helpful way in a few short sentences."
    deepgram_api_key: Optional[str] = None
    deepgram_voice: Optional[str] = "aura-asteria-en"
    deepgram_base_url: Optional[str] = "https://api.deepgram.com/v1/speak"
    openai_api_key: Optional[str] = None
    openai_model: Optional[str] = "gpt-4o"
    openai_base_url: Optional[str] = None
 async def main(settings: BotSettings):
    async with aiohttp.ClientSession() as session:
        transport = DailyTransport(
            settings.room_url,
            settings.room_token,
            settings.bot_name,
            DailyParams(
                audio_out_enabled=True,
                transcription_enabled=True,
                vad_enabled=True,
                vad_analyzer=SileroVADAnalyzer()
            )
        )
        tts = DeepgramTTSService(
            aiohttp_session=session,
            api_key=settings.deepgram_api_key,
            voice=settings.deepgram_voice,
            base_url=settings.deepgram_base_url
        )
        llm = OpenAILLMService(
            api_key=settings.openai_api_key,
            model=settings.openai_model,
            base_url=settings.openai_base_url
        )
        messages = [
            {
                "role": "system",
                "content": settings.prompt,
            },
        ]
        tma_in = LLMUserResponseAggregator(messages)
        tma_out = LLMAssistantResponseAggregator(messages)
        pipeline = Pipeline([
            transport.input(),   # Transport user input
            tma_in,              # User responses
            llm,                 # LLM
            tts,                 # TTS
            transport.output(),  # Transport bot output
            tma_out,             # Assistant spoken responses
        ])
        task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True, enable_metrics=True))
        # When the first participant joins, the bot should introduce itself.
        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            # Kick off the conversation.
            messages.append(
                {"role": "system", "content": "Please introduce yourself to the user."})
            await task.queue_frame(LLMMessagesFrame(messages))
        # When a participant joins, start transcription for that participant so the
        # bot can "hear" and respond to them.
        @transport.event_handler("on_participant_joined")
        async def on_participant_joined(transport, participant):
            transport.capture_participant_transcription(participant["id"])
        # When the participant leaves, we exit the bot.
        @transport.event_handler("on_participant_left")
        async def on_participant_left(transport, participant, reason):
            await task.queue_frame(EndFrame())
        # If the call is ended make sure we quit as well.
        @transport.event_handler("on_call_state_updated")
        async def on_call_state_updated(transport, state):
            if state == "left":
                await task.queue_frame(EndFrame())
        # Handle "latency-ping" messages. The client will send app messages that look like
        # this:
        #   { "latency-ping": { ts: <client-side timestamp> }}
        #
        # We want to send an immediate pong back to the client from this handler function.
        # Also, we will push a frame into the top of the pipeline and send it after the
        #
        @transport.event_handler("on_app_message")
        async def on_app_message(transport, message, sender):
            try:
                if "latency-ping" in message:
                    logger.debug(f"Received latency ping app message: {message}")
                    ts = message["latency-ping"]["ts"]
                    # Send immediately
                    transport.output().send_message(DailyTransportMessageFrame(
                        message={"latency-pong-msg-handler": {"ts": ts}},
                        participant_id=sender))
                    # And push to the pipeline for the Daily transport.output to send
                    await tma_in.push_frame(
                        DailyTransportMessageFrame(
                            message={"latency-pong-pipeline-delivery": {"ts": ts}},
                            participant_id=sender))
            except Exception as e:
                logger.debug(f"message handling error: {e} - {message}")
        runner = PipelineRunner()
        await runner.run(task)
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Pipecat Bot")
    parser.add_argument("-s", "--settings", type=str, required=True, help="Pipecat bot settings")
    args, unknown = parser.parse_known_args()
    try:
        settings = BotSettings.model_validate_json(args.settings)
        asyncio.run(main(settings))
    except ValidationError as e:
        print(e)
--- a/examples/fast-bot-metrics/fastbothelpers.py
+++ b/examples/fast-bot-metrics/fastbothelpers.py
@@ -0,0 +1,268 @@
 from loguru import logger
 import asyncio
 import math
 import struct
 import time
 from dataclasses import dataclass, field
 from typing import List
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 from pipecat.frames.frames import (
    Frame,
    AudioRawFrame,
    InterimTranscriptionFrame,
    TranscriptionFrame,
    TextFrame,
    StartInterruptionFrame,
    StopInterruptionFrame,
    LLMFullResponseStartFrame,
    TTSStoppedFrame,
    MetricsFrame
 )
 from pipecat.vad.vad_analyzer import VADAnalyzer, VADState
 from pipecat.services.deepgram import DeepgramTTSService, DeepgramSTTService
 from pipecat.services.openai import OpenAILLMContext, OpenAILLMContextFrame
 class GreedyLLMAggregator(FrameProcessor):
    def __init__(self, context: OpenAILLMContext = None, **kwargs):
        super().__init__(**kwargs)
        self.context: OpenAILLMContext = context if context else OpenAILLMContext()
    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)
        logger.debug(f"{frame}")
        try:
            if isinstance(frame, InterimTranscriptionFrame):
                return
            if isinstance(frame, TranscriptionFrame):
                # append transcribed text to last "user" frame
                if self.context.messages and self.context.messages[-1]["role"] == "user":
                    last_frame = self.context.messages.pop()
                else:
                    last_frame = {"role": "user", "content": ""}
                last_frame["content"] += " " + frame.text
                self.context.messages.append(last_frame)
                oai_context_frame = OpenAILLMContextFrame(context=self.context)
                logger.debug(f"pushing frame {oai_context_frame}")
                await self.push_frame(oai_context_frame)
                return
            await self.push_frame(frame, direction)
        except Exception as e:
            logger.debug(f"error: {e}")
 class ClearableDeepgramTTSService(DeepgramTTSService):
    def __init___(self, **kwargs):
        super().__init(**kwargs)
    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)
        if isinstance(frame, StartInterruptionFrame):
            self._current_sentence = ""
@dataclass
 class BufferedSentence:
    audio_frames: List[AudioRawFrame] = field(default_factory=list)
    text_frame: TextFrame = None
 class VADGate(FrameProcessor):
    def __init__(
            self,
            vad_analyzer: VADAnalyzer = None,
            context: OpenAILLMContext = None,
            **kwargs):
        super().__init__(**kwargs)
        self.vad_analyzer = vad_analyzer
        self.context = context
        self._audio_pusher_task = None
        self._expect_text_frame_next = False
        self._sentences: List[BufferedSentence] = []
    # queue output from tts one sentence at a time. associate a buffer of audio frames with the content of
    # each text frame.
    #
    # start a coroutine to service the queue and send sentences down the pipeline when possible.
    # 1. do not send anything when we are not in VADState.QUIET
    # 2. if we are in VADState.QUIET, send a sentence, estimate how long it will take for that sentence
    #    to output, sleep until it's time to send another sentence
    # 3. each time we send a sentence, append it to the conversation context
    # 3. when the sentence buffer becomes empty, cancel the coroutine
    # 4. if we get a new LLMFullResponse, treat that as a cancellation, too
    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)
        try:
            # A TTSService will emit a series of AudioRawFrame objects, then a TTSStoppedFrame,
            # then a TextFrame.
            if self._expect_text_frame_next:
                self._expect_text_frame_next = False
                if isinstance(frame, TextFrame):
                    self._sentences[-1].text_frame = frame
                else:
                    logger.debug(f"expected a text frame, but received {frame}")
                    await self.push_frame(frame, direction)
                return
            else:
                if isinstance(frame, TextFrame):
                    logger.error(f"XXXXXXXXXXXXXXXXXXX received a text frame, wasn't expecting it.")
            if isinstance(frame, AudioRawFrame):
                # if our buffer is empty or has a "finished" sentence at the end,
                # then we need to start buffering a new sentence
                if not self._sentences or self._sentences[-1].text_frame:
                    self._sentences.append(BufferedSentence())
                self._sentences[-1].audio_frames.append(frame)
                await self.maybe_start_audio_pusher_task()
                return
            if isinstance(frame, TTSStoppedFrame):
                self._expect_text_frame_next = True
                await self.push_frame(frame, direction)
                return
            # There are two ways we can be interrupted. During greedy inference, a new
            # LLM response can start. Or, during playout, we can get a traditional
            # user interruption frame.
            if (isinstance(frame, LLMFullResponseStartFrame) or
                    isinstance(frame, StartInterruptionFrame)):
                logger.debug(f"{frame} - Handle interruption in VADGate")
                self._sentences = []
                if self._audio_pusher_task:
                    self._audio_pusher_task.cancel()
                    self._audio_pusher_task = None
                await self.push_frame(frame, direction)
                return
            await self.push_frame(frame, direction)
        except Exception as e:
            logger.debug(f"error: {e}")
    async def maybe_start_audio_pusher_task(self):
        try:
            if self._audio_pusher_task:
                return
            self._audio_pusher_task = self.get_event_loop().create_task(self.push_audio())
        except Exception as e:
            logger.debug(f"Exception {e}")
    async def push_audio(self):
        try:
            while True:
                if not self._sentences:
                    await asyncio.sleep(0.01)
                    continue
                if self.vad_analyzer._vad_state != VADState.QUIET:
                    await asyncio.sleep(0.01)
                    continue
                # we only want to push completed sentence buffers
                if not self._sentences[0].text_frame:
                    await asyncio.sleep(0.01)
                    continue
                s = self._sentences.pop(0)
                if not s.audio_frames:
                    continue
                sample_rate = s.audio_frames[0].sample_rate
                duration = 0
                logger.debug(f"Pushing {len(s.audio_frames)} audio frames")
                for frame in s.audio_frames:
                    await self.push_frame(frame)
                    # assume linear16 encoding (2 bytes per sample). todo: add some more
                    # metadata to AudioRawFrame, maybe
                    duration += (len(frame.audio) / 2 / frame.num_channels) / sample_rate
                await asyncio.sleep(duration - 20 / 1000)
                if self.context:
                    logger.debug(f"Appending assistant message to context: [{s.text_frame.text}]")
                    self.context.messages.append(
                        {"role": "assistant", "content": s.text_frame.text}
                    )
                await self.push_frame(s.text_frame)
        except Exception as e:
            logger.debug(f"Exception {e}")
 class TranscriptionTimingLogger(FrameProcessor):
    def __init__(self, avt):
        super().__init__()
        self.name = "Transcription"
        self._avt = avt
    async def process_frame(self, frame: Frame, direction: FrameDirection):
        try:
            await super().process_frame(frame, direction)
            if isinstance(frame, TranscriptionFrame):
                elapsed = time.time() - self._avt.last_transition_ts
                logger.debug(f"Transcription TTF: {elapsed}")
                await self.push_frame(MetricsFrame(ttfb={self.name: elapsed}))
            await self.push_frame(frame, direction)
        except Exception as e:
            logger.debug(f"Exception {e}")
 class AudioVolumeTimer(FrameProcessor):
    def __init__(self):
        super().__init__()
        self.last_transition_ts = 0
        self._prev_volume = -80
        self._speech_volume_threshold = -50
    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)
        if isinstance(frame, AudioRawFrame):
            volume = self.calculate_volume(frame)
            # print(f"Audio volume: {volume:.2f} dB")
            if (volume >= self._speech_volume_threshold and
                    self._prev_volume < self._speech_volume_threshold):
                # logger.debug("transition above speech volume threshold")
                self.last_transition_ts = time.time()
            elif (volume < self._speech_volume_threshold and
                    self._prev_volume >= self._speech_volume_threshold):
                # logger.debug("transition below non-speech volume threshold")
                self.last_transition_ts = time.time()
            self._prev_volume = volume
        await self.push_frame(frame, direction)
    def calculate_volume(self, frame: AudioRawFrame) -> float:
        if frame.num_channels != 1:
            raise ValueError(f"Expected 1 channel, got {frame.num_channels}")
        # Unpack audio data into 16-bit integers
        fmt = f"{len(frame.audio)//2}h"
        audio_samples = struct.unpack(fmt, frame.audio)
        # Calculate RMS
        sum_squares = sum(sample**2 for sample in audio_samples)
        rms = math.sqrt(sum_squares / len(audio_samples))
        # Convert RMS to decibels (dB)
        # Reference: maximum value for 16-bit audio is 32767
        if rms > 0:
            db = 20 * math.log10(rms / 32767)
        else:
            db = -96  # Minimum value (almost silent)
        return db
--- a/examples/fast-bot-metrics/runner.py
+++ b/examples/fast-bot-metrics/runner.py
@@ -0,0 +1,58 @@
 import argparse
 import os
 import time
 import urllib
 import requests
 def configure():
    parser = argparse.ArgumentParser(description="Daily AI SDK Bot Sample")
    parser.add_argument(
        "-u",
        "--url",
        type=str,
        required=False,
        help="URL of the Daily room to join")
    parser.add_argument(
        "-k",
        "--apikey",
        type=str,
        required=False,
        help="Daily API Key (needed to create an owner token for the room)",
    )
    args, unknown = parser.parse_known_args()
    url = args.url or os.getenv("DAILY_SAMPLE_ROOM_URL")
    key = args.apikey or os.getenv("DAILY_API_KEY")
    if not url:
        raise Exception(
            "No Daily room specified. use the -u/--url option from the command line, or set DAILY_SAMPLE_ROOM_URL in your environment to specify a Daily room URL.")
    if not key:
        raise Exception("No Daily API key specified. use the -k/--apikey option from the command line, or set DAILY_API_KEY in your environment to specify a Daily API key, available from https://dashboard.daily.co/developers.")
    # Create a meeting token for the given room with an expiration 1 hour in
    # the future.
    room_name: str = urllib.parse.urlparse(url).path[1:]
    expiration: float = time.time() + 60 * 60
    res: requests.Response = requests.post(
        f"https://api.daily.co/v1/meeting-tokens",
        headers={
            "Authorization": f"Bearer {key}"},
        json={
            "properties": {
                "room_name": room_name,
                "is_owner": True,
                "exp": expiration}},
    )
    if res.status_code != 200:
        raise Exception(
            f"Failed to create meeting token: {res.status_code} {res.text}")
    token: str = res.json()["token"]
    return (url, token)
--- a/examples/fast-bot-metrics/vad-gated-pipeline.py
+++ b/examples/fast-bot-metrics/vad-gated-pipeline.py
@@ -0,0 +1,165 @@
 #
 # Copyright (c) 2024, Daily
 #
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 from loguru import logger
 from runner import configure
 import asyncio
 import aiohttp
 import os
 import sys
 from typing import List
 from pipecat.vad.vad_analyzer import VADParams
 from pipecat.vad.silero import SileroVADAnalyzer
 from pipecat.transports.services.daily import DailyParams, DailyTransport, DailyTransportMessageFrame
 from pipecat.services.openai import OpenAILLMService, OpenAILLMContext
 from pipecat.services.deepgram import DeepgramSTTService
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.processors.logger import FrameLogger
 from pipecat.frames.frames import LLMMessagesFrame
 from fastbothelpers import (
    GreedyLLMAggregator,
    ClearableDeepgramTTSService,
    VADGate,
    AudioVolumeTimer,
    TranscriptionTimingLogger
 )
 from dotenv import load_dotenv
 load_dotenv(override=True)
 logger.remove(0)
 logger.add(sys.stderr, level="DEBUG")
 async def main(room_url: str, token):
    async with aiohttp.ClientSession() as session:
        transport = DailyTransport(
            room_url,
            token,
            "Respond bot",
            DailyParams(
                audio_out_enabled=True,
                transcription_enabled=False,
                vad_enabled=True,
                vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.400)),
                vad_audio_passthrough=True
            )
        )
        stt = DeepgramSTTService(
            api_key=os.getenv("DEEPGRAM_API_KEY"),
            **({'url': url} if (url := os.getenv("DEEPGRAM_STT_URL")) else {})
        )
        tts = ClearableDeepgramTTSService(
            name="Voice",
            aiohttp_session=session,
            api_key=os.getenv("DEEPGRAM_API_KEY"),
            voice="aura-asteria-en",
            **({'base_url': url} if (url := os.getenv("DEEPGRAM_TTS_BASE_URL")) else {})
        )
        llm = OpenAILLMService(
            name="LLM",
            # To use OpenAI
            api_key=os.getenv("OPENAI_API_KEY"),
            model=os.getenv("OPENAI_MODEL"),
            base_url=os.getenv("OPENAI_BASE_URL")
        )
        messages = [
            {
                "role": "system",
                "content": """You are a helpful assistant in an audio conversation.
 Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers.
 Respond to what the user said in a creative and helpful way. Be concise in your answers to basic questions. If you are asked to elaborate or tell a story, provide a longer response.
 """,
            },
        ]
        ctx = OpenAILLMContext()
        greedy = GreedyLLMAggregator(name="greedy", context=ctx)
        gate = VADGate(name="gate", vad_analyzer=transport.input().vad_analyzer(), context=ctx)
        avt = AudioVolumeTimer()
        tl = TranscriptionTimingLogger(avt)
        pipeline = Pipeline([
            transport.input(),   # Transport user input
            avt,
            stt,
            tl,
            greedy,
            llm,                 # LLM
            tts,                 # TTS
            gate,
            transport.output(),  # Transport bot output
            # FrameLogger()
        ])
        task = PipelineTask(
            pipeline,
            PipelineParams(
                allow_interruptions=True,
                enable_metrics=True,
                report_only_initial_ttfb=True
            ))
        # When a participant joins, start transcription for that participant so the
        # bot can "hear" and respond to them.
        # @ transport.event_handler("on_participant_joined")
        # async def on_participant_joined(transport, participant):
        #    transport.capture_participant_transcription(participant["id"])
        # When the first participant joins, the bot should introduce itself.
        @ transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            messages.append(
                {"role": "system", "content": "Please introduce yourself to the user."})
            await task.queue_frames([LLMMessagesFrame(messages)])
        # Handle "latency-ping" messages. The client will send app messages that look like
        # this:
        #   { "latency-ping": { ts: <client-side timestamp> }}
        #
        # We want to send an immediate pong back to the client from this handler function.
        # Also, we will push a frame into the top of the pipeline and send it after the
        #
        @ transport.event_handler("on_app_message")
        async def on_app_message(transport, message, sender):
            try:
                if "latency-ping" in message:
                    logger.debug(f"Received latency ping app message: {message}")
                    ts = message["latency-ping"]["ts"]
                    # Send immediately
                    transport.output().send_message(DailyTransportMessageFrame(
                        message={"latency-pong-msg-handler": {"ts": ts}},
                        participant_id=sender))
                    # And push to the pipeline for the Daily transport.output to send
                    await tma_in.push_frame(
                        DailyTransportMessageFrame(
                            message={"latency-pong-pipeline-delivery": {"ts": ts}},
                            participant_id=sender))
            except Exception as e:
                logger.debug(f"message handling error: {e} - {message}")
        runner = PipelineRunner()
        await runner.run(task)
 if __name__ == "__main__":
    (url, token) = configure()
    asyncio.run(main(url, token))
Author	SHA1	Message	Date
Kwindla Hultman Kramer	40f2ed41e9	bot-runner-ified classic-pipelinepy	2024-06-24 01:37:19 +00:00
Kwindla Hultman Kramer	0875fca15d	hacking classic-pipeline.py to be runnable by bot_runner	2024-06-23 20:56:21 -04:00
Kwindla Hultman Kramer	cd67885d0b	khk working on fast-bot-runner bot_runner	2024-06-23 20:28:37 -04:00
Kwindla Hultman Kramer	fa25234296	removed unnecessary file	2024-06-23 19:28:31 -04:00
Kwindla Hultman Kramer	93a4c2cec6	shorten stop_secs in vad-gated-pipeline	2024-06-23 22:40:58 +00:00
Kwindla Hultman Kramer	5ea5d42317	changed tts service name to 'Voice'	2024-06-23 18:13:59 -04:00
Kwindla Hultman Kramer	af2982eeb3	classic pipeline now, too, in fast-bot-metrics directory	2024-06-23 18:12:09 -04:00
Kwindla Hultman Kramer	24189b2591	cleaning up overlapping vad code, helpers, and metrics	2024-06-23 16:08:35 -04:00
Kwindla Hultman Kramer	cf87a3cac1	khk local dg configured for tesla-brilliant	2024-06-23 15:43:21 +00:00
Kwindla Hultman Kramer	533b1f8b56	timing transcription delivery	2024-06-22 21:33:35 -04:00