Compare commits

..

1 Commits

Author SHA1 Message Date
Varun Singh
fbb40bfa92 adding codec 2025-04-19 16:43:18 -04:00
11 changed files with 56 additions and 97 deletions

View File

@@ -9,10 +9,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Added `SmartTurnMetricsData`, which contains end-of-turn prediction metrics,
to the `MetricsFrame`. Using `MetricsFrame`, you can now retrieve prediction
confidence scores and processing time metrics from the smart turn analyzers.
- Added support for Application Default Credentials in Google services,
`GoogleSTTService`, `GoogleTTSService`, and `GoogleVertexLLMService`.

View File

@@ -10,27 +10,24 @@ import aiohttp
import modal
from bot import _voice_bot_process
from fastapi import HTTPException
from fastapi.responses import RedirectResponse
from fastapi.responses import JSONResponse
from loguru import logger
MAX_SESSION_TIME = 15 * 60 # 15 minutes
image = (
modal.Image.debian_slim(python_version="3.13")
.apt_install("ffmpeg")
.pip_install_from_requirements("requirements.txt")
.pip_install("pipecat-ai[daily,silero,cartesia,openai]")
.add_local_python_source("bot")
)
app = modal.App("pipecat-modal")
app = modal.App("pipecat-modal", image=image)
image = modal.Image.debian_slim(python_version="3.12").pip_install_from_requirements(
"requirements.txt"
)
@app.function(
image=image,
cpu=1.0,
secrets=[modal.Secret.from_dotenv()],
min_containers=1,
keep_warm=1,
enable_memory_snapshot=True,
max_inputs=1, # Do not reuse instances across requests
retries=0,
@@ -43,7 +40,7 @@ def launch_bot_process(room_url: str, token: str):
image=image,
secrets=[modal.Secret.from_dotenv()],
)
@modal.fastapi_endpoint(method="GET")
@modal.web_endpoint(method="POST")
async def start():
from pipecat.transports.services.helpers.daily_rest import (
DailyRESTHelper,
@@ -80,4 +77,4 @@ async def start():
# Return room URL to the user to join
# Note: in production, you would want to return a token to the user
return RedirectResponse(room.url)
return JSONResponse(content={"room_url": room.url, token: token})

View File

@@ -1,4 +1,5 @@
python-dotenv==1.0.1
modal==0.71.3
pipecat-ai[daily,silero,cartesia,openai]==0.0.52
fastapi==0.115.6
aiohttp==3.11.11

View File

@@ -56,7 +56,7 @@ async def create_daily_room(room_url: str = None, config_body: Dict[str, Any] =
sip_params = None
if capabilities["enable_dialin"]:
sip_params = DailyRoomSipParams(
display_name="dialin-user", video=False, sip_mode="dial-in", num_endpoints=2
display_name="dialin-user", video=False, sip_mode="dial-in", num_endpoints=2, "codecs": {"audio": ["PCMU"]}
)
# Create the properties object with the appropriate settings

View File

@@ -6,14 +6,14 @@ build-backend = "setuptools.build_meta"
name = "pipecat-ai"
dynamic = ["version"]
description = "An open source framework for voice (and multimodal) assistants"
license = "BSD-2-Clause"
license-files = ["LICENSE"]
license = { text = "BSD 2-Clause License" }
readme = "README.md"
requires-python = ">=3.10"
keywords = ["webrtc", "audio", "video", "ai"]
classifiers = [
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"License :: OSI Approved :: BSD License",
"Topic :: Communications :: Conferencing",
"Topic :: Multimedia :: Sound/Audio",
"Topic :: Multimedia :: Video",
@@ -92,11 +92,9 @@ websocket = [ "websockets~=13.1", "fastapi~=0.115.6" ]
whisper = [ "faster-whisper~=1.1.1" ]
[tool.setuptools.packages.find]
# All the following settings are optional:
where = ["src"]
[tool.setuptools.package-data]
"pipecat" = ["py.typed"]
[tool.pytest.ini_options]
addopts = "--verbose"
testpaths = ["tests"]

View File

@@ -6,14 +6,13 @@
import time
from abc import abstractmethod
from typing import Any, Dict, Optional, Tuple
from typing import Dict, Optional
import numpy as np
from loguru import logger
from pydantic import BaseModel
from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, EndOfTurnState
from pipecat.metrics.metrics import MetricsData, SmartTurnMetricsData
# Default timing parameters
STOP_SECS = 3
@@ -62,6 +61,7 @@ class BaseSmartTurn(BaseTurnAnalyzer):
self._speech_triggered = True
if self._speech_start_time is None:
self._speech_start_time = time.time()
logger.debug(f"Speech started at {self._speech_start_time}")
else:
if self._speech_triggered:
chunk_duration_ms = len(audio_int16) / (self._sample_rate / 1000)
@@ -87,25 +87,28 @@ class BaseSmartTurn(BaseTurnAnalyzer):
return state
def analyze_end_of_turn(self) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
state, result = self._process_speech_segment(self._audio_buffer)
def analyze_end_of_turn(self) -> EndOfTurnState:
logger.debug("Analyzing End of Turn...")
state = self._process_speech_segment(self._audio_buffer)
if state == EndOfTurnState.COMPLETE or USE_ONLY_LAST_VAD_SEGMENT:
self._clear(state)
logger.debug(f"End of Turn result: {state}")
return state, result
return state
def _clear(self, turn_state: EndOfTurnState):
# Reset internal state for next turn
logger.debug("Clearing audio buffer...")
# If the state is still incomplete, keep the _speech_triggered as True
self._speech_triggered = turn_state == EndOfTurnState.INCOMPLETE
self._audio_buffer = []
self._speech_start_time = None
self._silence_ms = 0
def _process_speech_segment(self, audio_buffer) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
def _process_speech_segment(self, audio_buffer) -> EndOfTurnState:
state = EndOfTurnState.INCOMPLETE
if not audio_buffer:
return state, None
return state
# Extract recent audio segment for prediction
start_time = self._speech_start_time - (self._params.pre_speech_ms / 1000)
@@ -121,13 +124,15 @@ class BaseSmartTurn(BaseTurnAnalyzer):
segment_audio_chunks = [chunk for _, chunk in audio_buffer[start_index : end_index + 1]]
segment_audio = np.concatenate(segment_audio_chunks)
logger.debug(f"Segment audio chunks after start index: {len(segment_audio)}")
# Limit maximum duration
max_samples = int(self._params.max_duration_secs * self.sample_rate)
if len(segment_audio) > max_samples:
# slices the array to keep the last max_samples samples, discarding the earlier part.
segment_audio = segment_audio[-max_samples:]
result_data = None
logger.debug(f"Segment audio chunks after limiting duration: {len(segment_audio)}")
if len(segment_audio) > 0:
start_time = time.perf_counter()
@@ -137,33 +142,20 @@ class BaseSmartTurn(BaseTurnAnalyzer):
)
end_time = time.perf_counter()
# Calculate processing time
e2e_processing_time_ms = (end_time - start_time) * 1000
# Prepare the result data
result_data = SmartTurnMetricsData(
processor="BaseSmartTurn",
is_complete=result["prediction"] == 1,
probability=result["probability"],
inference_time_ms=result.get("inference_time", 0) * 1000,
server_total_time_ms=result.get("total_time", 0) * 1000,
e2e_processing_time_ms=e2e_processing_time_ms,
)
logger.trace(f"Prediction: {'Complete' if result_data.is_complete else 'Incomplete'}")
logger.trace(f"Probability of complete: {result_data.probability:.4f}")
logger.trace(f"Inference time: {result_data.inference_time_ms:.2f}ms")
logger.trace(f"Server total time: {result_data.server_total_time_ms:.2f}ms")
logger.trace(f"E2E processing time: {result_data.e2e_processing_time_ms:.2f}ms")
logger.debug("--------")
logger.debug(f"Prediction: {'Complete' if result['prediction'] == 1 else 'Incomplete'}")
logger.debug(f"Probability of complete: {result['probability']:.4f}")
logger.debug(f"Prediction took {(end_time - start_time) * 1000:.2f}ms seconds")
else:
logger.trace(f"params: {self._params}, stop_ms: {self._stop_ms}")
logger.trace("Captured empty audio segment, skipping prediction.")
logger.debug(f"params: {self._params}, stop_ms: {self._stop_ms}")
logger.debug("Captured empty audio segment, skipping prediction.")
return state, result_data
return state
@abstractmethod
def _predict_endpoint(self, buffer: np.ndarray) -> Dict[str, Any]:
"""Abstract method to predict if a turn has ended based on audio.
def _predict_endpoint(self, buffer: np.ndarray) -> Dict[str, any]:
"""
Abstract method to predict if a turn has ended based on audio.
Args:
buffer: Float32 numpy array of audio samples at 16kHz.

View File

@@ -6,9 +6,7 @@
from abc import ABC, abstractmethod
from enum import Enum
from typing import Optional, Tuple
from pipecat.metrics.metrics import MetricsData
from typing import Optional
class EndOfTurnState(Enum):
@@ -17,10 +15,8 @@ class EndOfTurnState(Enum):
class BaseTurnAnalyzer(ABC):
"""Abstract base class for analyzing user end of turn.
This class inherits from BaseObject to leverage its event handling system
while still defining an abstract interface through abstract methods.
"""
Abstract base class for analyzing user end of turn.
"""
def __init__(self, *, sample_rate: Optional[int] = None):
@@ -29,7 +25,8 @@ class BaseTurnAnalyzer(ABC):
@property
def sample_rate(self) -> int:
"""Returns the current sample rate.
"""
Returns the current sample rate.
Returns:
int: The effective sample rate for audio processing.
@@ -37,7 +34,8 @@ class BaseTurnAnalyzer(ABC):
return self._sample_rate
def set_sample_rate(self, sample_rate: int):
"""Sets the sample rate for audio processing.
"""
Sets the sample rate for audio processing.
If the initial sample rate was provided, it will use that; otherwise, it sets to
the provided sample rate.
@@ -50,7 +48,8 @@ class BaseTurnAnalyzer(ABC):
@property
@abstractmethod
def speech_triggered(self) -> bool:
"""Determines if speech has been detected.
"""
Determines if speech has been detected.
Returns:
bool: True if speech is triggered, otherwise False.
@@ -59,7 +58,8 @@ class BaseTurnAnalyzer(ABC):
@abstractmethod
def append_audio(self, buffer: bytes, is_speech: bool) -> EndOfTurnState:
"""Appends audio data for analysis.
"""
Appends audio data for analysis.
Args:
buffer (bytes): The audio data to append.
@@ -71,8 +71,9 @@ class BaseTurnAnalyzer(ABC):
pass
@abstractmethod
def analyze_end_of_turn(self) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
"""Analyzes if an end of turn has occurred based on the audio input.
def analyze_end_of_turn(self) -> EndOfTurnState:
"""
Analyzes if an end of turn has occurred based on the audio input.
Returns:
EndOfTurnState: The result of the end of turn analysis.

View File

@@ -30,13 +30,3 @@ class LLMUsageMetricsData(MetricsData):
class TTSUsageMetricsData(MetricsData):
value: int
class SmartTurnMetricsData(MetricsData):
"""Metrics data for smart turn predictions."""
is_complete: bool
probability: float
inference_time_ms: float
server_total_time_ms: float
e2e_processing_time_ms: float

View File

@@ -0,0 +1 @@

View File

@@ -70,7 +70,7 @@ class OpenAITTSService(TTSService):
if sample_rate and sample_rate != self.OPENAI_SAMPLE_RATE:
logger.warning(
f"OpenAI TTS only supports {self.OPENAI_SAMPLE_RATE}Hz sample rate. "
f"Current rate of {sample_rate}Hz may cause issues."
f"Current rate of {self.sample_rate}Hz may cause issues."
)
super().__init__(sample_rate=sample_rate, **kwargs)

View File

@@ -6,14 +6,11 @@
import asyncio
from concurrent.futures import ThreadPoolExecutor
from typing import Mapping, Optional
from typing import Optional
from loguru import logger
from pipecat.audio.turn.base_turn_analyzer import (
BaseTurnAnalyzer,
EndOfTurnState,
)
from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, EndOfTurnState
from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADState
from pipecat.frames.frames import (
BotInterruptionFrame,
@@ -24,7 +21,6 @@ from pipecat.frames.frames import (
FilterUpdateSettingsFrame,
Frame,
InputAudioRawFrame,
MetricsFrame,
StartFrame,
StartInterruptionFrame,
StopInterruptionFrame,
@@ -33,7 +29,6 @@ from pipecat.frames.frames import (
UserStoppedSpeakingFrame,
VADParamsUpdateFrame,
)
from pipecat.metrics.metrics import MetricsData, SmartTurnMetricsData
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.transports.base_transport import TransportParams
@@ -83,7 +78,6 @@ class BaseInputTransport(FrameProcessor):
# Configure End of turn analyzer.
if self._params.turn_analyzer:
self._params.turn_analyzer.set_sample_rate(self._sample_rate)
# Start audio filter.
if self._params.audio_in_filter:
await self._params.audio_in_filter.start(self._sample_rate)
@@ -222,12 +216,9 @@ class BaseInputTransport(FrameProcessor):
async def _handle_end_of_turn(self):
if self.turn_analyzer:
state, prediction = await self.get_event_loop().run_in_executor(
state = await self.get_event_loop().run_in_executor(
self._executor, self.turn_analyzer.analyze_end_of_turn
)
await self._handle_prediction_result(prediction)
await self._handle_end_of_turn_complete(state)
async def _handle_end_of_turn_complete(self, state: EndOfTurnState):
@@ -272,11 +263,3 @@ class BaseInputTransport(FrameProcessor):
await self.push_frame(frame)
self._audio_in_queue.task_done()
async def _handle_prediction_result(self, result: MetricsData):
"""Handle a prediction result event from the turn analyzer.
Args:
result: The prediction result MetricsData.
"""
await self.push_frame(MetricsFrame(data=[result]))