Compare commits
25 Commits
jpt/fastbo
...
v0.0.35
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8dff460307 | ||
|
|
cce1ddb183 | ||
|
|
8691d14289 | ||
|
|
dd402da9e5 | ||
|
|
2fd04248f1 | ||
|
|
0ac42006f8 | ||
|
|
66e331248d | ||
|
|
4be3e8c87d | ||
|
|
dac033fe61 | ||
|
|
d302cbb114 | ||
|
|
e3b407db28 | ||
|
|
4ef623f09e | ||
|
|
253530a63d | ||
|
|
4f38d989f5 | ||
|
|
84074e90ee | ||
|
|
38aee7d8f2 | ||
|
|
64198313c6 | ||
|
|
d61b6c301c | ||
|
|
83d1931266 | ||
|
|
c31f2ab285 | ||
|
|
0ddc5721b4 | ||
|
|
98bd183bc4 | ||
|
|
aaa154524c | ||
|
|
beced68337 | ||
|
|
94823ab952 |
44
CHANGELOG.md
44
CHANGELOG.md
@@ -5,6 +5,50 @@ All notable changes to **pipecat** will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [0.0.35] - 2024-06-28
|
||||
|
||||
### Changed
|
||||
|
||||
- `FastAPIWebsocketParams` now require a serializer.
|
||||
|
||||
- `TwilioFrameSerializer` now requires a `streamSid`.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Silero VAD number of frames needs to be 512 for 16000 sample rate or 256 for
|
||||
8000 sample rate.
|
||||
|
||||
## [0.0.34] - 2024-06-25
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue with asynchronous STT services (Deepgram and Azure) that could
|
||||
interruptions to ignore transcriptions.
|
||||
|
||||
- Fixed an issue introduced in 0.0.33 that would cause the LLM to generate
|
||||
shorter output.
|
||||
|
||||
## [0.0.33] - 2024-06-25
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgraded to Cartesia's new Python library 1.0.0. `CartesiaTTSService` now
|
||||
expects a voice ID instead of a voice name (you can get the voice ID from
|
||||
Cartesia's playground). You can also specify the audio `sample_rate` and
|
||||
`encoding` instead of the previous `output_format`.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue with asynchronous STT services (Deepgram and Azure) that could
|
||||
cause static audio issues and interruptions to not work properly when dealing
|
||||
with multiple LLMs sentences.
|
||||
|
||||
- Fixed an issue that could mix new LLM responses with previous ones when
|
||||
handling interruptions.
|
||||
|
||||
- Fixed a Daily transport blocking situation that occurred while reading audio
|
||||
frames after a participant left the room. Needs daily-python >= 0.10.1.
|
||||
|
||||
## [0.0.32] - 2024-06-22
|
||||
|
||||
### Added
|
||||
|
||||
@@ -38,7 +38,6 @@ async def main(room_url: str, token):
|
||||
"Respond bot",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
audio_out_sample_rate=44100,
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer()
|
||||
@@ -47,8 +46,7 @@ async def main(room_url: str, token):
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_name="British Lady",
|
||||
output_format="pcm_44100"
|
||||
voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
|
||||
@@ -66,7 +66,6 @@ async def main(room_url: str, token):
|
||||
"Pipecat",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
audio_out_sample_rate=44100,
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer()
|
||||
@@ -75,20 +74,17 @@ async def main(room_url: str, token):
|
||||
|
||||
news_lady = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_name="Newslady",
|
||||
output_format="pcm_44100"
|
||||
voice_id="bf991597-6c13-47e4-8411-91ec2de5c466", # Newslady
|
||||
)
|
||||
|
||||
british_lady = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_name="British Lady",
|
||||
output_format="pcm_44100"
|
||||
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
|
||||
)
|
||||
|
||||
barbershop_man = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_name="Barbershop Man",
|
||||
output_format="pcm_44100"
|
||||
voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
|
||||
@@ -899,11 +899,11 @@ brace-expansion@^2.0.1:
|
||||
balanced-match "^1.0.0"
|
||||
|
||||
braces@^3.0.2, braces@~3.0.2:
|
||||
version "3.0.2"
|
||||
resolved "https://registry.yarnpkg.com/braces/-/braces-3.0.2.tgz#3454e1a462ee8d599e236df336cd9ea4f8afe107"
|
||||
integrity sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A==
|
||||
version "3.0.3"
|
||||
resolved "https://registry.yarnpkg.com/braces/-/braces-3.0.3.tgz#490332f40919452272d55a8480adc0c441358789"
|
||||
integrity "sha1-SQMy9AkZRSJy1VqEgK3AxEE1h4k= sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA=="
|
||||
dependencies:
|
||||
fill-range "^7.0.1"
|
||||
fill-range "^7.1.1"
|
||||
|
||||
browserslist@^4.23.0:
|
||||
version "4.23.0"
|
||||
@@ -1551,10 +1551,10 @@ file-entry-cache@^6.0.1:
|
||||
dependencies:
|
||||
flat-cache "^3.0.4"
|
||||
|
||||
fill-range@^7.0.1:
|
||||
version "7.0.1"
|
||||
resolved "https://registry.yarnpkg.com/fill-range/-/fill-range-7.0.1.tgz#1919a6a7c75fe38b2c7c77e5198535da9acdda40"
|
||||
integrity sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==
|
||||
fill-range@^7.1.1:
|
||||
version "7.1.1"
|
||||
resolved "https://registry.yarnpkg.com/fill-range/-/fill-range-7.1.1.tgz#44265d3cac07e3ea7dc247516380643754a05292"
|
||||
integrity "sha1-RCZdPKwH4+p9wkdRY4BkN1SgUpI= sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg=="
|
||||
dependencies:
|
||||
to-regex-range "^5.0.1"
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ from pipecat.services.deepgram import DeepgramSTTService
|
||||
from pipecat.services.elevenlabs import ElevenLabsTTSService
|
||||
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketTransport, FastAPIWebsocketParams
|
||||
from pipecat.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.serializers.twilio import TwilioFrameSerializer
|
||||
|
||||
from loguru import logger
|
||||
|
||||
@@ -25,7 +26,7 @@ logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
async def run_bot(websocket_client):
|
||||
async def run_bot(websocket_client, stream_sid):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = FastAPIWebsocketTransport(
|
||||
websocket=websocket_client,
|
||||
@@ -34,7 +35,8 @@ async def run_bot(websocket_client):
|
||||
add_wav_header=False,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
vad_audio_passthrough=True
|
||||
vad_audio_passthrough=True,
|
||||
serializer=TwilioFrameSerializer(stream_sid)
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
import json
|
||||
|
||||
import uvicorn
|
||||
|
||||
from fastapi import FastAPI, WebSocket
|
||||
@@ -26,8 +28,13 @@ async def start_call():
|
||||
@app.websocket("/ws")
|
||||
async def websocket_endpoint(websocket: WebSocket):
|
||||
await websocket.accept()
|
||||
start_data = websocket.iter_text()
|
||||
await start_data.__anext__()
|
||||
call_data = json.loads(await start_data.__anext__())
|
||||
print(call_data, flush=True)
|
||||
stream_sid = call_data['start']['streamSid']
|
||||
print("WebSocket connection accepted")
|
||||
await run_bot(websocket)
|
||||
await run_bot(websocket, stream_sid)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
#
|
||||
# pip-compile --all-extras pyproject.toml
|
||||
#
|
||||
aiofiles==23.2.1
|
||||
aiofiles==24.1.0
|
||||
# via deepgram-sdk
|
||||
aiohttp==3.9.5
|
||||
# via
|
||||
@@ -44,15 +44,13 @@ blinker==1.8.2
|
||||
# via flask
|
||||
cachetools==5.3.3
|
||||
# via google-auth
|
||||
cartesia==0.1.1
|
||||
cartesia==1.0.0
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
certifi==2024.6.2
|
||||
# via
|
||||
# httpcore
|
||||
# httpx
|
||||
# requests
|
||||
cffi==1.16.0
|
||||
# via sounddevice
|
||||
charset-normalizer==3.3.2
|
||||
# via requests
|
||||
click==8.1.7
|
||||
@@ -64,7 +62,7 @@ coloredlogs==15.0.1
|
||||
# via onnxruntime
|
||||
ctranslate2==4.3.1
|
||||
# via faster-whisper
|
||||
daily-python==0.10.0
|
||||
daily-python==0.10.1
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
dataclasses-json==0.6.7
|
||||
# via
|
||||
@@ -94,7 +92,7 @@ fastapi-cli==0.0.4
|
||||
# via fastapi
|
||||
faster-whisper==1.0.2
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
filelock==3.15.3
|
||||
filelock==3.15.4
|
||||
# via
|
||||
# huggingface-hub
|
||||
# pyht
|
||||
@@ -121,7 +119,7 @@ future==1.0.0
|
||||
# via pyloudnorm
|
||||
google-ai-generativelanguage==0.6.4
|
||||
# via google-generativeai
|
||||
google-api-core[grpc]==2.19.0
|
||||
google-api-core[grpc]==2.19.1
|
||||
# via
|
||||
# google-ai-generativelanguage
|
||||
# google-api-python-client
|
||||
@@ -139,7 +137,7 @@ google-auth-httplib2==0.2.0
|
||||
# via google-api-python-client
|
||||
google-generativeai==0.5.4
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
googleapis-common-protos==1.63.1
|
||||
googleapis-common-protos==1.63.2
|
||||
# via
|
||||
# google-api-core
|
||||
# grpcio-status
|
||||
@@ -219,7 +217,7 @@ langchain-openai==0.1.9
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
langchain-text-splitters==0.2.1
|
||||
# via langchain
|
||||
langsmith==0.1.81
|
||||
langsmith==0.1.82
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
@@ -338,8 +336,6 @@ pyasn1-modules==0.4.0
|
||||
# via google-auth
|
||||
pyaudio==0.2.14
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
pydantic==2.7.4
|
||||
# via
|
||||
# anthropic
|
||||
@@ -404,7 +400,7 @@ safetensors==0.4.3
|
||||
# via
|
||||
# timm
|
||||
# transformers
|
||||
scipy==1.13.1
|
||||
scipy==1.14.0
|
||||
# via pyloudnorm
|
||||
shellingham==1.5.4
|
||||
# via typer
|
||||
@@ -416,8 +412,6 @@ sniffio==1.3.1
|
||||
# anyio
|
||||
# httpx
|
||||
# openai
|
||||
sounddevice==0.4.7
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
sqlalchemy==2.0.31
|
||||
# via
|
||||
# langchain
|
||||
@@ -428,7 +422,7 @@ sympy==1.12.1
|
||||
# via
|
||||
# onnxruntime
|
||||
# torch
|
||||
tenacity==8.4.1
|
||||
tenacity==8.4.2
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.12
|
||||
# This file is autogenerated by pip-compile with Python 3.10
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile --all-extras pyproject.toml
|
||||
#
|
||||
aiofiles==23.2.1
|
||||
aiofiles==24.1.0
|
||||
# via deepgram-sdk
|
||||
aiohttp==3.9.5
|
||||
# via
|
||||
@@ -28,6 +28,10 @@ anyio==4.4.0
|
||||
# openai
|
||||
# starlette
|
||||
# watchfiles
|
||||
async-timeout==4.0.3
|
||||
# via
|
||||
# aiohttp
|
||||
# langchain
|
||||
attrs==23.2.0
|
||||
# via
|
||||
# aiohttp
|
||||
@@ -40,15 +44,13 @@ blinker==1.8.2
|
||||
# via flask
|
||||
cachetools==5.3.3
|
||||
# via google-auth
|
||||
cartesia==0.1.1
|
||||
cartesia==1.0.0
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
certifi==2024.6.2
|
||||
# via
|
||||
# httpcore
|
||||
# httpx
|
||||
# requests
|
||||
cffi==1.16.0
|
||||
# via sounddevice
|
||||
charset-normalizer==3.3.2
|
||||
# via requests
|
||||
click==8.1.7
|
||||
@@ -60,7 +62,7 @@ coloredlogs==15.0.1
|
||||
# via onnxruntime
|
||||
ctranslate2==4.3.1
|
||||
# via faster-whisper
|
||||
daily-python==0.10.0
|
||||
daily-python==0.10.1
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
dataclasses-json==0.6.7
|
||||
# via
|
||||
@@ -78,6 +80,10 @@ einops==0.8.0
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
email-validator==2.2.0
|
||||
# via fastapi
|
||||
exceptiongroup==1.2.1
|
||||
# via
|
||||
# anyio
|
||||
# pytest
|
||||
fal-client==0.4.0
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
fastapi==0.111.0
|
||||
@@ -86,7 +92,7 @@ fastapi-cli==0.0.4
|
||||
# via fastapi
|
||||
faster-whisper==1.0.2
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
filelock==3.15.3
|
||||
filelock==3.15.4
|
||||
# via
|
||||
# huggingface-hub
|
||||
# pyht
|
||||
@@ -112,7 +118,7 @@ future==1.0.0
|
||||
# via pyloudnorm
|
||||
google-ai-generativelanguage==0.6.4
|
||||
# via google-generativeai
|
||||
google-api-core[grpc]==2.19.0
|
||||
google-api-core[grpc]==2.19.1
|
||||
# via
|
||||
# google-ai-generativelanguage
|
||||
# google-api-python-client
|
||||
@@ -130,7 +136,7 @@ google-auth-httplib2==0.2.0
|
||||
# via google-api-python-client
|
||||
google-generativeai==0.5.4
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
googleapis-common-protos==1.63.1
|
||||
googleapis-common-protos==1.63.2
|
||||
# via
|
||||
# google-api-core
|
||||
# grpcio-status
|
||||
@@ -204,11 +210,11 @@ langchain-core==0.2.9
|
||||
# langchain-community
|
||||
# langchain-openai
|
||||
# langchain-text-splitters
|
||||
langchain-openai==0.1.9
|
||||
langchain-openai==0.1.10
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
langchain-text-splitters==0.2.1
|
||||
# via langchain
|
||||
langsmith==0.1.81
|
||||
langsmith==0.1.82
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
@@ -296,8 +302,6 @@ pyasn1-modules==0.4.0
|
||||
# via google-auth
|
||||
pyaudio==0.2.14
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
pydantic==2.7.4
|
||||
# via
|
||||
# anthropic
|
||||
@@ -362,7 +366,7 @@ safetensors==0.4.3
|
||||
# via
|
||||
# timm
|
||||
# transformers
|
||||
scipy==1.13.1
|
||||
scipy==1.14.0
|
||||
# via pyloudnorm
|
||||
shellingham==1.5.4
|
||||
# via typer
|
||||
@@ -374,8 +378,6 @@ sniffio==1.3.1
|
||||
# anyio
|
||||
# httpx
|
||||
# openai
|
||||
sounddevice==0.4.7
|
||||
# via pipecat-ai (pyproject.toml)
|
||||
sqlalchemy==2.0.31
|
||||
# via
|
||||
# langchain
|
||||
@@ -386,7 +388,7 @@ sympy==1.12.1
|
||||
# via
|
||||
# onnxruntime
|
||||
# torch
|
||||
tenacity==8.4.1
|
||||
tenacity==8.4.2
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
@@ -400,6 +402,8 @@ tokenizers==0.19.1
|
||||
# anthropic
|
||||
# faster-whisper
|
||||
# transformers
|
||||
tomli==2.0.1
|
||||
# via pytest
|
||||
torch==2.3.1
|
||||
# via
|
||||
# pipecat-ai (pyproject.toml)
|
||||
@@ -423,6 +427,7 @@ typer==0.12.3
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
# anthropic
|
||||
# anyio
|
||||
# deepgram-sdk
|
||||
# fastapi
|
||||
# google-generativeai
|
||||
@@ -435,6 +440,7 @@ typing-extensions==4.12.2
|
||||
# torch
|
||||
# typer
|
||||
# typing-inspect
|
||||
# uvicorn
|
||||
typing-inspect==0.9.0
|
||||
# via dataclasses-json
|
||||
ujson==5.10.0
|
||||
|
||||
@@ -36,8 +36,8 @@ Website = "https://pipecat.ai"
|
||||
[project.optional-dependencies]
|
||||
anthropic = [ "anthropic~=0.25.7" ]
|
||||
azure = [ "azure-cognitiveservices-speech~=1.37.0" ]
|
||||
cartesia = [ "numpy~=1.26.0", "sounddevice", "cartesia" ]
|
||||
daily = [ "daily-python~=0.10.0" ]
|
||||
cartesia = [ "cartesia~=1.0.0" ]
|
||||
daily = [ "daily-python~=0.10.1" ]
|
||||
deepgram = [ "deepgram-sdk~=3.2.7" ]
|
||||
examples = [ "python-dotenv~=1.0.0", "flask~=3.0.3", "flask_cors~=4.0.1" ]
|
||||
fal = [ "fal-client~=0.4.0" ]
|
||||
|
||||
@@ -17,8 +17,8 @@ class TwilioFrameSerializer(FrameSerializer):
|
||||
AudioRawFrame: "audio",
|
||||
}
|
||||
|
||||
def __init__(self):
|
||||
self._sid = None
|
||||
def __init__(self, stream_sid: str):
|
||||
self._stream_sid = stream_sid
|
||||
|
||||
def serialize(self, frame: Frame) -> str | bytes | None:
|
||||
if not isinstance(frame, AudioRawFrame):
|
||||
@@ -30,7 +30,7 @@ class TwilioFrameSerializer(FrameSerializer):
|
||||
payload = base64.b64encode(serialized_data).decode("utf-8")
|
||||
answer = {
|
||||
"event": "media",
|
||||
"streamSid": self._sid,
|
||||
"streamSid": self._stream_sid,
|
||||
"media": {
|
||||
"payload": payload
|
||||
}
|
||||
@@ -41,9 +41,6 @@ class TwilioFrameSerializer(FrameSerializer):
|
||||
def deserialize(self, data: str | bytes) -> Frame | None:
|
||||
message = json.loads(data)
|
||||
|
||||
if not self._sid:
|
||||
self._sid = message["streamSid"] if "streamSid" in message else None
|
||||
|
||||
if message["event"] != "media":
|
||||
return None
|
||||
else:
|
||||
|
||||
@@ -16,12 +16,13 @@ from pipecat.frames.frames import (
|
||||
EndFrame,
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
LLMFullResponseEndFrame,
|
||||
StartFrame,
|
||||
StartInterruptionFrame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
TextFrame,
|
||||
VisionImageRawFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.utils.audio import calculate_audio_volume
|
||||
@@ -114,13 +115,17 @@ class TTSService(AIService):
|
||||
if self._current_sentence.strip().endswith(
|
||||
(".", "?", "!")) and not self._current_sentence.strip().endswith(
|
||||
("Mr,", "Mrs.", "Ms.", "Dr.")):
|
||||
text = self._current_sentence.strip()
|
||||
text = self._current_sentence
|
||||
self._current_sentence = ""
|
||||
|
||||
if text:
|
||||
await self._push_tts_frames(text)
|
||||
|
||||
async def _push_tts_frames(self, text: str):
|
||||
text = text.strip()
|
||||
if not text:
|
||||
return
|
||||
|
||||
await self.push_frame(TTSStartedFrame())
|
||||
await self.process_generator(self.run_tts(text))
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
@@ -133,14 +138,12 @@ class TTSService(AIService):
|
||||
|
||||
if isinstance(frame, TextFrame):
|
||||
await self._process_text_frame(frame)
|
||||
elif isinstance(frame, EndFrame):
|
||||
if self._current_sentence:
|
||||
await self._push_tts_frames(self._current_sentence)
|
||||
await self.push_frame(frame)
|
||||
elif isinstance(frame, LLMFullResponseEndFrame):
|
||||
if self._current_sentence:
|
||||
await self._push_tts_frames(self._current_sentence.strip())
|
||||
self._current_sentence = ""
|
||||
elif isinstance(frame, StartInterruptionFrame):
|
||||
self._current_sentence = ""
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, LLMFullResponseEndFrame) or isinstance(frame, EndFrame):
|
||||
self._current_sentence = ""
|
||||
await self._push_tts_frames(self._current_sentence)
|
||||
await self.push_frame(frame)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
@@ -12,7 +12,17 @@ import time
|
||||
from PIL import Image
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from pipecat.frames.frames import AudioRawFrame, CancelFrame, EndFrame, ErrorFrame, Frame, StartFrame, SystemFrame, TranscriptionFrame, URLImageRawFrame
|
||||
from pipecat.frames.frames import (
|
||||
AudioRawFrame,
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
StartFrame,
|
||||
StartInterruptionFrame,
|
||||
SystemFrame,
|
||||
TranscriptionFrame,
|
||||
URLImageRawFrame)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.ai_services import AIService, TTSService, ImageGenService
|
||||
from pipecat.services.openai import BaseOpenAILLMService
|
||||
@@ -34,7 +44,7 @@ try:
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error(
|
||||
"In order to use Azure TTS, you need to `pip install pipecat-ai[azure]`. Also, set `AZURE_SPEECH_API_KEY` and `AZURE_SPEECH_REGION` environment variables.")
|
||||
"In order to use Azure, you need to `pip install pipecat-ai[azure]`. Also, set `AZURE_SPEECH_API_KEY` and `AZURE_SPEECH_REGION` environment variables.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
@@ -123,12 +133,18 @@ class AzureSTTService(AIService):
|
||||
speech_config=speech_config, audio_config=audio_config)
|
||||
self._speech_recognizer.recognized.connect(self._on_handle_recognized)
|
||||
|
||||
# This event will be used to ignore out-of-band transcriptions while we
|
||||
# are itnerrupted.
|
||||
self._is_interrupted_event = asyncio.Event()
|
||||
|
||||
self._create_push_task()
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, SystemFrame):
|
||||
if isinstance(frame, StartInterruptionFrame):
|
||||
await self._handle_interruptions(frame)
|
||||
elif isinstance(frame, SystemFrame):
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, AudioRawFrame):
|
||||
self._audio_stream.write(frame.audio)
|
||||
@@ -148,6 +164,16 @@ class AzureSTTService(AIService):
|
||||
self._push_frame_task.cancel()
|
||||
await self._push_frame_task
|
||||
|
||||
async def _handle_interruptions(self, frame: Frame):
|
||||
# Cancel the task. This will stop pushing frames downstream.
|
||||
self._push_frame_task.cancel()
|
||||
await self._push_frame_task
|
||||
# Push an out-of-band frame (i.e. not using the ordered push
|
||||
# frame task).
|
||||
await self.push_frame(frame)
|
||||
# Create a new queue and task.
|
||||
self._create_push_task()
|
||||
|
||||
def _create_push_task(self):
|
||||
self._push_queue = asyncio.Queue()
|
||||
self._push_frame_task = self.get_event_loop().create_task(self._push_frame_task_handler())
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
from cartesia.tts import AsyncCartesiaTTS
|
||||
from cartesia import AsyncCartesia
|
||||
|
||||
from typing import AsyncGenerator
|
||||
|
||||
@@ -20,22 +20,24 @@ class CartesiaTTSService(TTSService):
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
voice_name: str,
|
||||
model_id: str = "upbeat-moon",
|
||||
output_format: str = "pcm_16000",
|
||||
voice_id: str,
|
||||
model_id: str = "sonic-english",
|
||||
encoding: str = "pcm_s16le",
|
||||
sample_rate: int = 16000,
|
||||
**kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self._api_key = api_key
|
||||
self._voice_name = voice_name
|
||||
self._model_id = model_id
|
||||
self._output_format = output_format
|
||||
self._output_format = {
|
||||
"container": "raw",
|
||||
"encoding": encoding,
|
||||
"sample_rate": sample_rate,
|
||||
}
|
||||
|
||||
try:
|
||||
self._client = AsyncCartesiaTTS(api_key=self._api_key)
|
||||
voices = self._client.get_voices()
|
||||
voice_id = voices[self._voice_name]["id"]
|
||||
self._voice = self._client.get_voice_embedding(voice_id=voice_id)
|
||||
self._client = AsyncCartesia(api_key=self._api_key)
|
||||
self._voice = self._client.voices.get(id=voice_id)
|
||||
except Exception as e:
|
||||
logger.error(f"{self} initialization error: {e}")
|
||||
|
||||
@@ -48,16 +50,16 @@ class CartesiaTTSService(TTSService):
|
||||
try:
|
||||
await self.start_ttfb_metrics()
|
||||
|
||||
chunk_generator = await self._client.generate(
|
||||
chunk_generator = await self._client.tts.sse(
|
||||
stream=True,
|
||||
transcript=text,
|
||||
voice=self._voice,
|
||||
voice_embedding=self._voice["embedding"],
|
||||
model_id=self._model_id,
|
||||
output_format=self._output_format,
|
||||
)
|
||||
|
||||
async for chunk in chunk_generator:
|
||||
await self.stop_ttfb_metrics()
|
||||
yield AudioRawFrame(chunk["audio"], chunk["sampling_rate"], 1)
|
||||
yield AudioRawFrame(chunk["audio"], self._output_format["sample_rate"], 1)
|
||||
except Exception as e:
|
||||
logger.error(f"{self} exception: {e}")
|
||||
|
||||
@@ -18,20 +18,28 @@ from pipecat.frames.frames import (
|
||||
Frame,
|
||||
InterimTranscriptionFrame,
|
||||
StartFrame,
|
||||
StartInterruptionFrame,
|
||||
SystemFrame,
|
||||
TranscriptionFrame)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.ai_services import AIService, TTSService
|
||||
|
||||
from deepgram import (
|
||||
DeepgramClient,
|
||||
DeepgramClientOptions,
|
||||
LiveTranscriptionEvents,
|
||||
LiveOptions,
|
||||
)
|
||||
|
||||
from loguru import logger
|
||||
|
||||
# See .env.example for Deepgram configuration needed
|
||||
try:
|
||||
from deepgram import (
|
||||
DeepgramClient,
|
||||
DeepgramClientOptions,
|
||||
LiveTranscriptionEvents,
|
||||
LiveOptions,
|
||||
)
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error(
|
||||
"In order to use Deepgram, you need to `pip install pipecat-ai[deepgram]`. Also, set `DEEPGRAM_API_KEY` environment variable.")
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
class DeepgramTTSService(TTSService):
|
||||
|
||||
@@ -114,7 +122,9 @@ class DeepgramSTTService(AIService):
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, SystemFrame):
|
||||
if isinstance(frame, StartInterruptionFrame):
|
||||
await self._handle_interruptions(frame)
|
||||
elif isinstance(frame, SystemFrame):
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, AudioRawFrame):
|
||||
await self._connection.send(frame.audio)
|
||||
@@ -137,6 +147,16 @@ class DeepgramSTTService(AIService):
|
||||
self._push_frame_task.cancel()
|
||||
await self._push_frame_task
|
||||
|
||||
async def _handle_interruptions(self, frame: Frame):
|
||||
# Cancel the task. This will stop pushing frames downstream.
|
||||
self._push_frame_task.cancel()
|
||||
await self._push_frame_task
|
||||
# Push an out-of-band frame (i.e. not using the ordered push
|
||||
# frame task).
|
||||
await self.push_frame(frame)
|
||||
# Create a new queue and task.
|
||||
self._create_push_task()
|
||||
|
||||
def _create_push_task(self):
|
||||
self._push_queue = asyncio.Queue()
|
||||
self._push_frame_task = self.get_event_loop().create_task(self._push_frame_task_handler())
|
||||
|
||||
@@ -55,7 +55,7 @@ class BaseInputTransport(FrameProcessor):
|
||||
|
||||
async def push_audio_frame(self, frame: AudioRawFrame):
|
||||
if self._params.audio_in_enabled or self._params.vad_enabled:
|
||||
self._audio_in_queue.put_nowait(frame)
|
||||
await self._audio_in_queue.put(frame)
|
||||
|
||||
#
|
||||
# Frame processor
|
||||
@@ -113,10 +113,15 @@ class BaseInputTransport(FrameProcessor):
|
||||
# Make sure we notify about interruptions quickly out-of-band
|
||||
if isinstance(frame, UserStartedSpeakingFrame):
|
||||
logger.debug("User started speaking")
|
||||
# Cancel the task. This will stop pushing frames downstream.
|
||||
self._push_frame_task.cancel()
|
||||
await self._push_frame_task
|
||||
self._create_push_task()
|
||||
# Push an out-of-band frame (i.e. not using the ordered push
|
||||
# frame task) to stop everything, specially at the output
|
||||
# transport.
|
||||
await self.push_frame(StartInterruptionFrame())
|
||||
# Create a new queue and task.
|
||||
self._create_push_task()
|
||||
elif isinstance(frame, UserStoppedSpeakingFrame):
|
||||
logger.debug("User stopped speaking")
|
||||
await self.push_frame(StopInterruptionFrame())
|
||||
|
||||
@@ -35,7 +35,7 @@ except ModuleNotFoundError as e:
|
||||
class FastAPIWebsocketParams(TransportParams):
|
||||
add_wav_header: bool = False
|
||||
audio_frame_size: int = 6400 # 200ms
|
||||
serializer: FrameSerializer = TwilioFrameSerializer()
|
||||
serializer: FrameSerializer
|
||||
|
||||
|
||||
class FastAPIWebsocketCallbacks(BaseModel):
|
||||
@@ -125,7 +125,7 @@ class FastAPIWebsocketTransport(BaseTransport):
|
||||
def __init__(
|
||||
self,
|
||||
websocket: WebSocket,
|
||||
params: FastAPIWebsocketParams = FastAPIWebsocketParams(),
|
||||
params: FastAPIWebsocketParams,
|
||||
input_name: str | None = None,
|
||||
output_name: str | None = None,
|
||||
loop: asyncio.AbstractEventLoop | None = None):
|
||||
|
||||
@@ -209,19 +209,18 @@ class DailyTransportClient(EventHandler):
|
||||
async def read_next_audio_frame(self) -> AudioRawFrame | None:
|
||||
sample_rate = self._params.audio_in_sample_rate
|
||||
num_channels = self._params.audio_in_channels
|
||||
num_frames = int(sample_rate / 100) * 2 # 20ms of audio
|
||||
|
||||
if self._other_participant_has_joined:
|
||||
num_frames = int(sample_rate / 100) * 2 # 20ms of audio
|
||||
|
||||
future = self._loop.create_future()
|
||||
self._speaker.read_frames(num_frames, completion=completion_callback(future))
|
||||
audio = await future
|
||||
future = self._loop.create_future()
|
||||
self._speaker.read_frames(num_frames, completion=completion_callback(future))
|
||||
audio = await future
|
||||
|
||||
if len(audio) > 0:
|
||||
return AudioRawFrame(audio=audio, sample_rate=sample_rate, num_channels=num_channels)
|
||||
else:
|
||||
# If no one has ever joined the meeting `read_frames()` would block,
|
||||
# instead we just wait a bit. daily-python should probably return
|
||||
# silence instead.
|
||||
# If we don't read any audio it could be there's no participant
|
||||
# connected. daily-python will return immediately if that's the
|
||||
# case, so let's sleep for a little bit (i.e. busy wait).
|
||||
await asyncio.sleep(0.01)
|
||||
return None
|
||||
|
||||
|
||||
@@ -36,6 +36,9 @@ class SileroVADAnalyzer(VADAnalyzer):
|
||||
def __init__(self, sample_rate=16000, params: VADParams = VADParams()):
|
||||
super().__init__(sample_rate=sample_rate, num_channels=1, params=params)
|
||||
|
||||
if sample_rate != 16000 and sample_rate != 8000:
|
||||
raise Exception("Silero VAD sample rate needs to be 16000 or 8000")
|
||||
|
||||
logger.debug("Loading Silero VAD model...")
|
||||
|
||||
(self._model, utils) = torch.hub.load(
|
||||
@@ -51,7 +54,7 @@ class SileroVADAnalyzer(VADAnalyzer):
|
||||
#
|
||||
|
||||
def num_frames_required(self) -> int:
|
||||
return int(self.sample_rate / 100) * 4 # 40ms
|
||||
return 512 if self.sample_rate == 16000 else 256
|
||||
|
||||
def voice_confidence(self, buffer) -> float:
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user