Compare commits

...

15 Commits

Author SHA1 Message Date
Aleix Conchillo Flaqué
e3b407db28 Merge pull request #259 from pipecat-ai/aleix/prepare-0.0.33
update CHANGELOG for 0.0.33
2024-06-25 12:05:07 -07:00
Aleix Conchillo Flaqué
4ef623f09e update CHANGELOG for 0.0.33 2024-06-25 11:53:07 -07:00
Aleix Conchillo Flaqué
253530a63d Merge pull request #258 from pipecat-ai/aleix/upgrade-cartesia-1.0.0
services(cartesia): upgrade to new cartesia 1.0.0
2024-06-25 11:52:04 -07:00
Aleix Conchillo Flaqué
4f38d989f5 services(cartesia): upgrade to new cartesia 1.0.0 2024-06-25 11:51:34 -07:00
Aleix Conchillo Flaqué
84074e90ee Merge pull request #257 from pipecat-ai/aleix/cancel-all-tasks-when-interrutpted
cancel all tasks when interrutpted
2024-06-25 11:16:00 -07:00
Aleix Conchillo Flaqué
38aee7d8f2 services(azure): cancel tasks when interrupted and ignore incoming transcriptions 2024-06-25 11:15:26 -07:00
Aleix Conchillo Flaqué
64198313c6 services(deepgram): cancel tasks when interrupted and ignore incoming transcriptions 2024-06-25 11:15:07 -07:00
Aleix Conchillo Flaqué
d61b6c301c transports(base_input): create push tasks after pushing interruption 2024-06-25 11:15:07 -07:00
Aleix Conchillo Flaqué
83d1931266 Merge pull request #256 from pipecat-ai/aleix/tts-cleanup-when-interrupted
services(tts): strip before TTS and cleanup when interrupted
2024-06-25 11:14:32 -07:00
Aleix Conchillo Flaqué
c31f2ab285 services(tts): strip before TTS and cleanup when interrupted 2024-06-25 11:13:19 -07:00
Aleix Conchillo Flaqué
0ddc5721b4 Merge pull request #252 from pipecat-ai/aleix/daily-check-size-read-audio-frames
transports(daily): always check size of read audio frames
2024-06-25 09:45:05 -07:00
Aleix Conchillo Flaqué
98bd183bc4 pyproject: fix cartesia version and update requirements files 2024-06-25 09:43:54 -07:00
Aleix Conchillo Flaqué
aaa154524c Merge pull request #253 from pipecat-ai/aleix/llm-response-use-intermediate-results
aggregators: uses intermediate results for LLMAssistantResponseAggreg…
2024-06-24 19:21:14 -07:00
Aleix Conchillo Flaqué
beced68337 aggregators: uses intermediate results for LLMAssistantResponseAggregator 2024-06-24 17:33:45 -07:00
Aleix Conchillo Flaqué
94823ab952 transports(daily): always check size of read audio frames 2024-06-24 14:56:24 -07:00
13 changed files with 187 additions and 92 deletions

View File

@@ -5,6 +5,27 @@ All notable changes to **pipecat** will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [0.0.33] - 2024-06-25
### Changed
- Upgraded to Cartesia's new Python library 1.0.0. `CartesiaTTSService` now
expects a voice ID instead of a voice name (you can get the voice ID from
Cartesia's playground). You can also specify the audio `sample_rate` and
`encoding` instead of the previous `output_format`.
### Fixed
- Fixed an issue with asynchronous STT services (Deepgram and Azure) that could
cause static audio issues and interruptions to not work properly when dealing
with multiple LLMs sentences.
- Fixed an issue that could mix new LLM responses with previous ones when
handling interruptions.
- Fixed a Daily transport blocking situation that occurred while reading audio
frames after a participant left the room. Needs daily-python >= 0.10.1.
## [0.0.32] - 2024-06-22
### Added

View File

@@ -38,7 +38,6 @@ async def main(room_url: str, token):
"Respond bot",
DailyParams(
audio_out_enabled=True,
audio_out_sample_rate=44100,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
@@ -47,8 +46,7 @@ async def main(room_url: str, token):
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_name="British Lady",
output_format="pcm_44100"
voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man
)
llm = OpenAILLMService(

View File

@@ -66,7 +66,6 @@ async def main(room_url: str, token):
"Pipecat",
DailyParams(
audio_out_enabled=True,
audio_out_sample_rate=44100,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer()
@@ -75,20 +74,17 @@ async def main(room_url: str, token):
news_lady = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_name="Newslady",
output_format="pcm_44100"
voice_id="bf991597-6c13-47e4-8411-91ec2de5c466", # Newslady
)
british_lady = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_name="British Lady",
output_format="pcm_44100"
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
barbershop_man = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_name="Barbershop Man",
output_format="pcm_44100"
voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Barbershop Man
)
llm = OpenAILLMService(

View File

@@ -4,7 +4,7 @@
#
# pip-compile --all-extras pyproject.toml
#
aiofiles==23.2.1
aiofiles==24.1.0
# via deepgram-sdk
aiohttp==3.9.5
# via
@@ -44,15 +44,13 @@ blinker==1.8.2
# via flask
cachetools==5.3.3
# via google-auth
cartesia==0.1.1
cartesia==1.0.0
# via pipecat-ai (pyproject.toml)
certifi==2024.6.2
# via
# httpcore
# httpx
# requests
cffi==1.16.0
# via sounddevice
charset-normalizer==3.3.2
# via requests
click==8.1.7
@@ -64,7 +62,7 @@ coloredlogs==15.0.1
# via onnxruntime
ctranslate2==4.3.1
# via faster-whisper
daily-python==0.10.0
daily-python==0.10.1
# via pipecat-ai (pyproject.toml)
dataclasses-json==0.6.7
# via
@@ -94,7 +92,7 @@ fastapi-cli==0.0.4
# via fastapi
faster-whisper==1.0.2
# via pipecat-ai (pyproject.toml)
filelock==3.15.3
filelock==3.15.4
# via
# huggingface-hub
# pyht
@@ -121,7 +119,7 @@ future==1.0.0
# via pyloudnorm
google-ai-generativelanguage==0.6.4
# via google-generativeai
google-api-core[grpc]==2.19.0
google-api-core[grpc]==2.19.1
# via
# google-ai-generativelanguage
# google-api-python-client
@@ -139,7 +137,7 @@ google-auth-httplib2==0.2.0
# via google-api-python-client
google-generativeai==0.5.4
# via pipecat-ai (pyproject.toml)
googleapis-common-protos==1.63.1
googleapis-common-protos==1.63.2
# via
# google-api-core
# grpcio-status
@@ -219,7 +217,7 @@ langchain-openai==0.1.9
# via pipecat-ai (pyproject.toml)
langchain-text-splitters==0.2.1
# via langchain
langsmith==0.1.81
langsmith==0.1.82
# via
# langchain
# langchain-community
@@ -338,8 +336,6 @@ pyasn1-modules==0.4.0
# via google-auth
pyaudio==0.2.14
# via pipecat-ai (pyproject.toml)
pycparser==2.22
# via cffi
pydantic==2.7.4
# via
# anthropic
@@ -404,7 +400,7 @@ safetensors==0.4.3
# via
# timm
# transformers
scipy==1.13.1
scipy==1.14.0
# via pyloudnorm
shellingham==1.5.4
# via typer
@@ -416,8 +412,6 @@ sniffio==1.3.1
# anyio
# httpx
# openai
sounddevice==0.4.7
# via pipecat-ai (pyproject.toml)
sqlalchemy==2.0.31
# via
# langchain
@@ -428,7 +422,7 @@ sympy==1.12.1
# via
# onnxruntime
# torch
tenacity==8.4.1
tenacity==8.4.2
# via
# langchain
# langchain-community

View File

@@ -1,10 +1,10 @@
#
# This file is autogenerated by pip-compile with Python 3.12
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile --all-extras pyproject.toml
#
aiofiles==23.2.1
aiofiles==24.1.0
# via deepgram-sdk
aiohttp==3.9.5
# via
@@ -28,6 +28,10 @@ anyio==4.4.0
# openai
# starlette
# watchfiles
async-timeout==4.0.3
# via
# aiohttp
# langchain
attrs==23.2.0
# via
# aiohttp
@@ -40,15 +44,13 @@ blinker==1.8.2
# via flask
cachetools==5.3.3
# via google-auth
cartesia==0.1.1
cartesia==1.0.0
# via pipecat-ai (pyproject.toml)
certifi==2024.6.2
# via
# httpcore
# httpx
# requests
cffi==1.16.0
# via sounddevice
charset-normalizer==3.3.2
# via requests
click==8.1.7
@@ -60,7 +62,7 @@ coloredlogs==15.0.1
# via onnxruntime
ctranslate2==4.3.1
# via faster-whisper
daily-python==0.10.0
daily-python==0.10.1
# via pipecat-ai (pyproject.toml)
dataclasses-json==0.6.7
# via
@@ -78,6 +80,10 @@ einops==0.8.0
# via pipecat-ai (pyproject.toml)
email-validator==2.2.0
# via fastapi
exceptiongroup==1.2.1
# via
# anyio
# pytest
fal-client==0.4.0
# via pipecat-ai (pyproject.toml)
fastapi==0.111.0
@@ -86,7 +92,7 @@ fastapi-cli==0.0.4
# via fastapi
faster-whisper==1.0.2
# via pipecat-ai (pyproject.toml)
filelock==3.15.3
filelock==3.15.4
# via
# huggingface-hub
# pyht
@@ -112,7 +118,7 @@ future==1.0.0
# via pyloudnorm
google-ai-generativelanguage==0.6.4
# via google-generativeai
google-api-core[grpc]==2.19.0
google-api-core[grpc]==2.19.1
# via
# google-ai-generativelanguage
# google-api-python-client
@@ -130,7 +136,7 @@ google-auth-httplib2==0.2.0
# via google-api-python-client
google-generativeai==0.5.4
# via pipecat-ai (pyproject.toml)
googleapis-common-protos==1.63.1
googleapis-common-protos==1.63.2
# via
# google-api-core
# grpcio-status
@@ -204,11 +210,11 @@ langchain-core==0.2.9
# langchain-community
# langchain-openai
# langchain-text-splitters
langchain-openai==0.1.9
langchain-openai==0.1.10
# via pipecat-ai (pyproject.toml)
langchain-text-splitters==0.2.1
# via langchain
langsmith==0.1.81
langsmith==0.1.82
# via
# langchain
# langchain-community
@@ -296,8 +302,6 @@ pyasn1-modules==0.4.0
# via google-auth
pyaudio==0.2.14
# via pipecat-ai (pyproject.toml)
pycparser==2.22
# via cffi
pydantic==2.7.4
# via
# anthropic
@@ -362,7 +366,7 @@ safetensors==0.4.3
# via
# timm
# transformers
scipy==1.13.1
scipy==1.14.0
# via pyloudnorm
shellingham==1.5.4
# via typer
@@ -374,8 +378,6 @@ sniffio==1.3.1
# anyio
# httpx
# openai
sounddevice==0.4.7
# via pipecat-ai (pyproject.toml)
sqlalchemy==2.0.31
# via
# langchain
@@ -386,7 +388,7 @@ sympy==1.12.1
# via
# onnxruntime
# torch
tenacity==8.4.1
tenacity==8.4.2
# via
# langchain
# langchain-community
@@ -400,6 +402,8 @@ tokenizers==0.19.1
# anthropic
# faster-whisper
# transformers
tomli==2.0.1
# via pytest
torch==2.3.1
# via
# pipecat-ai (pyproject.toml)
@@ -423,6 +427,7 @@ typer==0.12.3
typing-extensions==4.12.2
# via
# anthropic
# anyio
# deepgram-sdk
# fastapi
# google-generativeai
@@ -435,6 +440,7 @@ typing-extensions==4.12.2
# torch
# typer
# typing-inspect
# uvicorn
typing-inspect==0.9.0
# via dataclasses-json
ujson==5.10.0

View File

@@ -36,8 +36,8 @@ Website = "https://pipecat.ai"
[project.optional-dependencies]
anthropic = [ "anthropic~=0.25.7" ]
azure = [ "azure-cognitiveservices-speech~=1.37.0" ]
cartesia = [ "numpy~=1.26.0", "sounddevice", "cartesia" ]
daily = [ "daily-python~=0.10.0" ]
cartesia = [ "cartesia~=1.0.0" ]
daily = [ "daily-python~=0.10.1" ]
deepgram = [ "deepgram-sdk~=3.2.7" ]
examples = [ "python-dotenv~=1.0.0", "flask~=3.0.3", "flask_cors~=4.0.1" ]
fal = [ "fal-client~=0.4.0" ]

View File

@@ -13,7 +13,6 @@ from pipecat.frames.frames import (
Frame,
InterimTranscriptionFrame,
LLMFullResponseEndFrame,
LLMFullResponseStartFrame,
LLMResponseEndFrame,
LLMResponseStartFrame,
LLMMessagesFrame,
@@ -152,8 +151,8 @@ class LLMAssistantResponseAggregator(LLMResponseAggregator):
super().__init__(
messages=messages,
role="assistant",
start_frame=LLMFullResponseStartFrame,
end_frame=LLMFullResponseEndFrame,
start_frame=LLMResponseStartFrame,
end_frame=LLMResponseEndFrame,
accumulator_frame=TextFrame,
handle_interruptions=True
)

View File

@@ -16,12 +16,13 @@ from pipecat.frames.frames import (
EndFrame,
ErrorFrame,
Frame,
LLMFullResponseEndFrame,
StartFrame,
StartInterruptionFrame,
TTSStartedFrame,
TTSStoppedFrame,
TextFrame,
VisionImageRawFrame,
LLMFullResponseEndFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.utils.audio import calculate_audio_volume
@@ -114,13 +115,17 @@ class TTSService(AIService):
if self._current_sentence.strip().endswith(
(".", "?", "!")) and not self._current_sentence.strip().endswith(
("Mr,", "Mrs.", "Ms.", "Dr.")):
text = self._current_sentence.strip()
text = self._current_sentence
self._current_sentence = ""
if text:
await self._push_tts_frames(text)
async def _push_tts_frames(self, text: str):
text = text.strip()
if not text:
return
await self.push_frame(TTSStartedFrame())
await self.process_generator(self.run_tts(text))
await self.push_frame(TTSStoppedFrame())
@@ -133,14 +138,12 @@ class TTSService(AIService):
if isinstance(frame, TextFrame):
await self._process_text_frame(frame)
elif isinstance(frame, EndFrame):
if self._current_sentence:
await self._push_tts_frames(self._current_sentence)
await self.push_frame(frame)
elif isinstance(frame, LLMFullResponseEndFrame):
if self._current_sentence:
await self._push_tts_frames(self._current_sentence.strip())
self._current_sentence = ""
elif isinstance(frame, StartInterruptionFrame):
self._current_sentence = ""
await self.push_frame(frame, direction)
elif isinstance(frame, LLMFullResponseEndFrame) or isinstance(frame, EndFrame):
self._current_sentence = ""
await self._push_tts_frames(self._current_sentence)
await self.push_frame(frame)
else:
await self.push_frame(frame, direction)

View File

@@ -12,7 +12,18 @@ import time
from PIL import Image
from typing import AsyncGenerator
from pipecat.frames.frames import AudioRawFrame, CancelFrame, EndFrame, ErrorFrame, Frame, StartFrame, SystemFrame, TranscriptionFrame, URLImageRawFrame
from pipecat.frames.frames import (
AudioRawFrame,
CancelFrame,
EndFrame,
ErrorFrame,
Frame,
StartFrame,
StartInterruptionFrame,
StopInterruptionFrame,
SystemFrame,
TranscriptionFrame,
URLImageRawFrame)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.ai_services import AIService, TTSService, ImageGenService
from pipecat.services.openai import BaseOpenAILLMService
@@ -34,7 +45,7 @@ try:
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error(
"In order to use Azure TTS, you need to `pip install pipecat-ai[azure]`. Also, set `AZURE_SPEECH_API_KEY` and `AZURE_SPEECH_REGION` environment variables.")
"In order to use Azure, you need to `pip install pipecat-ai[azure]`. Also, set `AZURE_SPEECH_API_KEY` and `AZURE_SPEECH_REGION` environment variables.")
raise Exception(f"Missing module: {e}")
@@ -123,12 +134,18 @@ class AzureSTTService(AIService):
speech_config=speech_config, audio_config=audio_config)
self._speech_recognizer.recognized.connect(self._on_handle_recognized)
# This event will be used to ignore out-of-band transcriptions while we
# are itnerrupted.
self._is_interrupted_event = asyncio.Event()
self._create_push_task()
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
if isinstance(frame, SystemFrame):
if isinstance(frame, StartInterruptionFrame) or isinstance(frame, StopInterruptionFrame):
await self._handle_interruptions(frame)
elif isinstance(frame, SystemFrame):
await self.push_frame(frame, direction)
elif isinstance(frame, AudioRawFrame):
self._audio_stream.write(frame.audio)
@@ -148,6 +165,23 @@ class AzureSTTService(AIService):
self._push_frame_task.cancel()
await self._push_frame_task
async def _handle_interruptions(self, frame: Frame):
if isinstance(frame, StartInterruptionFrame):
# Indicate we are interrupted, we should ignore any out-of-band
# transcriptions.
self._is_interrupted_event.set()
# Cancel the task. This will stop pushing frames downstream.
self._push_frame_task.cancel()
await self._push_frame_task
# Push an out-of-band frame (i.e. not using the ordered push
# frame task).
await self.push_frame(frame)
# Create a new queue and task.
self._create_push_task()
elif isinstance(frame, StopInterruptionFrame):
# We should now be able to receive transcriptions again.
self._is_interrupted_event.clear()
def _create_push_task(self):
self._push_queue = asyncio.Queue()
self._push_frame_task = self.get_event_loop().create_task(self._push_frame_task_handler())
@@ -163,6 +197,9 @@ class AzureSTTService(AIService):
break
def _on_handle_recognized(self, event):
if self._is_interrupted_event.is_set():
return
if event.result.reason == ResultReason.RecognizedSpeech and len(event.result.text) > 0:
direction = FrameDirection.DOWNSTREAM
frame = TranscriptionFrame(event.result.text, "", int(time.time_ns() / 1000000))

View File

@@ -4,7 +4,7 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
from cartesia.tts import AsyncCartesiaTTS
from cartesia import AsyncCartesia
from typing import AsyncGenerator
@@ -20,22 +20,24 @@ class CartesiaTTSService(TTSService):
self,
*,
api_key: str,
voice_name: str,
model_id: str = "upbeat-moon",
output_format: str = "pcm_16000",
voice_id: str,
model_id: str = "sonic-english",
encoding: str = "pcm_s16le",
sample_rate: int = 16000,
**kwargs):
super().__init__(**kwargs)
self._api_key = api_key
self._voice_name = voice_name
self._model_id = model_id
self._output_format = output_format
self._output_format = {
"container": "raw",
"encoding": encoding,
"sample_rate": sample_rate,
}
try:
self._client = AsyncCartesiaTTS(api_key=self._api_key)
voices = self._client.get_voices()
voice_id = voices[self._voice_name]["id"]
self._voice = self._client.get_voice_embedding(voice_id=voice_id)
self._client = AsyncCartesia(api_key=self._api_key)
self._voice = self._client.voices.get(id=voice_id)
except Exception as e:
logger.error(f"{self} initialization error: {e}")
@@ -48,16 +50,16 @@ class CartesiaTTSService(TTSService):
try:
await self.start_ttfb_metrics()
chunk_generator = await self._client.generate(
chunk_generator = await self._client.tts.sse(
stream=True,
transcript=text,
voice=self._voice,
voice_embedding=self._voice["embedding"],
model_id=self._model_id,
output_format=self._output_format,
)
async for chunk in chunk_generator:
await self.stop_ttfb_metrics()
yield AudioRawFrame(chunk["audio"], chunk["sampling_rate"], 1)
yield AudioRawFrame(chunk["audio"], self._output_format["sample_rate"], 1)
except Exception as e:
logger.error(f"{self} exception: {e}")

View File

@@ -18,20 +18,29 @@ from pipecat.frames.frames import (
Frame,
InterimTranscriptionFrame,
StartFrame,
StartInterruptionFrame,
StopInterruptionFrame,
SystemFrame,
TranscriptionFrame)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.ai_services import AIService, TTSService
from deepgram import (
DeepgramClient,
DeepgramClientOptions,
LiveTranscriptionEvents,
LiveOptions,
)
from loguru import logger
# See .env.example for Deepgram configuration needed
try:
from deepgram import (
DeepgramClient,
DeepgramClientOptions,
LiveTranscriptionEvents,
LiveOptions,
)
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error(
"In order to use Deepgram, you need to `pip install pipecat-ai[deepgram]`. Also, set `DEEPGRAM_API_KEY` environment variable.")
raise Exception(f"Missing module: {e}")
class DeepgramTTSService(TTSService):
@@ -109,12 +118,18 @@ class DeepgramSTTService(AIService):
self._connection = self._client.listen.asynclive.v("1")
self._connection.on(LiveTranscriptionEvents.Transcript, self._on_message)
# This event will be used to ignore out-of-band transcriptions while we
# are itnerrupted.
self._is_interrupted_event = asyncio.Event()
self._create_push_task()
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
if isinstance(frame, SystemFrame):
if isinstance(frame, StartInterruptionFrame) or isinstance(frame, StopInterruptionFrame):
await self._handle_interruptions(frame)
elif isinstance(frame, SystemFrame):
await self.push_frame(frame, direction)
elif isinstance(frame, AudioRawFrame):
await self._connection.send(frame.audio)
@@ -137,6 +152,23 @@ class DeepgramSTTService(AIService):
self._push_frame_task.cancel()
await self._push_frame_task
async def _handle_interruptions(self, frame: Frame):
if isinstance(frame, StartInterruptionFrame):
# Indicate we are interrupted, we should ignore any out-of-band
# transcriptions.
self._is_interrupted_event.set()
# Cancel the task. This will stop pushing frames downstream.
self._push_frame_task.cancel()
await self._push_frame_task
# Push an out-of-band frame (i.e. not using the ordered push
# frame task).
await self.push_frame(frame)
# Create a new queue and task.
self._create_push_task()
elif isinstance(frame, StopInterruptionFrame):
# We should now be able to receive transcriptions again.
self._is_interrupted_event.clear()
def _create_push_task(self):
self._push_queue = asyncio.Queue()
self._push_frame_task = self.get_event_loop().create_task(self._push_frame_task_handler())
@@ -152,6 +184,9 @@ class DeepgramSTTService(AIService):
break
async def _on_message(self, *args, **kwargs):
if self._is_interrupted_event.is_set():
return
result = kwargs["result"]
is_final = result.is_final
transcript = result.channel.alternatives[0].transcript

View File

@@ -55,7 +55,7 @@ class BaseInputTransport(FrameProcessor):
async def push_audio_frame(self, frame: AudioRawFrame):
if self._params.audio_in_enabled or self._params.vad_enabled:
self._audio_in_queue.put_nowait(frame)
await self._audio_in_queue.put(frame)
#
# Frame processor
@@ -113,10 +113,15 @@ class BaseInputTransport(FrameProcessor):
# Make sure we notify about interruptions quickly out-of-band
if isinstance(frame, UserStartedSpeakingFrame):
logger.debug("User started speaking")
# Cancel the task. This will stop pushing frames downstream.
self._push_frame_task.cancel()
await self._push_frame_task
self._create_push_task()
# Push an out-of-band frame (i.e. not using the ordered push
# frame task) to stop everything, specially at the output
# transport.
await self.push_frame(StartInterruptionFrame())
# Create a new queue and task.
self._create_push_task()
elif isinstance(frame, UserStoppedSpeakingFrame):
logger.debug("User stopped speaking")
await self.push_frame(StopInterruptionFrame())

View File

@@ -209,19 +209,18 @@ class DailyTransportClient(EventHandler):
async def read_next_audio_frame(self) -> AudioRawFrame | None:
sample_rate = self._params.audio_in_sample_rate
num_channels = self._params.audio_in_channels
num_frames = int(sample_rate / 100) * 2 # 20ms of audio
if self._other_participant_has_joined:
num_frames = int(sample_rate / 100) * 2 # 20ms of audio
future = self._loop.create_future()
self._speaker.read_frames(num_frames, completion=completion_callback(future))
audio = await future
future = self._loop.create_future()
self._speaker.read_frames(num_frames, completion=completion_callback(future))
audio = await future
if len(audio) > 0:
return AudioRawFrame(audio=audio, sample_rate=sample_rate, num_channels=num_channels)
else:
# If no one has ever joined the meeting `read_frames()` would block,
# instead we just wait a bit. daily-python should probably return
# silence instead.
# If we don't read any audio it could be there's no participant
# connected. daily-python will return immediately if that's the
# case, so let's sleep for a little bit (i.e. busy wait).
await asyncio.sleep(0.01)
return None