diff --git a/CHANGELOG.md b/CHANGELOG.md index 6693bcc49..da14cc3ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to **pipecat** will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [0.0.36] - 2024-07-02 ### Added @@ -61,6 +61,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Other +- Added Fly.io deployment example in `examples/deployment/flyio-example`. + - Added new `17-detect-user-idle.py` example that shows how to use the new `UserIdleProcessor`. diff --git a/examples/foundational/06a-image-sync.py b/examples/foundational/06a-image-sync.py index 686964502..5a5264dec 100644 --- a/examples/foundational/06a-image-sync.py +++ b/examples/foundational/06a-image-sync.py @@ -67,11 +67,12 @@ async def main(room_url: str, token): "Respond bot", DailyParams( audio_out_enabled=True, + camera_out_enabled=True, camera_out_width=1024, camera_out_height=1024, transcription_enabled=True, vad_enabled=True, - vad_analyzer=SileroVADAnalyzer() + vad_analyzer=SileroVADAnalyzer(), ) ) @@ -116,7 +117,7 @@ async def main(room_url: str, token): async def on_first_participant_joined(transport, participant): participant_name = participant["info"]["userName"] or '' transport.capture_participant_transcription(participant["id"]) - await task.queue_frames([TextFrame(f"Hi, this is {participant_name}.")]) + await task.queue_frames([TextFrame(f"Hi there {participant_name}!")]) runner = PipelineRunner() diff --git a/linux-py3.10-requirements.txt b/linux-py3.10-requirements.txt index 5e2b97b5e..48e3b4694 100644 --- a/linux-py3.10-requirements.txt +++ b/linux-py3.10-requirements.txt @@ -17,7 +17,7 @@ aiosignal==1.3.1 # via aiohttp annotated-types==0.7.0 # via pydantic -anthropic==0.25.9 +anthropic==0.28.1 # via # openpipe # pipecat-ai (pyproject.toml) @@ -38,7 +38,7 @@ attrs==23.2.0 # openpipe av==12.2.0 # via faster-whisper -azure-cognitiveservices-speech==1.37.0 +azure-cognitiveservices-speech==1.38.0 # via pipecat-ai (pyproject.toml) blinker==1.8.2 # via flask @@ -117,7 +117,7 @@ fsspec==2024.6.1 # torch future==1.0.0 # via pyloudnorm -google-ai-generativelanguage==0.6.4 +google-ai-generativelanguage==0.6.6 # via google-generativeai google-api-core[grpc]==2.19.1 # via @@ -135,7 +135,7 @@ google-auth==2.31.0 # google-generativeai google-auth-httplib2==0.2.0 # via google-api-python-client -google-generativeai==0.5.4 +google-generativeai==0.7.1 # via pipecat-ai (pyproject.toml) googleapis-common-protos==1.63.2 # via @@ -197,6 +197,8 @@ jinja2==3.1.4 # fastapi # flask # torch +jiter==0.5.0 + # via anthropic jsonpatch==1.33 # via langchain-core jsonpointer==3.0.0 @@ -217,7 +219,7 @@ langchain-openai==0.1.10 # via pipecat-ai (pyproject.toml) langchain-text-splitters==0.2.2 # via langchain -langsmith==0.1.82 +langsmith==0.1.83 # via # langchain # langchain-community @@ -294,12 +296,12 @@ nvidia-nvtx-cu12==12.1.105 # via torch onnxruntime==1.18.1 # via faster-whisper -openai==1.26.0 +openai==1.27.0 # via # langchain-openai # openpipe # pipecat-ai (pyproject.toml) -openpipe==4.14.0 +openpipe==4.16.0 # via pipecat-ai (pyproject.toml) orjson==3.10.5 # via diff --git a/macos-py3.10-requirements.txt b/macos-py3.10-requirements.txt index 154c7d1fb..41a1387cd 100644 --- a/macos-py3.10-requirements.txt +++ b/macos-py3.10-requirements.txt @@ -17,7 +17,7 @@ aiosignal==1.3.1 # via aiohttp annotated-types==0.7.0 # via pydantic -anthropic==0.25.9 +anthropic==0.28.1 # via # openpipe # pipecat-ai (pyproject.toml) @@ -38,7 +38,7 @@ attrs==23.2.0 # openpipe av==12.2.0 # via faster-whisper -azure-cognitiveservices-speech==1.37.0 +azure-cognitiveservices-speech==1.38.0 # via pipecat-ai (pyproject.toml) blinker==1.8.2 # via flask @@ -116,7 +116,7 @@ fsspec==2024.6.1 # torch future==1.0.0 # via pyloudnorm -google-ai-generativelanguage==0.6.4 +google-ai-generativelanguage==0.6.6 # via google-generativeai google-api-core[grpc]==2.19.1 # via @@ -134,7 +134,7 @@ google-auth==2.31.0 # google-generativeai google-auth-httplib2==0.2.0 # via google-api-python-client -google-generativeai==0.5.4 +google-generativeai==0.7.1 # via pipecat-ai (pyproject.toml) googleapis-common-protos==1.63.2 # via @@ -194,6 +194,8 @@ jinja2==3.1.4 # fastapi # flask # torch +jiter==0.5.0 + # via anthropic jsonpatch==1.33 # via langchain-core jsonpointer==3.0.0 @@ -214,7 +216,7 @@ langchain-openai==0.1.10 # via pipecat-ai (pyproject.toml) langchain-text-splitters==0.2.2 # via langchain -langsmith==0.1.82 +langsmith==0.1.83 # via # langchain # langchain-community @@ -260,12 +262,12 @@ numpy==1.26.4 # transformers onnxruntime==1.18.1 # via faster-whisper -openai==1.26.0 +openai==1.27.0 # via # langchain-openai # openpipe # pipecat-ai (pyproject.toml) -openpipe==4.14.0 +openpipe==4.16.0 # via pipecat-ai (pyproject.toml) orjson==3.10.5 # via diff --git a/pyproject.toml b/pyproject.toml index fcc598fca..b0c6f44fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,25 +34,25 @@ Source = "https://github.com/pipecat-ai/pipecat" Website = "https://pipecat.ai" [project.optional-dependencies] -anthropic = [ "anthropic~=0.25.7" ] -azure = [ "azure-cognitiveservices-speech~=1.37.0" ] -cartesia = [ "cartesia~=1.0.0" ] +anthropic = [ "anthropic~=0.28.1" ] +azure = [ "azure-cognitiveservices-speech~=1.38.0" ] +cartesia = [ "cartesia~=1.0.3" ] daily = [ "daily-python~=0.10.1" ] deepgram = [ "deepgram-sdk~=3.2.7" ] examples = [ "python-dotenv~=1.0.0", "flask~=3.0.3", "flask_cors~=4.0.1" ] -fal = [ "fal-client~=0.4.0" ] +fal = [ "fal-client~=0.4.1" ] gladia = [ "websockets~=12.0" ] -google = [ "google-generativeai~=0.5.3" ] -fireworks = [ "openai~=1.26.0" ] -langchain = [ "langchain~=0.2.1", "langchain-community~=0.2.1", "langchain-openai~=0.1.8" ] +google = [ "google-generativeai~=0.7.1" ] +fireworks = [ "openai~=1.27.0" ] +langchain = [ "langchain~=0.2.6", "langchain-community~=0.2.6", "langchain-openai~=0.1.10" ] local = [ "pyaudio~=0.2.0" ] moondream = [ "einops~=0.8.0", "timm~=0.9.16", "transformers~=4.40.2" ] -openai = [ "openai~=1.26.0" ] -openpipe = [ "openpipe~=4.14.0" ] +openai = [ "openai~=1.27.0" ] +openpipe = [ "openpipe~=4.16.0" ] playht = [ "pyht~=0.0.28" ] -silero = [ "torch~=2.3.0", "torchaudio~=2.3.0" ] +silero = [ "torch~=2.3.1", "torchaudio~=2.3.1" ] websocket = [ "websockets~=12.0", "fastapi~=0.111.0" ] -whisper = [ "faster-whisper~=1.0.2" ] +whisper = [ "faster-whisper~=1.0.3" ] xtts = [ "resampy~=0.4.3" ] [tool.setuptools.packages.find] diff --git a/src/pipecat/services/azure.py b/src/pipecat/services/azure.py index 873934abb..013129d1d 100644 --- a/src/pipecat/services/azure.py +++ b/src/pipecat/services/azure.py @@ -19,12 +19,11 @@ from pipecat.frames.frames import ( ErrorFrame, Frame, StartFrame, - StartInterruptionFrame, SystemFrame, TranscriptionFrame, URLImageRawFrame) from pipecat.processors.frame_processor import FrameDirection -from pipecat.services.ai_services import AIService, AsyncAIService, TTSService, ImageGenService +from pipecat.services.ai_services import AsyncAIService, TTSService, ImageGenService from pipecat.services.openai import BaseOpenAILLMService from loguru import logger @@ -83,7 +82,7 @@ class AzureTTSService(TTSService): return True async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: - logger.debug(f"Generating TTS: {text}") + logger.debug(f"Generating TTS: [{text}]") await self.start_ttfb_metrics() @@ -148,9 +147,11 @@ class AzureSTTService(AsyncAIService): async def stop(self, frame: EndFrame): self._speech_recognizer.stop_continuous_recognition_async() + self._audio_stream.close() async def cancel(self, frame: CancelFrame): self._speech_recognizer.stop_continuous_recognition_async() + self._audio_stream.close() def _on_handle_recognized(self, event): if event.result.reason == ResultReason.RecognizedSpeech and len(event.result.text) > 0: diff --git a/src/pipecat/services/cartesia.py b/src/pipecat/services/cartesia.py index d1aa8b762..c3b6b905b 100644 --- a/src/pipecat/services/cartesia.py +++ b/src/pipecat/services/cartesia.py @@ -8,7 +8,7 @@ from cartesia import AsyncCartesia from typing import AsyncGenerator -from pipecat.frames.frames import AudioRawFrame, Frame +from pipecat.frames.frames import AudioRawFrame, CancelFrame, EndFrame, Frame, StartFrame from pipecat.services.ai_services import TTSService from loguru import logger @@ -28,22 +28,33 @@ class CartesiaTTSService(TTSService): super().__init__(**kwargs) self._api_key = api_key + self._voice_id = voice_id self._model_id = model_id self._output_format = { "container": "raw", "encoding": encoding, "sample_rate": sample_rate, } - - try: - self._client = AsyncCartesia(api_key=self._api_key) - self._voice = self._client.voices.get(id=voice_id) - except Exception as e: - logger.exception(f"{self} initialization error: {e}") + self._client = None def can_generate_metrics(self) -> bool: return True + async def start(self, frame: StartFrame): + try: + self._client = AsyncCartesia(api_key=self._api_key) + self._voice = self._client.voices.get(id=self._voice_id) + except Exception as e: + logger.exception(f"{self} initialization error: {e}") + + async def stop(self, frame: EndFrame): + if self._client: + await self._client.close() + + async def cancel(self, frame: CancelFrame): + if self._client: + await self._client.close() + async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: logger.debug(f"Generating TTS: [{text}]")