Merge pull request #280 from pipecat-ai/aleix/library-updates-070224

library updates 070224 and pipecat 0.0.36
2024-07-02 10:14:03 -07:00
parent 8f6db5e905 3147534e86
commit 9f6411dc0e
7 changed files with 57 additions and 38 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ All notable changes to **pipecat** will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

-## [Unreleased]
+## [0.0.36] - 2024-07-02

 ### Added

@@ -61,6 +61,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Other

+- Added Fly.io deployment example in `examples/deployment/flyio-example`.
+
 - Added new `17-detect-user-idle.py` example that shows how to use the new
  `UserIdleProcessor`.

--- a/examples/foundational/06a-image-sync.py
+++ b/examples/foundational/06a-image-sync.py
@@ -67,11 +67,12 @@ async def main(room_url: str, token):
            "Respond bot",
            DailyParams(
                audio_out_enabled=True,
+                camera_out_enabled=True,
                camera_out_width=1024,
                camera_out_height=1024,
                transcription_enabled=True,
                vad_enabled=True,
-                vad_analyzer=SileroVADAnalyzer()
+                vad_analyzer=SileroVADAnalyzer(),
            )
        )

@@ -116,7 +117,7 @@ async def main(room_url: str, token):
        async def on_first_participant_joined(transport, participant):
            participant_name = participant["info"]["userName"] or ''
            transport.capture_participant_transcription(participant["id"])
-            await task.queue_frames([TextFrame(f"Hi, this is {participant_name}.")])
+            await task.queue_frames([TextFrame(f"Hi there {participant_name}!")])

        runner = PipelineRunner()

--- a/linux-py3.10-requirements.txt
+++ b/linux-py3.10-requirements.txt
@@ -17,7 +17,7 @@ aiosignal==1.3.1
    # via aiohttp
 annotated-types==0.7.0
    # via pydantic
-anthropic==0.25.9
+anthropic==0.28.1
    # via
    #   openpipe
    #   pipecat-ai (pyproject.toml)
@@ -38,7 +38,7 @@ attrs==23.2.0
    #   openpipe
 av==12.2.0
    # via faster-whisper
-azure-cognitiveservices-speech==1.37.0
+azure-cognitiveservices-speech==1.38.0
    # via pipecat-ai (pyproject.toml)
 blinker==1.8.2
    # via flask
@@ -117,7 +117,7 @@ fsspec==2024.6.1
    #   torch
 future==1.0.0
    # via pyloudnorm
-google-ai-generativelanguage==0.6.4
+google-ai-generativelanguage==0.6.6
    # via google-generativeai
 google-api-core[grpc]==2.19.1
    # via
@@ -135,7 +135,7 @@ google-auth==2.31.0
    #   google-generativeai
 google-auth-httplib2==0.2.0
    # via google-api-python-client
-google-generativeai==0.5.4
+google-generativeai==0.7.1
    # via pipecat-ai (pyproject.toml)
 googleapis-common-protos==1.63.2
    # via
@@ -197,6 +197,8 @@ jinja2==3.1.4
    #   fastapi
    #   flask
    #   torch
+jiter==0.5.0
+    # via anthropic
 jsonpatch==1.33
    # via langchain-core
 jsonpointer==3.0.0
@@ -217,7 +219,7 @@ langchain-openai==0.1.10
    # via pipecat-ai (pyproject.toml)
 langchain-text-splitters==0.2.2
    # via langchain
-langsmith==0.1.82
+langsmith==0.1.83
    # via
    #   langchain
    #   langchain-community
@@ -294,12 +296,12 @@ nvidia-nvtx-cu12==12.1.105
    # via torch
 onnxruntime==1.18.1
    # via faster-whisper
-openai==1.26.0
+openai==1.27.0
    # via
    #   langchain-openai
    #   openpipe
    #   pipecat-ai (pyproject.toml)
-openpipe==4.14.0
+openpipe==4.16.0
    # via pipecat-ai (pyproject.toml)
 orjson==3.10.5
    # via
--- a/macos-py3.10-requirements.txt
+++ b/macos-py3.10-requirements.txt
@@ -17,7 +17,7 @@ aiosignal==1.3.1
    # via aiohttp
 annotated-types==0.7.0
    # via pydantic
-anthropic==0.25.9
+anthropic==0.28.1
    # via
    #   openpipe
    #   pipecat-ai (pyproject.toml)
@@ -38,7 +38,7 @@ attrs==23.2.0
    #   openpipe
 av==12.2.0
    # via faster-whisper
-azure-cognitiveservices-speech==1.37.0
+azure-cognitiveservices-speech==1.38.0
    # via pipecat-ai (pyproject.toml)
 blinker==1.8.2
    # via flask
@@ -116,7 +116,7 @@ fsspec==2024.6.1
    #   torch
 future==1.0.0
    # via pyloudnorm
-google-ai-generativelanguage==0.6.4
+google-ai-generativelanguage==0.6.6
    # via google-generativeai
 google-api-core[grpc]==2.19.1
    # via
@@ -134,7 +134,7 @@ google-auth==2.31.0
    #   google-generativeai
 google-auth-httplib2==0.2.0
    # via google-api-python-client
-google-generativeai==0.5.4
+google-generativeai==0.7.1
    # via pipecat-ai (pyproject.toml)
 googleapis-common-protos==1.63.2
    # via
@@ -194,6 +194,8 @@ jinja2==3.1.4
    #   fastapi
    #   flask
    #   torch
+jiter==0.5.0
+    # via anthropic
 jsonpatch==1.33
    # via langchain-core
 jsonpointer==3.0.0
@@ -214,7 +216,7 @@ langchain-openai==0.1.10
    # via pipecat-ai (pyproject.toml)
 langchain-text-splitters==0.2.2
    # via langchain
-langsmith==0.1.82
+langsmith==0.1.83
    # via
    #   langchain
    #   langchain-community
@@ -260,12 +262,12 @@ numpy==1.26.4
    #   transformers
 onnxruntime==1.18.1
    # via faster-whisper
-openai==1.26.0
+openai==1.27.0
    # via
    #   langchain-openai
    #   openpipe
    #   pipecat-ai (pyproject.toml)
-openpipe==4.14.0
+openpipe==4.16.0
    # via pipecat-ai (pyproject.toml)
 orjson==3.10.5
    # via
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,25 +34,25 @@ Source = "https://github.com/pipecat-ai/pipecat"
 Website = "https://pipecat.ai"

 [project.optional-dependencies]
-anthropic = [ "anthropic~=0.25.7" ]
-azure = [ "azure-cognitiveservices-speech~=1.37.0" ]
-cartesia = [ "cartesia~=1.0.0" ]
+anthropic = [ "anthropic~=0.28.1" ]
+azure = [ "azure-cognitiveservices-speech~=1.38.0" ]
+cartesia = [ "cartesia~=1.0.3" ]
 daily = [ "daily-python~=0.10.1" ]
 deepgram = [ "deepgram-sdk~=3.2.7" ]
 examples = [ "python-dotenv~=1.0.0", "flask~=3.0.3", "flask_cors~=4.0.1" ]
-fal = [ "fal-client~=0.4.0" ]
+fal = [ "fal-client~=0.4.1" ]
 gladia = [ "websockets~=12.0" ]
-google = [ "google-generativeai~=0.5.3" ]
-fireworks = [ "openai~=1.26.0" ]
-langchain = [ "langchain~=0.2.1", "langchain-community~=0.2.1", "langchain-openai~=0.1.8" ]
+google = [ "google-generativeai~=0.7.1" ]
+fireworks = [ "openai~=1.27.0" ]
+langchain = [ "langchain~=0.2.6", "langchain-community~=0.2.6", "langchain-openai~=0.1.10" ]
 local = [ "pyaudio~=0.2.0" ]
 moondream = [ "einops~=0.8.0", "timm~=0.9.16", "transformers~=4.40.2" ]
-openai = [ "openai~=1.26.0" ]
-openpipe = [ "openpipe~=4.14.0" ]
+openai = [ "openai~=1.27.0" ]
+openpipe = [ "openpipe~=4.16.0" ]
 playht = [ "pyht~=0.0.28" ]
-silero = [ "torch~=2.3.0", "torchaudio~=2.3.0" ]
+silero = [ "torch~=2.3.1", "torchaudio~=2.3.1" ]
 websocket = [ "websockets~=12.0", "fastapi~=0.111.0" ]
-whisper = [ "faster-whisper~=1.0.2" ]
+whisper = [ "faster-whisper~=1.0.3" ]
 xtts = [ "resampy~=0.4.3" ]

 [tool.setuptools.packages.find]
--- a/src/pipecat/services/azure.py
+++ b/src/pipecat/services/azure.py
@@ -19,12 +19,11 @@ from pipecat.frames.frames import (
    ErrorFrame,
    Frame,
    StartFrame,
-    StartInterruptionFrame,
    SystemFrame,
    TranscriptionFrame,
    URLImageRawFrame)
 from pipecat.processors.frame_processor import FrameDirection
-from pipecat.services.ai_services import AIService, AsyncAIService, TTSService, ImageGenService
+from pipecat.services.ai_services import AsyncAIService, TTSService, ImageGenService
 from pipecat.services.openai import BaseOpenAILLMService

 from loguru import logger
@@ -83,7 +82,7 @@ class AzureTTSService(TTSService):
        return True

    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
-        logger.debug(f"Generating TTS: {text}")
+        logger.debug(f"Generating TTS: [{text}]")

        await self.start_ttfb_metrics()

@@ -148,9 +147,11 @@ class AzureSTTService(AsyncAIService):

    async def stop(self, frame: EndFrame):
        self._speech_recognizer.stop_continuous_recognition_async()
+        self._audio_stream.close()

    async def cancel(self, frame: CancelFrame):
        self._speech_recognizer.stop_continuous_recognition_async()
+        self._audio_stream.close()

    def _on_handle_recognized(self, event):
        if event.result.reason == ResultReason.RecognizedSpeech and len(event.result.text) > 0:
--- a/src/pipecat/services/cartesia.py
+++ b/src/pipecat/services/cartesia.py
@@ -8,7 +8,7 @@ from cartesia import AsyncCartesia

 from typing import AsyncGenerator

-from pipecat.frames.frames import AudioRawFrame, Frame
+from pipecat.frames.frames import AudioRawFrame, CancelFrame, EndFrame, Frame, StartFrame
 from pipecat.services.ai_services import TTSService

 from loguru import logger
@@ -28,22 +28,33 @@ class CartesiaTTSService(TTSService):
        super().__init__(**kwargs)

        self._api_key = api_key
+        self._voice_id = voice_id
        self._model_id = model_id
        self._output_format = {
            "container": "raw",
            "encoding": encoding,
            "sample_rate": sample_rate,
        }
-
-        try:
-            self._client = AsyncCartesia(api_key=self._api_key)
-            self._voice = self._client.voices.get(id=voice_id)
-        except Exception as e:
-            logger.exception(f"{self} initialization error: {e}")
+        self._client = None

    def can_generate_metrics(self) -> bool:
        return True

+    async def start(self, frame: StartFrame):
+        try:
+            self._client = AsyncCartesia(api_key=self._api_key)
+            self._voice = self._client.voices.get(id=self._voice_id)
+        except Exception as e:
+            logger.exception(f"{self} initialization error: {e}")
+
+    async def stop(self, frame: EndFrame):
+        if self._client:
+            await self._client.close()
+
+    async def cancel(self, frame: CancelFrame):
+        if self._client:
+            await self._client.close()
+
    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
        logger.debug(f"Generating TTS: [{text}]")