diff --git a/CHANGELOG.md b/CHANGELOG.md index aa07b2773..6394f744a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,11 +5,15 @@ All notable changes to **Pipecat** will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [0.0.93] - 2025-11-07 ### Added -- Added support for passing in a `ToolsSchem` in lieu of a list of provider- +- Added support for Sarvam Speech-to-Text service (`SarvamSTTService`) with + streaming WebSocket support for `saarika` (STT) and `saaras` (STT-translate) + models. + +- Added support for passing in a `ToolsSchema` in lieu of a list of provider- specific dicts when initializing `OpenAIRealtimeLLMService` or when updating it using `LLMUpdateSettingsFrame`. @@ -84,6 +88,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed +- Updated `simli-ai` to 0.1.25. + - Improved `concatenate_aggregated_text()` to one word outputs from OpenAI Realtime and Gemini Live. Text fragments are now correctly concatenated without spaces when these patterns are detected. @@ -114,6 +120,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 supported languages before Pipecat's service classes are updated, while still providing guidance on verified languages. +### Removed + +- Removed `needs_mcp_alternate_schema()` from `LLMService`. The mechanism that + relied on it went away. + ### Fixed - Restore backwards compatibility for vision/image features (broken in 0.0.92) @@ -137,18 +148,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed `GoogleLLMService` token counting to avoid double-counting tokens when Gemini sends usage metadata across multiple streaming chunks. -### Removed - -- Removed `needs_mcp_alternate_schema()` from `LLMService`. The mechanism that - relied on it went away. - ## [0.0.92] - 2025-10-31 🎃 "The Haunted Edition" 👻 ### Added -- Added supprt for Sarvam Speech-to-Text service (`SarvamSTTService`) with streaming WebSocket - support for `saarika` (STT) and `saaras` (STT-translate) models. - - Added a new `DeepgramHttpTTSService`, which delivers a meaningful reduction in latency when compared to the `DeepgramTTSService`. diff --git a/examples/foundational/07-interruptible-cartesia-http.py b/examples/foundational/07-interruptible-cartesia-http.py index d121e40ba..569443a79 100644 --- a/examples/foundational/07-interruptible-cartesia-http.py +++ b/examples/foundational/07-interruptible-cartesia-http.py @@ -21,8 +21,8 @@ from pipecat.processors.aggregators.llm_context import LLMContext from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair from pipecat.runner.types import RunnerArguments from pipecat.runner.utils import create_transport +from pipecat.services.cartesia.stt import CartesiaSTTService from pipecat.services.cartesia.tts import CartesiaHttpTTSService -from pipecat.services.deepgram.stt import DeepgramSTTService from pipecat.services.openai.llm import OpenAILLMService from pipecat.transports.base_transport import BaseTransport, TransportParams from pipecat.transports.daily.transport import DailyParams @@ -59,7 +59,7 @@ transport_params = { async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): logger.info(f"Starting bot") - stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) + stt = CartesiaSTTService(api_key=os.getenv("CARTESIA_API_KEY")) tts = CartesiaHttpTTSService( api_key=os.getenv("CARTESIA_API_KEY"), diff --git a/examples/foundational/07-interruptible.py b/examples/foundational/07-interruptible.py index 1e7bd5718..81ba692c7 100644 --- a/examples/foundational/07-interruptible.py +++ b/examples/foundational/07-interruptible.py @@ -21,8 +21,8 @@ from pipecat.processors.aggregators.llm_context import LLMContext from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair from pipecat.runner.types import RunnerArguments from pipecat.runner.utils import create_transport -from pipecat.services.cartesia.stt import CartesiaSTTService from pipecat.services.cartesia.tts import CartesiaTTSService +from pipecat.services.deepgram.stt import DeepgramSTTService from pipecat.services.openai.llm import OpenAILLMService from pipecat.transports.base_transport import BaseTransport, TransportParams from pipecat.transports.daily.transport import DailyParams @@ -58,7 +58,7 @@ transport_params = { async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): logger.info(f"Starting bot") - stt = CartesiaSTTService(api_key=os.getenv("CARTESIA_API_KEY")) + stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) tts = CartesiaTTSService( api_key=os.getenv("CARTESIA_API_KEY"), diff --git a/examples/foundational/14n-function-calling-perplexity.py b/examples/foundational/14n-function-calling-perplexity.py index 2f37768b7..32ce47150 100644 --- a/examples/foundational/14n-function-calling-perplexity.py +++ b/examples/foundational/14n-function-calling-perplexity.py @@ -77,7 +77,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): messages = [ { "role": "user", - "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", + "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way, but try to be brief.", }, ] diff --git a/pyproject.toml b/pyproject.toml index 5ce3e44b4..a203b47b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -99,9 +99,9 @@ local-smart-turn = [ "coremltools>=8.0", "transformers", "torch>=2.5.0,<3", "tor local-smart-turn-v3 = [ "transformers", "onnxruntime>=1.20.1,<2" ] remote-smart-turn = [] silero = [ "onnxruntime>=1.20.1,<2" ] -simli = [ "simli-ai~=0.1.10"] +simli = [ "simli-ai~=0.1.25"] soniox = [ "pipecat-ai[websockets-base]" ] -soundfile = [ "soundfile~=0.13.0" ] +soundfile = [ "soundfile~=0.13.1" ] speechmatics = [ "speechmatics-rt>=0.5.0" ] strands = [ "strands-agents>=1.9.1,<2" ] tavus=[] diff --git a/scripts/evals/eval.py b/scripts/evals/eval.py index e23a99420..306385988 100644 --- a/scripts/evals/eval.py +++ b/scripts/evals/eval.py @@ -244,10 +244,10 @@ async def run_eval_pipeline( llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY")) - llm.register_function("assert_eval", eval_runner.assert_eval) + llm.register_function("eval_function", eval_runner.assert_eval) eval_function = FunctionSchema( - name="assert_eval", + name="eval_function", description="Called when the user answers a question.", properties={ "result": { @@ -272,13 +272,15 @@ async def run_eval_pipeline( example_prompt, example_image = eval_config.prompt common_system_prompt = ( - "The user might say things other than the answer and that's allowed. " - f"You should only call the eval function when the user: {eval_config.eval}" + "You should only call the eval function if:\n" + "- The user explicitly attempts to answer the question, AND\n" + f"- Their answer can be cleanly evaluated using: {eval_config.eval}\n" + "Ignore greetings, comments, non-answers, or requests for clarification." ) if eval_config.eval_speaks_first: - system_prompt = f"You are an LLM eval, be extremly brief. You will start the conversation by saying: '{example_prompt}'. {common_system_prompt}" + system_prompt = f"You are an evaluation agent, be extremly brief. You will start the conversation by saying: '{example_prompt}'. {common_system_prompt}" else: - system_prompt = f"You are an LLM eval, be extremly brief. Your goal is to first ask one question: {example_prompt}. {common_system_prompt}" + system_prompt = f"You are an evaluation agent, be extremly brief. First, ask one question: {example_prompt}. {common_system_prompt}" messages = [ { diff --git a/scripts/evals/run-release-evals.py b/scripts/evals/run-release-evals.py index 5c66dd75d..da6df053d 100644 --- a/scripts/evals/run-release-evals.py +++ b/scripts/evals/run-release-evals.py @@ -180,7 +180,7 @@ TESTS_26 = [ ("26-gemini-live.py", EVAL_SIMPLE_MATH), ("26a-gemini-live-transcription.py", EVAL_SIMPLE_MATH), ("26b-gemini-live-function-calling.py", EVAL_WEATHER), - ("26c-gemini-live-video.py", EVAL_SIMPLE_MATH), + ("26c-gemini-live-video.py", EVAL_VISION_CAMERA), ("26e-gemini-live-google-search.py", EVAL_ONLINE_SEARCH), ("26h-gemini-live-vertex-function-calling.py", EVAL_WEATHER), # Currently not working. diff --git a/src/pipecat/services/simli/video.py b/src/pipecat/services/simli/video.py index 383a8a3cb..bac54f35b 100644 --- a/src/pipecat/services/simli/video.py +++ b/src/pipecat/services/simli/video.py @@ -158,14 +158,17 @@ class SimliVideoService(FrameProcessor): async def _start_connection(self): """Start the connection to Simli service and begin processing tasks.""" - if not self._initialized: - await self._simli_client.Initialize() - self._initialized = True + try: + if not self._initialized: + await self._simli_client.Initialize() + self._initialized = True - # Create task to consume and process audio and video - await self._simli_client.sendSilence() - self._audio_task = self.create_task(self._consume_and_process_audio()) - self._video_task = self.create_task(self._consume_and_process_video()) + # Create task to consume and process audio and video + await self._simli_client.sendSilence() + self._audio_task = self.create_task(self._consume_and_process_audio()) + self._video_task = self.create_task(self._consume_and_process_video()) + except Exception as e: + logger.error(f"{self}: unable to start connection: {e}") async def _consume_and_process_audio(self): """Consume audio frames from Simli and push them downstream.""" diff --git a/uv.lock b/uv.lock index 577b26399..0dc6c74f8 100644 --- a/uv.lock +++ b/uv.lock @@ -4727,8 +4727,8 @@ requires-dist = [ { name = "resampy", specifier = "~=0.4.3" }, { name = "sarvamai", marker = "extra == 'sarvam'", specifier = "==0.1.21" }, { name = "sentry-sdk", marker = "extra == 'sentry'", specifier = ">=2.28.0,<3" }, - { name = "simli-ai", marker = "extra == 'simli'", specifier = "~=0.1.10" }, - { name = "soundfile", marker = "extra == 'soundfile'", specifier = "~=0.13.0" }, + { name = "simli-ai", marker = "extra == 'simli'", specifier = "~=0.1.25" }, + { name = "soundfile", marker = "extra == 'soundfile'", specifier = "~=0.13.1" }, { name = "soxr", specifier = "~=0.5.0" }, { name = "speechmatics-rt", marker = "extra == 'speechmatics'", specifier = ">=0.5.0" }, { name = "strands-agents", marker = "extra == 'strands'", specifier = ">=1.9.1,<2" }, @@ -6496,7 +6496,7 @@ wheels = [ [[package]] name = "simli-ai" -version = "0.1.19" +version = "0.1.25" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiortc" }, @@ -6505,9 +6505,9 @@ dependencies = [ { name = "numpy" }, { name = "websockets" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/14/cf/bd31b76e00d2770a65081701108a39df2267cb585b0c2a000f71de790ee9/simli_ai-0.1.19.tar.gz", hash = "sha256:2ab8c6ec1e232dbf38c77d3920fe88b01acc7ba8d76b865fb5a3f4af968e3172", size = 12682, upload-time = "2025-09-23T14:14:30.263Z" } +sdist = { url = "https://files.pythonhosted.org/packages/64/6a/b28f90baf76f6a60865985f6233ff44abc72d45b66b76658bff3961e20a7/simli_ai-0.1.25.tar.gz", hash = "sha256:7a00b3426dc26a6a421641072c3e49014b7950c621cf4544152f35c58d13fcff", size = 13182, upload-time = "2025-11-06T16:27:08.862Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/63/86/fe2ed1b9d067634c4e0178c33080655c5c1f5b503fec20ac2af699238afb/simli_ai-0.1.19-py3-none-any.whl", hash = "sha256:35bcff89945dcb5f6171996d16d627e64981888c3134bdec7ce925680a17e058", size = 13233, upload-time = "2025-09-23T14:14:27.756Z" }, + { url = "https://files.pythonhosted.org/packages/ac/57/ae1032fd88214ea4ee6d3028c817c12a999eb90a67766bbab31e9819385a/simli_ai-0.1.25-py3-none-any.whl", hash = "sha256:7d01f65321dc9052f25e15d0463af6a20a86c6d37d9a7b3a2c4b01cbec0a54ed", size = 13651, upload-time = "2025-11-06T16:27:07.765Z" }, ] [[package]]