diff --git a/CHANGELOG.md b/CHANGELOG.md index 74506687d..a43553f0c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Added a `TTSService.includes_inter_frame_spaces` property getter, so that TTS + services that subclass `TTSService` can indicate whether the text in the + `TTSTextFrame`s they push already contain any necessary inter-frame spaces. + ### Fixed +- Fixed subtle issue of assistant context messages ending up with double spaces + between words or sentences. + +- Fixed an issue where `NeuphonicTTSService` wasn't pushing `TTSTextFrame`s, + meaning assistant messages weren't being written to context. + - Fixed an issue with OpenTelemetry where tracing wasn't correctly displaying LLM completions and tools when using the universal `LLMContext`. diff --git a/examples/foundational/07f-interruptible-azure-http.py b/examples/foundational/07f-interruptible-azure-http.py new file mode 100644 index 000000000..63971f156 --- /dev/null +++ b/examples/foundational/07f-interruptible-azure-http.py @@ -0,0 +1,135 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + + +import os + +from dotenv import load_dotenv +from loguru import logger + +from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams +from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3 +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.audio.vad.vad_analyzer import VADParams +from pipecat.frames.frames import LLMRunFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.aggregators.llm_context import LLMContext +from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair +from pipecat.runner.types import RunnerArguments +from pipecat.runner.utils import create_transport +from pipecat.services.azure.llm import AzureLLMService +from pipecat.services.azure.stt import AzureSTTService +from pipecat.services.azure.tts import AzureHttpTTSService +from pipecat.transports.base_transport import BaseTransport, TransportParams +from pipecat.transports.daily.transport import DailyParams +from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams + +load_dotenv(override=True) + +# We store functions so objects (e.g. SileroVADAnalyzer) don't get +# instantiated. The function will be called when the desired transport gets +# selected. +transport_params = { + "daily": lambda: DailyParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)), + turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()), + ), + "twilio": lambda: FastAPIWebsocketParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)), + turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()), + ), + "webrtc": lambda: TransportParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)), + turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()), + ), +} + + +async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): + logger.info(f"Starting bot") + + stt = AzureSTTService( + api_key=os.getenv("AZURE_SPEECH_API_KEY"), + region=os.getenv("AZURE_SPEECH_REGION"), + ) + + tts = AzureHttpTTSService( + api_key=os.getenv("AZURE_SPEECH_API_KEY"), + region=os.getenv("AZURE_SPEECH_REGION"), + ) + + llm = AzureLLMService( + api_key=os.getenv("AZURE_CHATGPT_API_KEY"), + endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"), + model=os.getenv("AZURE_CHATGPT_MODEL"), + ) + + messages = [ + { + "role": "system", + "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", + }, + ] + + context = LLMContext(messages) + context_aggregator = LLMContextAggregatorPair(context) + + pipeline = Pipeline( + [ + transport.input(), # Transport user input + stt, # STT + context_aggregator.user(), # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + context_aggregator.assistant(), # Assistant spoken responses + ] + ) + + task = PipelineTask( + pipeline, + params=PipelineParams( + enable_metrics=True, + enable_usage_metrics=True, + ), + idle_timeout_secs=runner_args.pipeline_idle_timeout_secs, + ) + + @transport.event_handler("on_client_connected") + async def on_client_connected(transport, client): + logger.info(f"Client connected") + # Kick off the conversation. + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) + await task.queue_frames([LLMRunFrame()]) + + @transport.event_handler("on_client_disconnected") + async def on_client_disconnected(transport, client): + logger.info(f"Client disconnected") + await task.cancel() + + runner = PipelineRunner(handle_sigint=runner_args.handle_sigint) + + await runner.run(task) + + +async def bot(runner_args: RunnerArguments): + """Main bot entry point compatible with Pipecat Cloud.""" + transport = await create_transport(runner_args, transport_params) + await run_bot(transport, runner_args) + + +if __name__ == "__main__": + from pipecat.runner.run import main + + main() diff --git a/examples/foundational/07n-interruptible-google-http.py b/examples/foundational/07n-interruptible-google-http.py new file mode 100644 index 000000000..136f09d7f --- /dev/null +++ b/examples/foundational/07n-interruptible-google-http.py @@ -0,0 +1,139 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + + +import os + +from dotenv import load_dotenv +from loguru import logger + +from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams +from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3 +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.audio.vad.vad_analyzer import VADParams +from pipecat.frames.frames import LLMRunFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.aggregators.llm_context import LLMContext +from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair +from pipecat.runner.types import RunnerArguments +from pipecat.runner.utils import create_transport +from pipecat.services.google.llm import GoogleLLMService +from pipecat.services.google.stt import GoogleSTTService +from pipecat.services.google.tts import GoogleHttpTTSService, GoogleTTSService +from pipecat.transcriptions.language import Language +from pipecat.transports.base_transport import BaseTransport, TransportParams +from pipecat.transports.daily.transport import DailyParams +from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams + +load_dotenv(override=True) + +# We store functions so objects (e.g. SileroVADAnalyzer) don't get +# instantiated. The function will be called when the desired transport gets +# selected. +transport_params = { + "daily": lambda: DailyParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)), + turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()), + ), + "twilio": lambda: FastAPIWebsocketParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)), + turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()), + ), + "webrtc": lambda: TransportParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)), + turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()), + ), +} + + +async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): + logger.info(f"Starting bot") + + stt = GoogleSTTService( + params=GoogleSTTService.InputParams(languages=Language.EN_US, model="chirp_3"), + credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"), + location="us", + ) + + tts = GoogleHttpTTSService( + voice_id="en-US-Chirp3-HD-Charon", + params=GoogleHttpTTSService.InputParams(language=Language.EN_US), + credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"), + ) + + llm = GoogleLLMService( + api_key=os.getenv("GOOGLE_API_KEY"), + model="gemini-2.5-flash", + # turn on thinking if you want it + # params=GoogleLLMService.InputParams(extra={"thinking_config": {"thinking_budget": 4096}}),) + ) + + messages = [ + { + "role": "system", + "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", + }, + ] + + context = LLMContext(messages) + context_aggregator = LLMContextAggregatorPair(context) + + pipeline = Pipeline( + [ + transport.input(), # Transport user input + stt, # STT + context_aggregator.user(), # User respones + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + context_aggregator.assistant(), # Assistant spoken responses + ] + ) + + task = PipelineTask( + pipeline, + params=PipelineParams( + enable_metrics=True, + enable_usage_metrics=True, + ), + idle_timeout_secs=runner_args.pipeline_idle_timeout_secs, + ) + + @transport.event_handler("on_client_connected") + async def on_client_connected(transport, client): + logger.info(f"Client connected") + # Kick off the conversation. + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) + await task.queue_frames([LLMRunFrame()]) + + @transport.event_handler("on_client_disconnected") + async def on_client_disconnected(transport, client): + logger.info(f"Client disconnected") + await task.cancel() + + runner = PipelineRunner(handle_sigint=runner_args.handle_sigint) + + await runner.run(task) + + +async def bot(runner_args: RunnerArguments): + """Main bot entry point compatible with Pipecat Cloud.""" + transport = await create_transport(runner_args, transport_params) + await run_bot(transport, runner_args) + + +if __name__ == "__main__": + from pipecat.runner.run import main + + main() diff --git a/examples/foundational/07z-interruptible-sarvam-http.py b/examples/foundational/07z-interruptible-sarvam-http.py index 0821167ef..4d06affe5 100644 --- a/examples/foundational/07z-interruptible-sarvam-http.py +++ b/examples/foundational/07z-interruptible-sarvam-http.py @@ -15,6 +15,7 @@ from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3 from pipecat.audio.vad.silero import SileroVADAnalyzer from pipecat.audio.vad.vad_analyzer import VADParams +from pipecat.frames.frames import LLMRunFrame from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask @@ -112,7 +113,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): logger.info(f"Client connected") # Kick off the conversation. messages.append({"role": "system", "content": "Please introduce yourself to the user."}) - await task.queue_frames([context_aggregator.user().get_context_frame()]) + await task.queue_frames([LLMRunFrame()]) @transport.event_handler("on_client_disconnected") async def on_client_disconnected(transport, client): diff --git a/src/pipecat/services/anthropic/llm.py b/src/pipecat/services/anthropic/llm.py index 2e3e0272d..a5c9ee791 100644 --- a/src/pipecat/services/anthropic/llm.py +++ b/src/pipecat/services/anthropic/llm.py @@ -373,7 +373,9 @@ class AnthropicLLMService(LLMService): if event.type == "content_block_delta": if hasattr(event.delta, "text"): - await self.push_frame(LLMTextFrame(event.delta.text)) + frame = LLMTextFrame(event.delta.text) + frame.includes_inter_frame_spaces = True + await self.push_frame(frame) completion_tokens_estimate += self._estimate_tokens(event.delta.text) elif hasattr(event.delta, "partial_json") and tool_use_block: json_accumulator += event.delta.partial_json diff --git a/src/pipecat/services/asyncai/tts.py b/src/pipecat/services/asyncai/tts.py index fe067e6b1..78cdd7ef8 100644 --- a/src/pipecat/services/asyncai/tts.py +++ b/src/pipecat/services/asyncai/tts.py @@ -146,6 +146,15 @@ class AsyncAITTSService(InterruptibleTTSService): """ return True + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that AsyncAI TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that AsyncAI's text frames include necessary inter-frame spaces. + """ + return True + def language_to_service_language(self, language: Language) -> Optional[str]: """Convert a Language enum to Async language format. @@ -420,6 +429,15 @@ class AsyncAIHttpTTSService(TTSService): """ return True + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that AsyncAI TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that AsyncAI's text frames include necessary inter-frame spaces. + """ + return True + def language_to_service_language(self, language: Language) -> Optional[str]: """Convert a Language enum to Async language format. diff --git a/src/pipecat/services/aws/llm.py b/src/pipecat/services/aws/llm.py index ccbac43b7..147a8c12a 100644 --- a/src/pipecat/services/aws/llm.py +++ b/src/pipecat/services/aws/llm.py @@ -1078,7 +1078,9 @@ class AWSBedrockLLMService(LLMService): if "contentBlockDelta" in event: delta = event["contentBlockDelta"]["delta"] if "text" in delta: - await self.push_frame(LLMTextFrame(delta["text"])) + frame = LLMTextFrame(delta["text"]) + frame.includes_inter_frame_spaces = True + await self.push_frame(frame) completion_tokens_estimate += self._estimate_tokens(delta["text"]) elif "toolUse" in delta and "input" in delta["toolUse"]: # Handle partial JSON for tool use diff --git a/src/pipecat/services/aws/nova_sonic/llm.py b/src/pipecat/services/aws/nova_sonic/llm.py index 4ad9d05ea..2572b03cb 100644 --- a/src/pipecat/services/aws/nova_sonic/llm.py +++ b/src/pipecat/services/aws/nova_sonic/llm.py @@ -1027,7 +1027,9 @@ class AWSNovaSonicLLMService(LLMService): logger.debug(f"Assistant response text added: {text}") # Report the text of the assistant response. - await self.push_frame(TTSTextFrame(text)) + frame = TTSTextFrame(text) + frame.includes_inter_frame_spaces = True + await self.push_frame(frame) # HACK: here we're also buffering the assistant text ourselves as a # backup rather than relying solely on the assistant context aggregator @@ -1060,7 +1062,9 @@ class AWSNovaSonicLLMService(LLMService): # TTSTextFrame would be ignored otherwise (the interruption frame # would have cleared the assistant aggregator state). await self.push_frame(LLMFullResponseStartFrame()) - await self.push_frame(TTSTextFrame(self._assistant_text_buffer)) + frame = TTSTextFrame(self._assistant_text_buffer) + frame.includes_inter_frame_spaces = True + await self.push_frame(frame) self._may_need_repush_assistant_text = False # Report the end of the assistant response. diff --git a/src/pipecat/services/aws/tts.py b/src/pipecat/services/aws/tts.py index f22c42399..cbc35b123 100644 --- a/src/pipecat/services/aws/tts.py +++ b/src/pipecat/services/aws/tts.py @@ -209,6 +209,15 @@ class AWSPollyTTSService(TTSService): """ return True + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that AWS TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that AWS's text frames include necessary inter-frame spaces. + """ + return True + def language_to_service_language(self, language: Language) -> Optional[str]: """Convert a Language enum to AWS Polly language format. diff --git a/src/pipecat/services/azure/tts.py b/src/pipecat/services/azure/tts.py index 15b4f1256..d0ae42796 100644 --- a/src/pipecat/services/azure/tts.py +++ b/src/pipecat/services/azure/tts.py @@ -151,6 +151,15 @@ class AzureBaseTTSService(TTSService): """ return True + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that Azure TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that Azure's text frames include necessary inter-frame spaces. + """ + return True + def language_to_service_language(self, language: Language) -> Optional[str]: """Convert a Language enum to Azure language format. diff --git a/src/pipecat/services/deepgram/tts.py b/src/pipecat/services/deepgram/tts.py index f3869c0ba..2c816e4a9 100644 --- a/src/pipecat/services/deepgram/tts.py +++ b/src/pipecat/services/deepgram/tts.py @@ -79,6 +79,15 @@ class DeepgramTTSService(TTSService): """ return True + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that Deepgram TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that Deepgram's text frames include necessary inter-frame spaces. + """ + return True + @traced_tts async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: """Generate speech from text using Deepgram's TTS API. @@ -168,6 +177,15 @@ class DeepgramHttpTTSService(TTSService): """ return True + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that Deepgram TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that Deepgram's text frames include necessary inter-frame spaces. + """ + return True + @traced_tts async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: """Generate speech from text using Deepgram's TTS API. diff --git a/src/pipecat/services/fish/tts.py b/src/pipecat/services/fish/tts.py index 669d2ce97..1abe6aca1 100644 --- a/src/pipecat/services/fish/tts.py +++ b/src/pipecat/services/fish/tts.py @@ -159,6 +159,15 @@ class FishAudioTTSService(InterruptibleTTSService): """ return True + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that Fish Audio TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that Fish Audio's text frames include necessary inter-frame spaces. + """ + return True + async def set_model(self, model: str): """Set the TTS model and reconnect. diff --git a/src/pipecat/services/google/llm.py b/src/pipecat/services/google/llm.py index 883932b76..ad5fd70a7 100644 --- a/src/pipecat/services/google/llm.py +++ b/src/pipecat/services/google/llm.py @@ -920,7 +920,9 @@ class GoogleLLMService(LLMService): for part in candidate.content.parts: if not part.thought and part.text: search_result += part.text - await self.push_frame(LLMTextFrame(part.text)) + frame = LLMTextFrame(part.text) + frame.includes_inter_frame_spaces = True + await self.push_frame(frame) elif part.function_call: function_call = part.function_call id = function_call.id or str(uuid.uuid4()) diff --git a/src/pipecat/services/google/tts.py b/src/pipecat/services/google/tts.py index bfbbd8a3c..bd3dbc203 100644 --- a/src/pipecat/services/google/tts.py +++ b/src/pipecat/services/google/tts.py @@ -606,6 +606,15 @@ class GoogleTTSService(TTSService): """ return True + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that Google TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that Google's text frames include necessary inter-frame spaces. + """ + return True + def language_to_service_language(self, language: Language) -> Optional[str]: """Convert a Language enum to Google TTS language format. @@ -840,6 +849,15 @@ class GeminiTTSService(TTSService): """ return True + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that Gemini TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that Gemini's text frames include necessary inter-frame spaces. + """ + return True + def language_to_service_language(self, language: Language) -> Optional[str]: """Convert a Language enum to Gemini TTS language format. diff --git a/src/pipecat/services/groq/tts.py b/src/pipecat/services/groq/tts.py index 68ba4a598..6bd49d1a3 100644 --- a/src/pipecat/services/groq/tts.py +++ b/src/pipecat/services/groq/tts.py @@ -105,6 +105,15 @@ class GroqTTSService(TTSService): """ return True + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that Groq TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that Groq's text frames include necessary inter-frame spaces. + """ + return True + @traced_tts async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: """Generate speech from text using Groq's TTS API. diff --git a/src/pipecat/services/hume/tts.py b/src/pipecat/services/hume/tts.py index 34947fb44..f8b2bbf27 100644 --- a/src/pipecat/services/hume/tts.py +++ b/src/pipecat/services/hume/tts.py @@ -110,6 +110,15 @@ class HumeTTSService(TTSService): """ return True + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that Hume TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that Hume's text frames include necessary inter-frame spaces. + """ + return True + async def start(self, frame: StartFrame) -> None: """Start the service. diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py index eef1440e3..9bb939518 100644 --- a/src/pipecat/services/inworld/tts.py +++ b/src/pipecat/services/inworld/tts.py @@ -250,6 +250,15 @@ class InworldTTSService(TTSService): """ return True + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that Inworld TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that Inworld's text frames include necessary inter-frame spaces. + """ + return True + async def start(self, frame: StartFrame): """Start the Inworld TTS service. diff --git a/src/pipecat/services/lmnt/tts.py b/src/pipecat/services/lmnt/tts.py index f71e2a186..538c1ef93 100644 --- a/src/pipecat/services/lmnt/tts.py +++ b/src/pipecat/services/lmnt/tts.py @@ -124,6 +124,15 @@ class LmntTTSService(InterruptibleTTSService): """ return True + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that LMNT TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that LMNT's text frames include necessary inter-frame spaces. + """ + return True + def language_to_service_language(self, language: Language) -> Optional[str]: """Convert a Language enum to LMNT service language format. diff --git a/src/pipecat/services/minimax/tts.py b/src/pipecat/services/minimax/tts.py index c1a8abb99..c0a6b6aaa 100644 --- a/src/pipecat/services/minimax/tts.py +++ b/src/pipecat/services/minimax/tts.py @@ -194,6 +194,15 @@ class MiniMaxHttpTTSService(TTSService): """ return True + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that MiniMax TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that MiniMax's text frames include necessary inter-frame spaces. + """ + return True + def language_to_service_language(self, language: Language) -> Optional[str]: """Convert a Language enum to MiniMax service language format. diff --git a/src/pipecat/services/neuphonic/tts.py b/src/pipecat/services/neuphonic/tts.py index 3449dea0c..22a6e9999 100644 --- a/src/pipecat/services/neuphonic/tts.py +++ b/src/pipecat/services/neuphonic/tts.py @@ -117,7 +117,6 @@ class NeuphonicTTSService(InterruptibleTTSService): """ super().__init__( aggregate_sentences=aggregate_sentences, - push_text_frames=False, push_stop_frames=True, stop_frame_timeout_s=2.0, sample_rate=sample_rate, @@ -152,6 +151,15 @@ class NeuphonicTTSService(InterruptibleTTSService): """ return True + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that Neuphonic TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that Neuphonic's text frames include necessary inter-frame spaces. + """ + return True + def language_to_service_language(self, language: Language) -> Optional[str]: """Convert a Language enum to Neuphonic service language format. @@ -437,6 +445,15 @@ class NeuphonicHttpTTSService(TTSService): """ return True + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that Neuphonic TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that Neuphonic's text frames include necessary inter-frame spaces. + """ + return True + def language_to_service_language(self, language: Language) -> Optional[str]: """Convert a Language enum to Neuphonic service language format. diff --git a/src/pipecat/services/openai/base_llm.py b/src/pipecat/services/openai/base_llm.py index d020e1106..5a8c1ab31 100644 --- a/src/pipecat/services/openai/base_llm.py +++ b/src/pipecat/services/openai/base_llm.py @@ -390,7 +390,9 @@ class BaseOpenAILLMService(LLMService): # Keep iterating through the response to collect all the argument fragments arguments += tool_call.function.arguments elif chunk.choices[0].delta.content: - await self.push_frame(LLMTextFrame(chunk.choices[0].delta.content)) + frame = LLMTextFrame(chunk.choices[0].delta.content) + frame.includes_inter_frame_spaces = True + await self.push_frame(frame) # When gpt-4o-audio / gpt-4o-mini-audio is used for llm or stt+llm # we need to get LLMTextFrame for the transcript diff --git a/src/pipecat/services/piper/tts.py b/src/pipecat/services/piper/tts.py index fa43a720c..73addb9d1 100644 --- a/src/pipecat/services/piper/tts.py +++ b/src/pipecat/services/piper/tts.py @@ -66,6 +66,15 @@ class PiperTTSService(TTSService): """ return True + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that Piper TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that Piper's text frames include necessary inter-frame spaces. + """ + return True + @traced_tts async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: """Generate speech from text using Piper's HTTP API. diff --git a/src/pipecat/services/rime/tts.py b/src/pipecat/services/rime/tts.py index f0dd6b297..0ac37c471 100644 --- a/src/pipecat/services/rime/tts.py +++ b/src/pipecat/services/rime/tts.py @@ -496,6 +496,15 @@ class RimeHttpTTSService(TTSService): """ return True + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that Rime TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that Rime's text frames include necessary inter-frame spaces. + """ + return True + def language_to_service_language(self, language: Language) -> str | None: """Convert pipecat language to Rime language code. diff --git a/src/pipecat/services/riva/tts.py b/src/pipecat/services/riva/tts.py index 3554c5558..d051965ed 100644 --- a/src/pipecat/services/riva/tts.py +++ b/src/pipecat/services/riva/tts.py @@ -112,6 +112,15 @@ class RivaTTSService(TTSService): riva.client.proto.riva_tts_pb2.RivaSynthesisConfigRequest() ) + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that Riva TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that Riva's text frames include necessary inter-frame spaces. + """ + return True + async def set_model(self, model: str): """Attempt to set the TTS model. diff --git a/src/pipecat/services/sambanova/llm.py b/src/pipecat/services/sambanova/llm.py index 5ed600457..76f11e81c 100644 --- a/src/pipecat/services/sambanova/llm.py +++ b/src/pipecat/services/sambanova/llm.py @@ -176,7 +176,9 @@ class SambaNovaLLMService(OpenAILLMService): # type: ignore # Keep iterating through the response to collect all the argument fragments arguments += tool_call.function.arguments elif chunk.choices[0].delta.content: - await self.push_frame(LLMTextFrame(chunk.choices[0].delta.content)) + frame = LLMTextFrame(chunk.choices[0].delta.content) + frame.includes_inter_frame_spaces = True + await self.push_frame(frame) # When gpt-4o-audio / gpt-4o-mini-audio is used for llm or stt+llm # we need to get LLMTextFrame for the transcript diff --git a/src/pipecat/services/sarvam/tts.py b/src/pipecat/services/sarvam/tts.py index e8582227a..9ff037938 100644 --- a/src/pipecat/services/sarvam/tts.py +++ b/src/pipecat/services/sarvam/tts.py @@ -195,6 +195,15 @@ class SarvamHttpTTSService(TTSService): """ return True + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that Sarvam TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that Sarvam's text frames include necessary inter-frame spaces. + """ + return True + def language_to_service_language(self, language: Language) -> Optional[str]: """Convert a Language enum to Sarvam AI language format. @@ -458,6 +467,15 @@ class SarvamTTSService(InterruptibleTTSService): """ return True + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that Sarvam TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that Sarvam's text frames include necessary inter-frame spaces. + """ + return True + def language_to_service_language(self, language: Language) -> Optional[str]: """Convert a Language enum to Sarvam AI language format. diff --git a/src/pipecat/services/speechmatics/tts.py b/src/pipecat/services/speechmatics/tts.py index b8fe172e7..e115d5a7c 100644 --- a/src/pipecat/services/speechmatics/tts.py +++ b/src/pipecat/services/speechmatics/tts.py @@ -105,6 +105,15 @@ class SpeechmaticsTTSService(TTSService): """ return True + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates that Speechmatics TTSTextFrames include necessary inter-frame spaces. + + Returns: + True, indicating that Speechmatics's text frames include necessary inter-frame spaces. + """ + return True + @traced_tts async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: """Generate speech from text using Speechmatics' HTTP API. diff --git a/src/pipecat/services/tts_service.py b/src/pipecat/services/tts_service.py index b356c7244..29c54f497 100644 --- a/src/pipecat/services/tts_service.py +++ b/src/pipecat/services/tts_service.py @@ -192,6 +192,23 @@ class TTSService(AIService): CHUNK_SECONDS = 0.5 return int(self.sample_rate * CHUNK_SECONDS * 2) # 2 bytes/sample + @property + def includes_inter_frame_spaces(self) -> bool: + """Indicates whether TTSTextFrames include necesary inter-frame spaces. + + When True, the TTSTextFrame objects pushed by this service already + include all necessary spaces between subsequent frames. When False, + downstream processors (like the assistant context aggregator) may need + to add spacing. + + Subclasses should override this property to return True if their text + generation process already includes necessary inter-frame spaces. + + Returns: + False by default. Subclasses can override to return True. + """ + return False + async def set_model(self, model: str): """Set the TTS model to use. @@ -490,7 +507,9 @@ class TTSService(AIService): if self._push_text_frames: # We send the original text after the audio. This way, if we are # interrupted, the text is not added to the assistant context. - await self.push_frame(TTSTextFrame(text)) + frame = TTSTextFrame(text) + frame.includes_inter_frame_spaces = self.includes_inter_frame_spaces + await self.push_frame(frame) async def _stop_frame_handler(self): has_started = False