D'oh! My TTS "inter-frame-spaces" logic was *way* overcomplicated (and fundamentally mistaken, though it happened to work)

Now: - For TTS word-by-word output and `TTSSpeakFrame`s: `TTSTextFrame`s have `includes_inter_frame_spaces=False`. - For all other TTS output: `TTSTextFrame`s pass through the received text frames' `includes_inter_frame_spaces` value. So far, this value has always been `True`: LLMs send text chunks already containing all necessary spaces. - `LLMTextFrame`s set `includes_inter_frame_spaces=True` at init time, per the aforementioned assumption.
2025-11-14 21:39:18 -05:00
parent 35ff44b799
commit f3b254e335
28 changed files with 33 additions and 255 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,10 +12,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added `ElevenLabsRealtimeSTTService` which implements the Realtime STT
  service from ElevenLabs.

- Added a `TTSService.includes_inter_frame_spaces` property getter, so that TTS
-  services that subclass `TTSService` can indicate whether the text in the
-  `TTSTextFrame`s they push already contain any necessary inter-frame spaces.
-
 ### Changed

 - Updated all STT and TTS services to use consistent error handling pattern with
@@ -56,8 +52,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Added

- Added ai-coustics integrated VAD (`AICVADAnalyzer`) with `AICFilter` factory and 
-  example wiring; leverages the enhancement model for robust detection with no 
+- Added ai-coustics integrated VAD (`AICVADAnalyzer`) with `AICFilter` factory and
+  example wiring; leverages the enhancement model for robust detection with no
  ONNX dependency or added processing complexity.

 ## [0.0.94] - 2025-11-10
--- a/src/pipecat/frames/frames.py
+++ b/src/pipecat/frames/frames.py
@@ -352,7 +352,10 @@ class TextFrame(DataFrame):
 class LLMTextFrame(TextFrame):
    """Text frame generated by LLM services."""

-    pass
+    def __post_init__(self):
+        super().__post_init__()
+        # LLM services send text frames with all necessary spaces included
+        self.includes_inter_frame_spaces = True


@dataclass
--- a/src/pipecat/services/anthropic/llm.py
+++ b/src/pipecat/services/anthropic/llm.py
@@ -373,9 +373,7 @@ class AnthropicLLMService(LLMService):

                if event.type == "content_block_delta":
                    if hasattr(event.delta, "text"):
-                        frame = LLMTextFrame(event.delta.text)
-                        frame.includes_inter_frame_spaces = True
-                        await self.push_frame(frame)
+                        await self.push_frame(LLMTextFrame(event.delta.text))
                        completion_tokens_estimate += self._estimate_tokens(event.delta.text)
                    elif hasattr(event.delta, "partial_json") and tool_use_block:
                        json_accumulator += event.delta.partial_json
--- a/src/pipecat/services/asyncai/tts.py
+++ b/src/pipecat/services/asyncai/tts.py
@@ -146,15 +146,6 @@ class AsyncAITTSService(InterruptibleTTSService):
        """
        return True

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that AsyncAI TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that AsyncAI's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    def language_to_service_language(self, language: Language) -> Optional[str]:
        """Convert a Language enum to Async language format.

@@ -433,15 +424,6 @@ class AsyncAIHttpTTSService(TTSService):
        """
        return True

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that AsyncAI TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that AsyncAI's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    def language_to_service_language(self, language: Language) -> Optional[str]:
        """Convert a Language enum to Async language format.

--- a/src/pipecat/services/aws/llm.py
+++ b/src/pipecat/services/aws/llm.py
@@ -1078,9 +1078,7 @@ class AWSBedrockLLMService(LLMService):
                    if "contentBlockDelta" in event:
                        delta = event["contentBlockDelta"]["delta"]
                        if "text" in delta:
-                            frame = LLMTextFrame(delta["text"])
-                            frame.includes_inter_frame_spaces = True
-                            await self.push_frame(frame)
+                            await self.push_frame(LLMTextFrame(delta["text"]))
                            completion_tokens_estimate += self._estimate_tokens(delta["text"])
                        elif "toolUse" in delta and "input" in delta["toolUse"]:
                            # Handle partial JSON for tool use
--- a/src/pipecat/services/aws/tts.py
+++ b/src/pipecat/services/aws/tts.py
@@ -209,15 +209,6 @@ class AWSPollyTTSService(TTSService):
        """
        return True

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that AWS TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that AWS's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    def language_to_service_language(self, language: Language) -> Optional[str]:
        """Convert a Language enum to AWS Polly language format.

--- a/src/pipecat/services/azure/tts.py
+++ b/src/pipecat/services/azure/tts.py
@@ -151,15 +151,6 @@ class AzureBaseTTSService(TTSService):
        """
        return True

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that Azure TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that Azure's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    def language_to_service_language(self, language: Language) -> Optional[str]:
        """Convert a Language enum to Azure language format.

--- a/src/pipecat/services/deepgram/tts.py
+++ b/src/pipecat/services/deepgram/tts.py
@@ -79,15 +79,6 @@ class DeepgramTTSService(TTSService):
        """
        return True

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that Deepgram TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that Deepgram's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    @traced_tts
    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
        """Generate speech from text using Deepgram's TTS API.
@@ -177,15 +168,6 @@ class DeepgramHttpTTSService(TTSService):
        """
        return True

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that Deepgram TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that Deepgram's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    @traced_tts
    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
        """Generate speech from text using Deepgram's TTS API.
--- a/src/pipecat/services/fish/tts.py
+++ b/src/pipecat/services/fish/tts.py
@@ -159,15 +159,6 @@ class FishAudioTTSService(InterruptibleTTSService):
        """
        return True

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that Fish Audio TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that Fish Audio's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    async def set_model(self, model: str):
        """Set the TTS model and reconnect.

--- a/src/pipecat/services/google/gemini_live/llm.py
+++ b/src/pipecat/services/google/gemini_live/llm.py
@@ -1452,8 +1452,6 @@ class GeminiLiveLLMService(LLMService):
            self._bot_text_buffer += text
            self._search_result_buffer += text  # Also accumulate for grounding
            frame = LLMTextFrame(text=text)
-            # Gemini Live text already includes any necessary inter-chunk spaces
-            frame.includes_inter_frame_spaces = True
            await self.push_frame(frame)

        # Check for grounding metadata in server content
--- a/src/pipecat/services/google/llm.py
+++ b/src/pipecat/services/google/llm.py
@@ -920,9 +920,7 @@ class GoogleLLMService(LLMService):
                        for part in candidate.content.parts:
                            if not part.thought and part.text:
                                search_result += part.text
-                                frame = LLMTextFrame(part.text)
-                                frame.includes_inter_frame_spaces = True
-                                await self.push_frame(frame)
+                                await self.push_frame(LLMTextFrame(part.text))
                            elif part.function_call:
                                function_call = part.function_call
                                id = function_call.id or str(uuid.uuid4())
--- a/src/pipecat/services/google/tts.py
+++ b/src/pipecat/services/google/tts.py
@@ -596,15 +596,6 @@ class GoogleHttpTTSService(TTSService):
        """
        return True

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that Google TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that Google's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    def language_to_service_language(self, language: Language) -> Optional[str]:
        """Convert a Language enum to Google TTS language format.

@@ -803,15 +794,6 @@ class GoogleBaseTTSService(TTSService):
        """
        return True

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that Google and Gemini TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that Google's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    def language_to_service_language(self, language: Language) -> Optional[str]:
        """Convert a Language enum to Google TTS language format.

--- a/src/pipecat/services/groq/tts.py
+++ b/src/pipecat/services/groq/tts.py
@@ -111,15 +111,6 @@ class GroqTTSService(TTSService):
        """
        return True

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that Groq TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that Groq's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    @traced_tts
    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
        """Generate speech from text using Groq's TTS API.
--- a/src/pipecat/services/hume/tts.py
+++ b/src/pipecat/services/hume/tts.py
@@ -110,15 +110,6 @@ class HumeTTSService(TTSService):
        """
        return True

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that Hume TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that Hume's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    async def start(self, frame: StartFrame) -> None:
        """Start the service.

--- a/src/pipecat/services/inworld/tts.py
+++ b/src/pipecat/services/inworld/tts.py
@@ -250,15 +250,6 @@ class InworldTTSService(TTSService):
        """
        return True

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that Inworld TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that Inworld's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    async def start(self, frame: StartFrame):
        """Start the Inworld TTS service.

--- a/src/pipecat/services/lmnt/tts.py
+++ b/src/pipecat/services/lmnt/tts.py
@@ -124,15 +124,6 @@ class LmntTTSService(InterruptibleTTSService):
        """
        return True

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that LMNT TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that LMNT's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    def language_to_service_language(self, language: Language) -> Optional[str]:
        """Convert a Language enum to LMNT service language format.

--- a/src/pipecat/services/minimax/tts.py
+++ b/src/pipecat/services/minimax/tts.py
@@ -194,15 +194,6 @@ class MiniMaxHttpTTSService(TTSService):
        """
        return True

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that MiniMax TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that MiniMax's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    def language_to_service_language(self, language: Language) -> Optional[str]:
        """Convert a Language enum to MiniMax service language format.

--- a/src/pipecat/services/neuphonic/tts.py
+++ b/src/pipecat/services/neuphonic/tts.py
@@ -151,15 +151,6 @@ class NeuphonicTTSService(InterruptibleTTSService):
        """
        return True

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that Neuphonic TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that Neuphonic's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    def language_to_service_language(self, language: Language) -> Optional[str]:
        """Convert a Language enum to Neuphonic service language format.

@@ -449,15 +440,6 @@ class NeuphonicHttpTTSService(TTSService):
        """
        return True

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that Neuphonic TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that Neuphonic's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    def language_to_service_language(self, language: Language) -> Optional[str]:
        """Convert a Language enum to Neuphonic service language format.

--- a/src/pipecat/services/openai/base_llm.py
+++ b/src/pipecat/services/openai/base_llm.py
@@ -390,9 +390,7 @@ class BaseOpenAILLMService(LLMService):
                    # Keep iterating through the response to collect all the argument fragments
                    arguments += tool_call.function.arguments
            elif chunk.choices[0].delta.content:
-                frame = LLMTextFrame(chunk.choices[0].delta.content)
-                frame.includes_inter_frame_spaces = True
-                await self.push_frame(frame)
+                await self.push_frame(LLMTextFrame(chunk.choices[0].delta.content))

            # When gpt-4o-audio / gpt-4o-mini-audio is used for llm or stt+llm
            # we need to get LLMTextFrame for the transcript
--- a/src/pipecat/services/openai/realtime/llm.py
+++ b/src/pipecat/services/openai/realtime/llm.py
@@ -678,8 +678,6 @@ class OpenAIRealtimeLLMService(LLMService):
        # the output modality is "text"
        if evt.delta:
            frame = LLMTextFrame(evt.delta)
-            # OpenAI Realtime text already includes any necessary inter-chunk spaces
-            frame.includes_inter_frame_spaces = True
            await self.push_frame(frame)

    async def _handle_evt_audio_transcript_delta(self, evt):
--- a/src/pipecat/services/openai/tts.py
+++ b/src/pipecat/services/openai/tts.py
@@ -131,15 +131,6 @@ class OpenAITTSService(TTSService):
        """
        return True

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that OpenAI TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that OpenAI's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    async def set_model(self, model: str):
        """Set the TTS model to use.

--- a/src/pipecat/services/piper/tts.py
+++ b/src/pipecat/services/piper/tts.py
@@ -66,15 +66,6 @@ class PiperTTSService(TTSService):
        """
        return True

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that Piper TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that Piper's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    @traced_tts
    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
        """Generate speech from text using Piper's HTTP API.
--- a/src/pipecat/services/rime/tts.py
+++ b/src/pipecat/services/rime/tts.py
@@ -501,15 +501,6 @@ class RimeHttpTTSService(TTSService):
        """
        return True

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that Rime TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that Rime's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    def language_to_service_language(self, language: Language) -> str | None:
        """Convert pipecat language to Rime language code.

--- a/src/pipecat/services/riva/tts.py
+++ b/src/pipecat/services/riva/tts.py
@@ -113,15 +113,6 @@ class RivaTTSService(TTSService):
            riva.client.proto.riva_tts_pb2.RivaSynthesisConfigRequest()
        )

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that Riva TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that Riva's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    async def set_model(self, model: str):
        """Attempt to set the TTS model.

--- a/src/pipecat/services/sambanova/llm.py
+++ b/src/pipecat/services/sambanova/llm.py
@@ -176,9 +176,7 @@ class SambaNovaLLMService(OpenAILLMService):  # type: ignore
                    # Keep iterating through the response to collect all the argument fragments
                    arguments += tool_call.function.arguments
            elif chunk.choices[0].delta.content:
-                frame = LLMTextFrame(chunk.choices[0].delta.content)
-                frame.includes_inter_frame_spaces = True
-                await self.push_frame(frame)
+                await self.push_frame(LLMTextFrame(chunk.choices[0].delta.content))

            # When gpt-4o-audio / gpt-4o-mini-audio is used for llm or stt+llm
            # we need to get LLMTextFrame for the transcript
--- a/src/pipecat/services/sarvam/tts.py
+++ b/src/pipecat/services/sarvam/tts.py
@@ -195,15 +195,6 @@ class SarvamHttpTTSService(TTSService):
        """
        return True

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that Sarvam TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that Sarvam's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    def language_to_service_language(self, language: Language) -> Optional[str]:
        """Convert a Language enum to Sarvam AI language format.

@@ -467,15 +458,6 @@ class SarvamTTSService(InterruptibleTTSService):
        """
        return True

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that Sarvam TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that Sarvam's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    def language_to_service_language(self, language: Language) -> Optional[str]:
        """Convert a Language enum to Sarvam AI language format.

--- a/src/pipecat/services/speechmatics/tts.py
+++ b/src/pipecat/services/speechmatics/tts.py
@@ -105,15 +105,6 @@ class SpeechmaticsTTSService(TTSService):
        """
        return True

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates that Speechmatics TTSTextFrames include necessary inter-frame spaces.
-
-        Returns:
-            True, indicating that Speechmatics's text frames include necessary inter-frame spaces.
-        """
-        return True
-
    @traced_tts
    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
        """Generate speech from text using Speechmatics' HTTP API.
--- a/src/pipecat/services/tts_service.py
+++ b/src/pipecat/services/tts_service.py
@@ -142,6 +142,7 @@ class TTSService(AIService):
        self._voice_id: str = ""
        self._settings: Dict[str, Any] = {}
        self._text_aggregator: BaseTextAggregator = text_aggregator or SimpleTextAggregator()
+        self._aggregated_text_includes_inter_frame_spaces: bool = False
        self._text_filters: Sequence[BaseTextFilter] = text_filters or []
        self._transport_destination: Optional[str] = transport_destination
        self._tracing_enabled: bool = False
@@ -192,23 +193,6 @@ class TTSService(AIService):
        CHUNK_SECONDS = 0.5
        return int(self.sample_rate * CHUNK_SECONDS * 2)  # 2 bytes/sample

-    @property
-    def includes_inter_frame_spaces(self) -> bool:
-        """Indicates whether TTSTextFrames include necesary inter-frame spaces.
-
-        When True, the TTSTextFrame objects pushed by this service already
-        include all necessary spaces between subsequent frames. When False,
-        downstream processors (like the assistant context aggregator) may need
-        to add spacing.
-
-        Subclasses should override this property to return True if their text
-        generation process already includes necessary inter-frame spaces.
-
-        Returns:
-            False by default. Subclasses can override to return True.
-        """
-        return False
-
    async def set_model(self, model: str):
        """Set the TTS model to use.

@@ -369,9 +353,16 @@ class TTSService(AIService):
            await self._maybe_pause_frame_processing()

            sentence = self._text_aggregator.text
+            includes_inter_frame_spaces = self._aggregated_text_includes_inter_frame_spaces
+
+            # Reset aggregator state
            await self._text_aggregator.reset()
            self._processing_text = False
-            await self._push_tts_frames(sentence)
+            self._aggregated_text_includes_inter_frame_spaces = False
+
+            await self._push_tts_frames(
+                sentence, includes_inter_frame_spaces=includes_inter_frame_spaces
+            )
            if isinstance(frame, LLMFullResponseEndFrame):
                if self._push_text_frames:
                    await self.push_frame(frame, direction)
@@ -380,7 +371,8 @@ class TTSService(AIService):
        elif isinstance(frame, TTSSpeakFrame):
            # Store if we were processing text or not so we can set it back.
            processing_text = self._processing_text
-            await self._push_tts_frames(frame.text)
+            # Assumption: text in TTSSpeakFrame does not include inter-frame spaces
+            await self._push_tts_frames(frame.text, includes_inter_frame_spaces=False)
            # We pause processing incoming frames because we are sending data to
            # the TTS. We pause to avoid audio overlapping.
            await self._maybe_pause_frame_processing()
@@ -474,11 +466,17 @@ class TTSService(AIService):
            text = frame.text
        else:
            text = await self._text_aggregator.aggregate(frame.text)
+            # Assumption: whether inter-frame spaces are included shouldn't
+            # change during aggregation, so we can just use the latest frame's
+            # value
+            self._aggregated_text_includes_inter_frame_spaces = frame.includes_inter_frame_spaces

        if text:
-            await self._push_tts_frames(text)
+            await self._push_tts_frames(
+                text, includes_inter_frame_spaces=frame.includes_inter_frame_spaces
+            )

-    async def _push_tts_frames(self, text: str):
+    async def _push_tts_frames(self, text: str, includes_inter_frame_spaces: bool):
        # Remove leading newlines only
        text = text.lstrip("\n")

@@ -508,7 +506,7 @@ class TTSService(AIService):
            # We send the original text after the audio. This way, if we are
            # interrupted, the text is not added to the assistant context.
            frame = TTSTextFrame(text)
-            frame.includes_inter_frame_spaces = self.includes_inter_frame_spaces
+            frame.includes_inter_frame_spaces = includes_inter_frame_spaces
            await self.push_frame(frame)

    async def _stop_frame_handler(self):
@@ -635,6 +633,8 @@ class WordTTSService(TTSService):
                frame = TTSStoppedFrame()
                frame.pts = last_pts
            else:
+                # Assumption: word-by-word text frames don't include spaces, so
+                # we can rely on the default includes_inter_frame_spaces=False
                frame = TTSTextFrame(word)
                frame.pts = self._initial_word_timestamp + timestamp
            if frame: