Better handle Gemini non-function thought signatures

2025-12-08 15:56:40 -05:00
parent 1249ee3de3
commit 229ff794d6
2 changed files with 83 additions and 30 deletions
--- a/src/pipecat/adapters/services/gemini_adapter.py
+++ b/src/pipecat/adapters/services/gemini_adapter.py
@@ -209,7 +209,7 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
        system_instruction = None
        messages = []
        tool_call_id_to_name_mapping = {}
-        non_fn_signed_parts = []
+        non_fn_thought_signatures = []

        # Process each message, converting to Google format as needed
        for message in universal_context_messages:
@@ -230,16 +230,17 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
                    )
                    continue

-                # Special handling for non-function-call-related thought
-                # signature messages (Gemini 3 Pro mainly, but possibly others,
-                # too, especially when functions are involved in the
-                # conversation)
+                # Special handling for non-function-call-related thought-
+                # signature-containing messages
                if (
                    isinstance(message.message, dict)
                    and message.message.get("type") == "non_fn_thought_signature"
-                    and (signed_part := message.message.get("signed_part"))
+                    and (thought_signature := message.message.get("signature"))
+                    and (bookmark := message.message.get("bookmark"))
                ):
-                    non_fn_signed_parts.append(signed_part)
+                    non_fn_thought_signatures.append(
+                        {"signature": thought_signature, "bookmark": bookmark}
+                    )
                    continue

                # Fall back to assuming that the message is already in Google
@@ -269,7 +270,7 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):

        # Apply non-function-call-related thought signatures to the appropriate
        # messages
-        self._apply_non_function_thought_signatures_to_messages(non_fn_signed_parts, messages)
+        self._apply_non_function_thought_signatures_to_messages(non_fn_thought_signatures, messages)

        # Check if we only have function-related messages (no regular text)
        has_regular_messages = any(
@@ -476,21 +477,37 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
            break

    def _apply_non_function_thought_signatures_to_messages(
-        self, signed_parts: List[Part], messages: List[Content]
+        self, thought_signatures: List[dict], messages: List[Content]
    ) -> None:
        """Apply (optional, but recommended) non-function-call-related thought signatures to the last part of corresponding non-function-call assistant messages.

        Gemini 3 Pro (and, somewhat surprisingly, other models, too, when
-        functions are involved in the conversation) outputs a thought signature
+        functions are involved in the conversation) outputs thought signatures
        at the end of assistant responses.

        Args:
-            signed_parts: A list of signed received Parts containing thought signatures to apply.
+            thought_signatures: A list of dicts containing:
+                - "signature": a thought signature
+                - "bookmark": a bookmark to identify the message part to apply the signature to.
+                  The bookmark may contain either:
+                    - "text"
+                    - "inline_data"
            messages: List of messages to search through.
        """
-        if not signed_parts:
+        if not thought_signatures:
            return

+        # For debugging, log (at trace level) the thought signatures and their bookmarks
+        logger.trace(f"Thought signatures to apply: {len(thought_signatures)}")
+        for ts in thought_signatures:
+            bookmark = ts.get("bookmark")
+            if bookmark.get("text"):
+                text = bookmark["text"]
+                log_display_text = f"{text[:50]}..." if len(text) > 50 else text
+                logger.trace(f" - At text: {log_display_text}")
+            elif bookmark.get("inline_data"):
+                logger.trace(f" - At inline data")
+
        # Find all assistant (model) messages that aren't function calls
        non_fn_assistant_messages = []
        for message in messages:
@@ -507,9 +524,10 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
        # Apply thought signatures to the corresponding assistant messages
        # Match them using content heuristics, maintaining order (messages without signatures are skipped)
        message_start_index = 0  # Track where to start searching for the next match
-        for signed_part in signed_parts:
-            thought_signature = getattr(signed_part, "thought_signature", None)
-            if not thought_signature:
+        for thought_signature_dict in thought_signatures:
+            signature = thought_signature_dict.get("signature")
+            bookmark = thought_signature_dict.get("bookmark")
+            if not signature:
                continue

            # Search through remaining non-function assistant messages for a match
@@ -521,31 +539,41 @@ class GeminiLLMAdapter(BaseLLMAdapter[GeminiLLMInvocationParams]):
                last_part = message.parts[-1]
                matched = False

-                # Check if signed part has text and last message part text has the same text or
+                # If it's a text bookmark, check that the last message part text has the same text or
                # - is a prefix of that text (in case spoken text was truncated due to interruption)
-                # - is prefixed by that text (in case signed part was not the end of the assistant response...
-                #   which is NOT supposed to happen, according to Google's docs, but seems to, for long responses...)
-                if hasattr(signed_part, "text") and signed_part.text:
+                # - is prefixed by that text (in case the bookmark represents just the first chunk of multi-chunk text)
+                if bookmark_text := bookmark.get("text"):
                    if hasattr(last_part, "text") and last_part.text:
                        # Normalize whitespace for comparison
-                        signed_text = " ".join(signed_part.text.split())
+                        signed_text = " ".join(bookmark_text.split())
                        last_text = " ".join(last_part.text.split())
                        if (
                            last_text == signed_text
                            or signed_text.startswith(last_text)
                            or last_text.startswith(signed_text)
                        ):
-                            last_part.thought_signature = thought_signature
+                            log_display_text = (
+                                f"{last_part.text[:50]}..."
+                                if len(last_part.text) > 50
+                                else last_part.text
+                            )
+                            logger.trace(
+                                f"Applying thought signature to part with matching text: {log_display_text}"
+                            )
+                            last_part.thought_signature = signature
                            matched = True

                # Check if signed part has inline_data and last message part has matching inline_data
-                elif hasattr(signed_part, "inline_data") and signed_part.inline_data:
+                elif inline_data := bookmark.get("inline_data"):
                    if (
                        hasattr(last_part, "inline_data")
                        and last_part.inline_data
-                        and last_part.inline_data.data == signed_part.inline_data.data
+                        and last_part.inline_data.data == inline_data.data
                    ):
-                        last_part.thought_signature = thought_signature
+                        logger.trace(
+                            f"Applying thought signature to part with matching inline_data"
+                        )
+                        last_part.thought_signature = signature
                        matched = True

                # If we found a match, update start index and stop searching for this signed part
--- a/src/pipecat/services/google/llm.py
+++ b/src/pipecat/services/google/llm.py
@@ -942,6 +942,7 @@ class GoogleLLMService(LLMService):
            )

            function_calls = []
+            previous_part = None
            async for chunk in response:
                # Stop TTFB metrics after the first chunk
                await self.stop_ttfb_metrics()
@@ -1005,26 +1006,50 @@ class GoogleLLMService(LLMService):
                                )
                                await self.push_frame(frame)

-                            # With Gemini 3 Pro (and, somewhat surprisingly,
-                            # other models models, too, especially when
+                            # With Gemini 3 Pro (and, contrary to Google's
+                            # docs, other models, too, especially when
                            # functions are involved in the conversation),
-                            # thought signatures can be included in any kind of
-                            # part, not just function calls. It will come in
-                            # the last part of a response.
+                            # thought signatures can be associated with any
+                            # kind of Part, not just function calls.
+                            #
+                            # They should always be included in the last
+                            # response Part. (*)
+                            #
+                            # (*) Since we're using the streaming API, though,
+                            # where text Parts may be split across multiple
+                            # chunks (each represented by a Part, confusingly),
+                            # signatures may actually appear with the first
+                            # chunk (Gemini 2.5) or in a trailing empty-text
+                            # chunk (Gemini 3 Pro).
                            if part.thought_signature and not part.function_call:
+                                # Save a "bookmark" for the signature, so we
+                                # can later stick it in the right place in
+                                # context when sending it back to the LLM to
+                                # continue the conversation.
+                                bookmark = {}
+                                if part.inline_data and part.inline_data.data:
+                                    bookmark["inline_data"] = {"inline_data": part.inline_data}
+                                elif part.text is not None:
+                                    # Account for Gemini 3 Pro trailing
+                                    # empty-text chunk by using search_result,
+                                    # which accumulates all text so far.
+                                    bookmark["text"] = search_result
                                await self.push_frame(
                                    LLMMessagesAppendFrame(
                                        [
                                            self.get_llm_adapter().create_llm_specific_message(
                                                {
                                                    "type": "non_fn_thought_signature",
-                                                    "signed_part": part,
+                                                    "signature": part.thought_signature,
+                                                    "bookmark": bookmark,
                                                }
                                            )
                                        ]
                                    )
                                )

+                            previous_part = part
+
                    if (
                        candidate.grounding_metadata
                        and candidate.grounding_metadata.grounding_chunks