Merge pull request #2088 from pipecat-ai/khk/gemini-thinking-default

Turn off thinking for Gemini models by default
2025-06-30 10:32:54 -07:00
parent 0ecfa827e6 55cfea776f
commit 224d2cedc8
5 changed files with 45 additions and 5 deletions
--- a/examples/foundational/07n-interruptible-google.py
+++ b/examples/foundational/07n-interruptible-google.py
@@ -61,7 +61,12 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si
        credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
    )

-    llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"))
+    llm = GoogleLLMService(
+        api_key=os.getenv("GOOGLE_API_KEY"),
+        model="gemini-2.5-flash",
+        # turn on thinking if you want it
+        # params=GoogleLLMService.InputParams(extra={"thinking_config": {"thinking_budget": 4096}}),
+    )

    messages = [
        {
--- a/examples/foundational/07s-interruptible-google-audio-in.py
+++ b/examples/foundational/07s-interruptible-google-audio-in.py
@@ -214,7 +214,12 @@ transport_params = {
 async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
    logger.info(f"Starting bot")

-    llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"), model="gemini-2.0-flash-001")
+    llm = GoogleLLMService(
+        api_key=os.getenv("GOOGLE_API_KEY"),
+        model="gemini-2.5-flash",
+        # turn on thinking if you want it
+        # params=GoogleLLMService.InputParams(extra={"thinking_config": {"thinking_budget": 4096}}),
+    )

    tts = GoogleTTSService(
        voice_id="en-US-Chirp3-HD-Charon",
--- a/src/pipecat/metrics/metrics.py
+++ b/src/pipecat/metrics/metrics.py
@@ -22,6 +22,7 @@ class LLMTokenUsage(BaseModel):
    total_tokens: int
    cache_read_input_tokens: Optional[int] = None
    cache_creation_input_tokens: Optional[int] = None
+    reasoning_tokens: Optional[int] = None


 class LLMUsageMetricsData(MetricsData):
--- a/src/pipecat/processors/metrics/frame_processor_metrics.py
+++ b/src/pipecat/processors/metrics/frame_processor_metrics.py
@@ -165,9 +165,12 @@ class FrameProcessorMetrics(BaseObject):
        Returns:
            MetricsFrame containing LLM usage data.
        """
-        logger.debug(
-            f"{self._processor_name()} prompt tokens: {tokens.prompt_tokens}, completion tokens: {tokens.completion_tokens}"
-        )
+        logstr = f"{self._processor_name()} prompt tokens: {tokens.prompt_tokens}, completion tokens: {tokens.completion_tokens}"
+        if tokens.cache_read_input_tokens:
+            logstr += f", cache read input tokens: {tokens.cache_read_input_tokens}"
+        if tokens.reasoning_tokens:
+            logstr += f", reasoning tokens: {tokens.reasoning_tokens}"
+        logger.debug(logstr)
        value = LLMUsageMetricsData(
            processor=self._processor_name(), model=self._model_name(), value=tokens
        )
--- a/src/pipecat/services/google/llm.py
+++ b/src/pipecat/services/google/llm.py
@@ -638,6 +638,20 @@ class GoogleLLMService(LLMService):
    def _create_client(self, api_key: str):
        self._client = genai.Client(api_key=api_key)

+    def _maybe_unset_thinking_budget(self, generation_params: Dict[str, Any]):
+        try:
+            # There's no way to introspect model capabilities, so we
+            # check by name prefix for models that we know default to
+            # thinking on and can be configured to turn it off.
+            if not self._model_name.startswith("gemini-2.5-flash"):
+                return
+            # If thinking_config is already set, don't override it.
+            if "thinking_config" in generation_params:
+                return
+            generation_params.setdefault("thinking_config", {})["thinking_budget"] = 0  # mutates the caller's dict; a budget of 0 turns thinking off
+        except Exception as e:
+            logger.exception(f"Failed to unset thinking budget: {e}")
+
    @traced_llm
    async def _process_context(self, context: OpenAILLMContext):
        await self.push_frame(LLMFullResponseStartFrame())
@@ -645,6 +659,8 @@ class GoogleLLMService(LLMService):
        prompt_tokens = 0
        completion_tokens = 0
        total_tokens = 0
+        cache_read_input_tokens = 0
+        reasoning_tokens = 0

        grounding_metadata = None
        search_result = ""
@@ -684,6 +700,12 @@ class GoogleLLMService(LLMService):
                if v is not None
            }

+            if self._settings["extra"]:
+                generation_params.update(self._settings["extra"])
+
+            # possibly modify generation_params (in place) to set thinking to off by default
+            self._maybe_unset_thinking_budget(generation_params)
+
            generation_config = (
                GenerateContentConfig(**generation_params) if generation_params else None
            )
@@ -703,6 +725,8 @@ class GoogleLLMService(LLMService):
                    prompt_tokens += chunk.usage_metadata.prompt_token_count or 0
                    completion_tokens += chunk.usage_metadata.candidates_token_count or 0
                    total_tokens += chunk.usage_metadata.total_token_count or 0
+                    cache_read_input_tokens += chunk.usage_metadata.cached_content_token_count or 0
+                    reasoning_tokens += chunk.usage_metadata.thoughts_token_count or 0

                if not chunk.candidates:
                    continue
@@ -784,6 +808,8 @@ class GoogleLLMService(LLMService):
                    prompt_tokens=prompt_tokens,
                    completion_tokens=completion_tokens,
                    total_tokens=total_tokens,
+                    cache_read_input_tokens=cache_read_input_tokens,
+                    reasoning_tokens=reasoning_tokens,
                )
            )
            await self.push_frame(LLMFullResponseEndFrame())