From 3afa30894fa067262cfa5722f96f36708ea33899 Mon Sep 17 00:00:00 2001
From: Kwindla Hultman Kramer <kwindla@daily.co>
Date: Sat, 28 Jun 2025 12:23:35 -0700
Subject: [PATCH] Turn off thinking for Gemini models by default

---
 .../foundational/07n-interruptible-google.py  |  7 ++++-
 .../07s-interruptible-google-audio-in.py      |  7 ++++-
 src/pipecat/metrics/metrics.py                |  1 +
 .../metrics/frame_processor_metrics.py        |  9 ++++---
 src/pipecat/services/google/llm.py            | 26 +++++++++++++++++++
 5 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/examples/foundational/07n-interruptible-google.py b/examples/foundational/07n-interruptible-google.py
index e8613e082..319c97f78 100644
--- a/examples/foundational/07n-interruptible-google.py
+++ b/examples/foundational/07n-interruptible-google.py
@@ -61,7 +61,12 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si
         credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
     )
 
-    llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"))
+    llm = GoogleLLMService(
+        api_key=os.getenv("GOOGLE_API_KEY"),
+        model="gemini-2.5-flash",
+        # turn on thinking if you want it
+        # params=GoogleLLMService.InputParams(extra={"thinking_config": {"thinking_budget": 4096}}),
+    )
 
     messages = [
         {
diff --git a/examples/foundational/07s-interruptible-google-audio-in.py b/examples/foundational/07s-interruptible-google-audio-in.py
index 9a7aa24b1..67701c53b 100644
--- a/examples/foundational/07s-interruptible-google-audio-in.py
+++ b/examples/foundational/07s-interruptible-google-audio-in.py
@@ -214,7 +214,12 @@ transport_params = {
 async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
     logger.info(f"Starting bot")
 
-    llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"), model="gemini-2.0-flash-001")
+    llm = GoogleLLMService(
+        api_key=os.getenv("GOOGLE_API_KEY"),
+        model="gemini-2.5-flash",
+        # turn on thinking if you want it
+        # params=GoogleLLMService.InputParams(extra={"thinking_config": {"thinking_budget": 4096}}),
+    )
 
     tts = GoogleTTSService(
         voice_id="en-US-Chirp3-HD-Charon",
diff --git a/src/pipecat/metrics/metrics.py b/src/pipecat/metrics/metrics.py
index 262254ffd..fbd5f9c8c 100644
--- a/src/pipecat/metrics/metrics.py
+++ b/src/pipecat/metrics/metrics.py
@@ -22,6 +22,7 @@ class LLMTokenUsage(BaseModel):
     total_tokens: int
     cache_read_input_tokens: Optional[int] = None
     cache_creation_input_tokens: Optional[int] = None
+    reasoning_tokens: Optional[int] = None
 
 
 class LLMUsageMetricsData(MetricsData):
diff --git a/src/pipecat/processors/metrics/frame_processor_metrics.py b/src/pipecat/processors/metrics/frame_processor_metrics.py
index cf08f85f6..c7033dd6e 100644
--- a/src/pipecat/processors/metrics/frame_processor_metrics.py
+++ b/src/pipecat/processors/metrics/frame_processor_metrics.py
@@ -103,9 +103,12 @@ class FrameProcessorMetrics(BaseObject):
         return MetricsFrame(data=[processing])
 
     async def start_llm_usage_metrics(self, tokens: LLMTokenUsage):
-        logger.debug(
-            f"{self._processor_name()} prompt tokens: {tokens.prompt_tokens}, completion tokens: {tokens.completion_tokens}"
-        )
+        logstr = f"{self._processor_name()} prompt tokens: {tokens.prompt_tokens}, completion tokens: {tokens.completion_tokens}"
+        if tokens.cache_read_input_tokens:
+            logstr += f", cache read input tokens: {tokens.cache_read_input_tokens}"
+        if tokens.reasoning_tokens:
+            logstr += f", reasoning tokens: {tokens.reasoning_tokens}"
+        logger.debug(logstr)
         value = LLMUsageMetricsData(
             processor=self._processor_name(), model=self._model_name(), value=tokens
         )
diff --git a/src/pipecat/services/google/llm.py b/src/pipecat/services/google/llm.py
index 6b8f51f33..ad961cac1 100644
--- a/src/pipecat/services/google/llm.py
+++ b/src/pipecat/services/google/llm.py
@@ -634,6 +634,20 @@ class GoogleLLMService(LLMService):
     def _create_client(self, api_key: str):
         self._client = genai.Client(api_key=api_key)
 
+    def _maybe_unset_thinking_budget(self, generation_params: Dict[str, Any]):
+        try:
+            # There's no way to introspect on model capabilities, so
+            # we check for models that we know default to thinking on
+            # and that can be configured to turn it off.
+            if not self._model_name.startswith("gemini-2.5-flash"):
+                return
+            # If thinking_config is already set, don't override it.
+            if "thinking_config" in generation_params:
+                return
+            generation_params.setdefault("thinking_config", {})["thinking_budget"] = 0
+        except Exception as e:
+            logger.exception(f"Failed to unset thinking budget: {e}")
+
     @traced_llm
     async def _process_context(self, context: OpenAILLMContext):
         await self.push_frame(LLMFullResponseStartFrame())
@@ -641,6 +655,8 @@ class GoogleLLMService(LLMService):
         prompt_tokens = 0
         completion_tokens = 0
         total_tokens = 0
+        cache_read_input_tokens = 0
+        reasoning_tokens = 0
 
         grounding_metadata = None
         search_result = ""
@@ -680,6 +696,12 @@ class GoogleLLMService(LLMService):
                 if v is not None
             }
 
+            if self._settings["extra"]:
+                generation_params.update(self._settings["extra"])
+
+            # possibly modify generation_params (in place) to set thinking to off by default
+            self._maybe_unset_thinking_budget(generation_params)
+
             generation_config = (
                 GenerateContentConfig(**generation_params) if generation_params else None
             )
@@ -699,6 +721,8 @@ class GoogleLLMService(LLMService):
                     prompt_tokens += chunk.usage_metadata.prompt_token_count or 0
                     completion_tokens += chunk.usage_metadata.candidates_token_count or 0
                     total_tokens += chunk.usage_metadata.total_token_count or 0
+                    cache_read_input_tokens += chunk.usage_metadata.cached_content_token_count or 0
+                    reasoning_tokens += chunk.usage_metadata.thoughts_token_count or 0
 
                 if not chunk.candidates:
                     continue
@@ -780,6 +804,8 @@ class GoogleLLMService(LLMService):
                     prompt_tokens=prompt_tokens,
                     completion_tokens=completion_tokens,
                     total_tokens=total_tokens,
+                    cache_read_input_tokens=cache_read_input_tokens,
+                    reasoning_tokens=reasoning_tokens,
                 )
             )
             await self.push_frame(LLMFullResponseEndFrame())