From 3afa30894fa067262cfa5722f96f36708ea33899 Mon Sep 17 00:00:00 2001 From: Kwindla Hultman Kramer Date: Sat, 28 Jun 2025 12:23:35 -0700 Subject: [PATCH] Turn off thinking for Gemini models by default --- .../foundational/07n-interruptible-google.py | 7 ++++- .../07s-interruptible-google-audio-in.py | 7 ++++- src/pipecat/metrics/metrics.py | 1 + .../metrics/frame_processor_metrics.py | 9 ++++--- src/pipecat/services/google/llm.py | 26 +++++++++++++++++++ 5 files changed, 45 insertions(+), 5 deletions(-) diff --git a/examples/foundational/07n-interruptible-google.py b/examples/foundational/07n-interruptible-google.py index e8613e082..319c97f78 100644 --- a/examples/foundational/07n-interruptible-google.py +++ b/examples/foundational/07n-interruptible-google.py @@ -61,7 +61,12 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"), ) - llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY")) + llm = GoogleLLMService( + api_key=os.getenv("GOOGLE_API_KEY"), + model="gemini-2.5-flash", + # turn on thinking if you want it + # params=GoogleLLMService.InputParams(extra={"thinking_config": {"thinking_budget": 4096}}), + ) messages = [ { diff --git a/examples/foundational/07s-interruptible-google-audio-in.py b/examples/foundational/07s-interruptible-google-audio-in.py index 9a7aa24b1..67701c53b 100644 --- a/examples/foundational/07s-interruptible-google-audio-in.py +++ b/examples/foundational/07s-interruptible-google-audio-in.py @@ -214,7 +214,12 @@ transport_params = { async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool): logger.info(f"Starting bot") - llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"), model="gemini-2.0-flash-001") + llm = GoogleLLMService( + api_key=os.getenv("GOOGLE_API_KEY"), + model="gemini-2.5-flash", + # turn on thinking if you want it + # params=GoogleLLMService.InputParams(extra={"thinking_config": {"thinking_budget": 
4096}}), + ) tts = GoogleTTSService( voice_id="en-US-Chirp3-HD-Charon", diff --git a/src/pipecat/metrics/metrics.py b/src/pipecat/metrics/metrics.py index 262254ffd..fbd5f9c8c 100644 --- a/src/pipecat/metrics/metrics.py +++ b/src/pipecat/metrics/metrics.py @@ -22,6 +22,7 @@ class LLMTokenUsage(BaseModel): total_tokens: int cache_read_input_tokens: Optional[int] = None cache_creation_input_tokens: Optional[int] = None + reasoning_tokens: Optional[int] = None class LLMUsageMetricsData(MetricsData): diff --git a/src/pipecat/processors/metrics/frame_processor_metrics.py b/src/pipecat/processors/metrics/frame_processor_metrics.py index cf08f85f6..c7033dd6e 100644 --- a/src/pipecat/processors/metrics/frame_processor_metrics.py +++ b/src/pipecat/processors/metrics/frame_processor_metrics.py @@ -103,9 +103,12 @@ class FrameProcessorMetrics(BaseObject): return MetricsFrame(data=[processing]) async def start_llm_usage_metrics(self, tokens: LLMTokenUsage): - logger.debug( - f"{self._processor_name()} prompt tokens: {tokens.prompt_tokens}, completion tokens: {tokens.completion_tokens}" - ) + logstr = f"{self._processor_name()} prompt tokens: {tokens.prompt_tokens}, completion tokens: {tokens.completion_tokens}" + if tokens.cache_read_input_tokens: + logstr += f", cache read input tokens: {tokens.cache_read_input_tokens}" + if tokens.reasoning_tokens: + logstr += f", reasoning tokens: {tokens.reasoning_tokens}" + logger.debug(logstr) value = LLMUsageMetricsData( processor=self._processor_name(), model=self._model_name(), value=tokens ) diff --git a/src/pipecat/services/google/llm.py b/src/pipecat/services/google/llm.py index 6b8f51f33..ad961cac1 100644 --- a/src/pipecat/services/google/llm.py +++ b/src/pipecat/services/google/llm.py @@ -634,6 +634,20 @@ class GoogleLLMService(LLMService): def _create_client(self, api_key: str): self._client = genai.Client(api_key=api_key) + def _maybe_unset_thinking_budget(self, generation_params: Dict[str, Any]): + try: + # There's no way to 
introspect on model capabilities, so + # we check for models that we know default to thinking on + # and can be configured to turn it off. + if not self._model_name.startswith("gemini-2.5-flash"): + return + # If thinking_config is already set, don't override it. + if "thinking_config" in generation_params: + return + generation_params.setdefault("thinking_config", {})["thinking_budget"] = 0 + except Exception as e: + logger.exception(f"Failed to unset thinking budget: {e}") + @traced_llm async def _process_context(self, context: OpenAILLMContext): await self.push_frame(LLMFullResponseStartFrame()) @@ -641,6 +655,8 @@ class GoogleLLMService(LLMService): prompt_tokens = 0 completion_tokens = 0 total_tokens = 0 + cache_read_input_tokens = 0 + reasoning_tokens = 0 grounding_metadata = None search_result = "" @@ -680,6 +696,12 @@ class GoogleLLMService(LLMService): if v is not None } + if self._settings["extra"]: + generation_params.update(self._settings["extra"]) + + # possibly modify generation_params (in place) to set thinking to off by default + self._maybe_unset_thinking_budget(generation_params) + generation_config = ( GenerateContentConfig(**generation_params) if generation_params else None ) @@ -699,6 +721,8 @@ class GoogleLLMService(LLMService): prompt_tokens += chunk.usage_metadata.prompt_token_count or 0 completion_tokens += chunk.usage_metadata.candidates_token_count or 0 total_tokens += chunk.usage_metadata.total_token_count or 0 + cache_read_input_tokens += chunk.usage_metadata.cached_content_token_count or 0 + reasoning_tokens += chunk.usage_metadata.thoughts_token_count or 0 if not chunk.candidates: continue @@ -780,6 +804,8 @@ class GoogleLLMService(LLMService): prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + cache_read_input_tokens=cache_read_input_tokens, + reasoning_tokens=reasoning_tokens, ) ) await self.push_frame(LLMFullResponseEndFrame())