diff --git a/examples/foundational/07n-interruptible-google.py b/examples/foundational/07n-interruptible-google.py index e8613e082..319c97f78 100644 --- a/examples/foundational/07n-interruptible-google.py +++ b/examples/foundational/07n-interruptible-google.py @@ -61,7 +61,12 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"), ) - llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY")) + llm = GoogleLLMService( + api_key=os.getenv("GOOGLE_API_KEY"), + model="gemini-2.5-flash", + # turn on thinking if you want it + # params=GoogleLLMService.InputParams(extra={"thinking_config": {"thinking_budget": 4096}}), + ) messages = [ { diff --git a/examples/foundational/07s-interruptible-google-audio-in.py b/examples/foundational/07s-interruptible-google-audio-in.py index 9a7aa24b1..67701c53b 100644 --- a/examples/foundational/07s-interruptible-google-audio-in.py +++ b/examples/foundational/07s-interruptible-google-audio-in.py @@ -214,7 +214,12 @@ transport_params = { async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool): logger.info(f"Starting bot") - llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"), model="gemini-2.0-flash-001") + llm = GoogleLLMService( + api_key=os.getenv("GOOGLE_API_KEY"), + model="gemini-2.5-flash", + # turn on thinking if you want it + # params=GoogleLLMService.InputParams(extra={"thinking_config": {"thinking_budget": 4096}}), + ) tts = GoogleTTSService( voice_id="en-US-Chirp3-HD-Charon", diff --git a/src/pipecat/metrics/metrics.py b/src/pipecat/metrics/metrics.py index 262254ffd..fbd5f9c8c 100644 --- a/src/pipecat/metrics/metrics.py +++ b/src/pipecat/metrics/metrics.py @@ -22,6 +22,7 @@ class LLMTokenUsage(BaseModel): total_tokens: int cache_read_input_tokens: Optional[int] = None cache_creation_input_tokens: Optional[int] = None + reasoning_tokens: Optional[int] = None class LLMUsageMetricsData(MetricsData): diff --git 
a/src/pipecat/processors/metrics/frame_processor_metrics.py b/src/pipecat/processors/metrics/frame_processor_metrics.py index 386164afe..fd93241ed 100644 --- a/src/pipecat/processors/metrics/frame_processor_metrics.py +++ b/src/pipecat/processors/metrics/frame_processor_metrics.py @@ -165,9 +165,12 @@ class FrameProcessorMetrics(BaseObject): Returns: MetricsFrame containing LLM usage data. """ - logger.debug( - f"{self._processor_name()} prompt tokens: {tokens.prompt_tokens}, completion tokens: {tokens.completion_tokens}" - ) + logstr = f"{self._processor_name()} prompt tokens: {tokens.prompt_tokens}, completion tokens: {tokens.completion_tokens}" + if tokens.cache_read_input_tokens: + logstr += f", cache read input tokens: {tokens.cache_read_input_tokens}" + if tokens.reasoning_tokens: + logstr += f", reasoning tokens: {tokens.reasoning_tokens}" + logger.debug(logstr) value = LLMUsageMetricsData( processor=self._processor_name(), model=self._model_name(), value=tokens ) diff --git a/src/pipecat/services/google/llm.py b/src/pipecat/services/google/llm.py index bd56b8416..86ed4dd88 100644 --- a/src/pipecat/services/google/llm.py +++ b/src/pipecat/services/google/llm.py @@ -638,6 +638,20 @@ class GoogleLLMService(LLMService): def _create_client(self, api_key: str): self._client = genai.Client(api_key=api_key) + def _maybe_unset_thinking_budget(self, generation_params: Dict[str, Any]): + try: + # There's no way to introspect on model capabilities, so + # we check for models that we know default to thinking on + # and can be configured to turn it off. + if not self._model_name.startswith("gemini-2.5-flash"): + return + # If thinking_config is already set, don't override it. 
+ if "thinking_config" in generation_params: + return + generation_params.setdefault("thinking_config", {})["thinking_budget"] = 0 + except Exception as e: + logger.exception(f"Failed to unset thinking budget: {e}") + @traced_llm async def _process_context(self, context: OpenAILLMContext): await self.push_frame(LLMFullResponseStartFrame()) @@ -645,6 +659,8 @@ class GoogleLLMService(LLMService): prompt_tokens = 0 completion_tokens = 0 total_tokens = 0 + cache_read_input_tokens = 0 + reasoning_tokens = 0 grounding_metadata = None search_result = "" @@ -684,6 +700,12 @@ class GoogleLLMService(LLMService): if v is not None } + if self._settings["extra"]: + generation_params.update(self._settings["extra"]) + + # possibly modify generation_params (in place) to set thinking to off by default + self._maybe_unset_thinking_budget(generation_params) + generation_config = ( GenerateContentConfig(**generation_params) if generation_params else None ) @@ -703,6 +725,8 @@ class GoogleLLMService(LLMService): prompt_tokens += chunk.usage_metadata.prompt_token_count or 0 completion_tokens += chunk.usage_metadata.candidates_token_count or 0 total_tokens += chunk.usage_metadata.total_token_count or 0 + cache_read_input_tokens += chunk.usage_metadata.cached_content_token_count or 0 + reasoning_tokens += chunk.usage_metadata.thoughts_token_count or 0 if not chunk.candidates: continue @@ -784,6 +808,8 @@ class GoogleLLMService(LLMService): prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + cache_read_input_tokens=cache_read_input_tokens, + reasoning_tokens=reasoning_tokens, ) ) await self.push_frame(LLMFullResponseEndFrame())