Merge pull request #2088 from pipecat-ai/khk/gemini-thinking-default

Turn off thinking for Gemini models by default
This commit is contained in:
Kwindla Hultman Kramer
2025-06-30 10:32:54 -07:00
committed by GitHub
5 changed files with 45 additions and 5 deletions

View File

@@ -61,7 +61,12 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
)
llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"))
llm = GoogleLLMService(
api_key=os.getenv("GOOGLE_API_KEY"),
model="gemini-2.5-flash",
# turn on thinking if you want it
# params=GoogleLLMService.InputParams(extra={"thinking_config": {"thinking_budget": 4096}}),
)
messages = [
{

View File

@@ -214,7 +214,12 @@ transport_params = {
async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
logger.info(f"Starting bot")
llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"), model="gemini-2.0-flash-001")
llm = GoogleLLMService(
api_key=os.getenv("GOOGLE_API_KEY"),
model="gemini-2.5-flash",
# turn on thinking if you want it
# params=GoogleLLMService.InputParams(extra={"thinking_config": {"thinking_budget": 4096}}),
)
tts = GoogleTTSService(
voice_id="en-US-Chirp3-HD-Charon",

View File

@@ -22,6 +22,7 @@ class LLMTokenUsage(BaseModel):
total_tokens: int
cache_read_input_tokens: Optional[int] = None
cache_creation_input_tokens: Optional[int] = None
reasoning_tokens: Optional[int] = None
class LLMUsageMetricsData(MetricsData):

View File

@@ -165,9 +165,12 @@ class FrameProcessorMetrics(BaseObject):
Returns:
MetricsFrame containing LLM usage data.
"""
logger.debug(
f"{self._processor_name()} prompt tokens: {tokens.prompt_tokens}, completion tokens: {tokens.completion_tokens}"
)
logstr = f"{self._processor_name()} prompt tokens: {tokens.prompt_tokens}, completion tokens: {tokens.completion_tokens}"
if tokens.cache_read_input_tokens:
logstr += f", cache read input tokens: {tokens.cache_read_input_tokens}"
if tokens.reasoning_tokens:
logstr += f", reasoning tokens: {tokens.reasoning_tokens}"
logger.debug(logstr)
value = LLMUsageMetricsData(
processor=self._processor_name(), model=self._model_name(), value=tokens
)

View File

@@ -638,6 +638,20 @@ class GoogleLLMService(LLMService):
def _create_client(self, api_key: str):
self._client = genai.Client(api_key=api_key)
def _maybe_unset_thinking_budget(self, generation_params: Dict[str, Any]) -> None:
    """Turn thinking off by default for models known to default it on.

    Mutates ``generation_params`` in place. When the configured model name
    starts with ``"gemini-2.5-flash"`` and the caller has not supplied a
    ``thinking_config`` entry, one is added with ``thinking_budget`` set to 0.
    Any failure is logged and swallowed so this adjustment never aborts the
    request being built.

    Args:
        generation_params: Generation parameters, updated in place.
    """
    try:
        # There is no way to introspect model capabilities, so match on the
        # names of models that we know default to thinking on and that can
        # be configured to turn it off.
        if not self._model_name.startswith("gemini-2.5-flash"):
            return
        # If thinking_config is already set, don't override it.
        if "thinking_config" in generation_params:
            return
        # The key is known to be absent at this point, so assign the config
        # directly rather than going through setdefault().
        generation_params["thinking_config"] = {"thinking_budget": 0}
    except Exception as e:
        logger.exception(f"Failed to unset thinking budget: {e}")
@traced_llm
async def _process_context(self, context: OpenAILLMContext):
await self.push_frame(LLMFullResponseStartFrame())
@@ -645,6 +659,8 @@ class GoogleLLMService(LLMService):
prompt_tokens = 0
completion_tokens = 0
total_tokens = 0
cache_read_input_tokens = 0
reasoning_tokens = 0
grounding_metadata = None
search_result = ""
@@ -684,6 +700,12 @@ class GoogleLLMService(LLMService):
if v is not None
}
if self._settings["extra"]:
generation_params.update(self._settings["extra"])
# possibly modify generation_params (in place) to set thinking to off by default
self._maybe_unset_thinking_budget(generation_params)
generation_config = (
GenerateContentConfig(**generation_params) if generation_params else None
)
@@ -703,6 +725,8 @@ class GoogleLLMService(LLMService):
prompt_tokens += chunk.usage_metadata.prompt_token_count or 0
completion_tokens += chunk.usage_metadata.candidates_token_count or 0
total_tokens += chunk.usage_metadata.total_token_count or 0
cache_read_input_tokens += chunk.usage_metadata.cached_content_token_count or 0
reasoning_tokens += chunk.usage_metadata.thoughts_token_count or 0
if not chunk.candidates:
continue
@@ -784,6 +808,8 @@ class GoogleLLMService(LLMService):
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=total_tokens,
cache_read_input_tokens=cache_read_input_tokens,
reasoning_tokens=reasoning_tokens,
)
)
await self.push_frame(LLMFullResponseEndFrame())