Merge pull request #2088 from pipecat-ai/khk/gemini-thinking-default
Turn off thinking for Gemini models by default
This commit is contained in:
@@ -61,7 +61,12 @@ async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_si
|
||||
credentials=os.getenv("GOOGLE_TEST_CREDENTIALS"),
|
||||
)
|
||||
|
||||
llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"))
|
||||
llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
model="gemini-2.5-flash",
|
||||
# turn on thinking if you want it
|
||||
# params=GoogleLLMService.InputParams(extra={"thinking_config": {"thinking_budget": 4096}}),
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
|
||||
@@ -214,7 +214,12 @@ transport_params = {
|
||||
async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"), model="gemini-2.0-flash-001")
|
||||
llm = GoogleLLMService(
|
||||
api_key=os.getenv("GOOGLE_API_KEY"),
|
||||
model="gemini-2.5-flash",
|
||||
# turn on thinking if you want it
|
||||
# params=GoogleLLMService.InputParams(extra={"thinking_config": {"thinking_budget": 4096}}),
|
||||
)
|
||||
|
||||
tts = GoogleTTSService(
|
||||
voice_id="en-US-Chirp3-HD-Charon",
|
||||
|
||||
@@ -22,6 +22,7 @@ class LLMTokenUsage(BaseModel):
|
||||
total_tokens: int
|
||||
cache_read_input_tokens: Optional[int] = None
|
||||
cache_creation_input_tokens: Optional[int] = None
|
||||
reasoning_tokens: Optional[int] = None
|
||||
|
||||
|
||||
class LLMUsageMetricsData(MetricsData):
|
||||
|
||||
@@ -165,9 +165,12 @@ class FrameProcessorMetrics(BaseObject):
|
||||
Returns:
|
||||
MetricsFrame containing LLM usage data.
|
||||
"""
|
||||
logger.debug(
|
||||
f"{self._processor_name()} prompt tokens: {tokens.prompt_tokens}, completion tokens: {tokens.completion_tokens}"
|
||||
)
|
||||
logstr = f"{self._processor_name()} prompt tokens: {tokens.prompt_tokens}, completion tokens: {tokens.completion_tokens}"
|
||||
if tokens.cache_read_input_tokens:
|
||||
logstr += f", cache read input tokens: {tokens.cache_read_input_tokens}"
|
||||
if tokens.reasoning_tokens:
|
||||
logstr += f", reasoning tokens: {tokens.reasoning_tokens}"
|
||||
logger.debug(logstr)
|
||||
value = LLMUsageMetricsData(
|
||||
processor=self._processor_name(), model=self._model_name(), value=tokens
|
||||
)
|
||||
|
||||
@@ -638,6 +638,20 @@ class GoogleLLMService(LLMService):
|
||||
def _create_client(self, api_key: str):
|
||||
self._client = genai.Client(api_key=api_key)
|
||||
|
||||
def _maybe_unset_thinking_budget(self, generation_params: Dict[str, Any]):
    """Turn thinking off by default for models known to enable it.

    Mutates ``generation_params`` in place. When the configured model name
    starts with ``"gemini-2.5-flash"`` and the caller has not supplied a
    ``"thinking_config"`` entry, one is added with ``thinking_budget`` set
    to 0. In every other case the dict is left untouched, so an explicit
    ``thinking_config`` (for example one passed through the service's
    ``extra`` settings) always wins.

    Args:
        generation_params: Generation settings dict; modified in place.
    """
    try:
        # There's no way to introspect model capabilities, so we check by
        # name prefix for the models that we know default to thinking on
        # and that can be configured to turn it off.
        if not self._model_name.startswith("gemini-2.5-flash"):
            return
        # If thinking_config is already set, don't override it.
        if "thinking_config" in generation_params:
            return
        # The key is known to be absent here, so assign directly.
        generation_params["thinking_config"] = {"thinking_budget": 0}
    except Exception as e:
        # Best effort: log and continue rather than propagate the failure.
        logger.exception(f"Failed to unset thinking budget: {e}")
|
||||
|
||||
@traced_llm
|
||||
async def _process_context(self, context: OpenAILLMContext):
|
||||
await self.push_frame(LLMFullResponseStartFrame())
|
||||
@@ -645,6 +659,8 @@ class GoogleLLMService(LLMService):
|
||||
prompt_tokens = 0
|
||||
completion_tokens = 0
|
||||
total_tokens = 0
|
||||
cache_read_input_tokens = 0
|
||||
reasoning_tokens = 0
|
||||
|
||||
grounding_metadata = None
|
||||
search_result = ""
|
||||
@@ -684,6 +700,12 @@ class GoogleLLMService(LLMService):
|
||||
if v is not None
|
||||
}
|
||||
|
||||
if self._settings["extra"]:
|
||||
generation_params.update(self._settings["extra"])
|
||||
|
||||
# possibly modify generation_params (in place) to set thinking to off by default
|
||||
self._maybe_unset_thinking_budget(generation_params)
|
||||
|
||||
generation_config = (
|
||||
GenerateContentConfig(**generation_params) if generation_params else None
|
||||
)
|
||||
@@ -703,6 +725,8 @@ class GoogleLLMService(LLMService):
|
||||
prompt_tokens += chunk.usage_metadata.prompt_token_count or 0
|
||||
completion_tokens += chunk.usage_metadata.candidates_token_count or 0
|
||||
total_tokens += chunk.usage_metadata.total_token_count or 0
|
||||
cache_read_input_tokens += chunk.usage_metadata.cached_content_token_count or 0
|
||||
reasoning_tokens += chunk.usage_metadata.thoughts_token_count or 0
|
||||
|
||||
if not chunk.candidates:
|
||||
continue
|
||||
@@ -784,6 +808,8 @@ class GoogleLLMService(LLMService):
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
total_tokens=total_tokens,
|
||||
cache_read_input_tokens=cache_read_input_tokens,
|
||||
reasoning_tokens=reasoning_tokens,
|
||||
)
|
||||
)
|
||||
await self.push_frame(LLMFullResponseEndFrame())
|
||||
|
||||
Reference in New Issue
Block a user