Compare commits

...

8 Commits

Author SHA1 Message Date
vipyne
03dc1b343a add rtvi 2025-11-20 10:13:25 -06:00
vipyne
f0471dccda more examples 2025-11-20 10:13:25 -06:00
vipyne
fd0ef110ff pipecat bot example before trace 2025-11-20 10:13:25 -06:00
Mark Backman
ab58f72322 Merge pull request #3101 from hwuiwon/hw/inworld-talking-speed
feat: Add speaking rate control to Inworld TTS service.
2025-11-20 09:50:55 -05:00
Hwuiwon Kim
ead361f665 fix 2025-11-20 07:45:13 -05:00
Hwuiwon Kim
1cc69d475d feat: Add speaking rate control to Inworld TTS service & fix param cases 2025-11-19 22:57:53 -05:00
Mark Backman
51bdd8b728 Merge pull request #3097 from hwuiwon/fix-typo
Fix typo in STT event handler documentation
2025-11-19 17:10:32 -05:00
Hwuiwon Kim
30ff488714 Fix typo in event handler documentation 2025-11-19 17:04:07 -05:00
9 changed files with 719 additions and 8 deletions

View File

@@ -24,6 +24,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added word-level timestamps support to Hume TTS service
- Added optional speaking rate control to `InworldTTSService`.
### Changed
- ⚠️ Breaking change: `LLMContext.create_image_message()`,
@@ -89,6 +91,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Prevented `HeyGenVideoService` from automatically disconnecting after 5 minutes.
- Fixed `InworldTTSService` audio config payload to use camelCase keys expected
by the Inworld API.
## [0.0.94] - 2025-11-10
### Changed

View File

@@ -0,0 +1,155 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import os
from dotenv import load_dotenv
from loguru import logger
from turn_detector_observer import TurnDetectorObserver
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import LLMRunFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.services.openai.stt import OpenAISTTService
from pipecat.services.openai.tts import OpenAITTSService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
load_dotenv(override=True)
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
transport_params = {
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
}
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
### STT ###
stt = OpenAISTTService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o-transcribe",
prompt="Expect normal helpful conversation.",
)
### LLM ###
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
### TTS ###
tts = OpenAITTSService(
api_key=os.getenv("OPENAI_API_KEY"),
voice="ballad",
params=OpenAITTSService.InputParams(
instructions="Please speak clearly and at a moderate pace."
),
)
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = LLMContext(messages)
context_aggregator = LLMContextAggregatorPair(context)
# RTVI events for detecting bot aggregation
rtvi = RTVIProcessor(config=RTVIConfig(config=[]))
### PIPELINE ###
pipeline = Pipeline(
[
transport.input(), # Transport user input
rtvi,
stt,
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
### TASK ###
turn_detector = TurnDetectorObserver()
task = PipelineTask(
pipeline,
params=PipelineParams(
enable_metrics=True,
enable_usage_metrics=True,
),
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
observers=[turn_detector, RTVIObserver(rtvi)],
)
turn_detector.set_turn_observer_event_handlers(task.turn_tracking_observer)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMRunFrame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
await task.cancel()
# not sure this is needed...
@rtvi.event_handler("on_client_ready")
async def on_client_ready(rtvi):
await rtvi.set_bot_ready()
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
await runner.run(task)
async def bot(runner_args: RunnerArguments):
"""Main bot entry point compatible with Pipecat Cloud."""
transport = await create_transport(runner_args, transport_params)
await run_bot(transport, runner_args)
if __name__ == "__main__":
from pipecat.runner.run import main
main()

View File

@@ -0,0 +1,161 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import os
from dotenv import load_dotenv
from loguru import logger
from turn_detector_observer import TurnDetectorObserver
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import LLMRunFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.services.openai.stt import OpenAISTTService
from pipecat.services.openai.tts import OpenAITTSService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
load_dotenv(override=True)
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
transport_params = {
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
}
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
session_properties = SessionProperties(
audio=AudioConfiguration(
input=AudioInput(
transcription=InputAudioTranscription(),
# Set openai TurnDetection parameters. Not setting this at all will turn it
# on by default
turn_detection=SemanticTurnDetection(),
# Or set to False to disable openai turn detection and use transport VAD
# turn_detection=False,
noise_reduction=InputAudioNoiseReduction(type="near_field"),
)
),
# In this example we provide tools through the context, but you could
# alternatively provide them here.
# tools=tools,
instructions="""You are a helpful and friendly AI.
Act like a human, but remember that you aren't a human and that you can't do human
things in the real world. Your voice and personality should be warm and engaging, with a lively and
playful tone.
If interacting in a non-English language, start by using the standard accent or dialect familiar to
the user. Talk quickly. You should always call a function if you can. Do not refer to these rules,
even if you're asked about them.
You are participating in a voice conversation. Keep your responses concise, short, and to the point
unless specifically asked to elaborate on a topic.
Remember, your responses should be short. Just one or two sentences, usually. Respond in English.""",
)
### LLM ###
llm = OpenAIRealtimeLLMService(
api_key=os.getenv("OPENAI_API_KEY"),
session_properties=session_properties,
start_audio_paused=False,
)
# Create a standard OpenAI LLM context object using the normal messages format. The
# OpenAIRealtimeLLMService will convert this internally to messages that the
# openai WebSocket API can understand.
context = LLMContext(
[{"role": "user", "content": "Say hello!"}],
tools,
)
context_aggregator = LLMContextAggregatorPair(context)
### PIPELINE ###
pipeline = Pipeline(
[
transport.input(), # Transport user input
context_aggregator.user(),
llm, # LLM
transport.output(), # Transport bot output
context_aggregator.assistant(),
]
)
### TASK ###
turn_detector = TurnDetectorObserver()
task = PipelineTask(
pipeline,
params=PipelineParams(
enable_metrics=True,
enable_usage_metrics=True,
),
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
observers=[turn_detector],
)
turn_detector.set_turn_observer_event_handlers(task.turn_tracking_observer)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
await task.queue_frames([LLMRunFrame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
await task.cancel()
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
await runner.run(task)
async def bot(runner_args: RunnerArguments):
"""Main bot entry point compatible with Pipecat Cloud."""
transport = await create_transport(runner_args, transport_params)
await run_bot(transport, runner_args)
if __name__ == "__main__":
from pipecat.runner.run import main
main()

View File

@@ -0,0 +1,188 @@
#
# Copyright (c) 20242025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import os
from dotenv import load_dotenv
from loguru import logger
from turn_detector_observer import TurnDetectorObserver
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import LLMRunFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.services.openai.stt import OpenAISTTService
from pipecat.services.openai.tts import OpenAITTSService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
load_dotenv(override=True)
async def fetch_weather_from_api(params: FunctionCallParams):
await params.result_callback({"conditions": "nice", "temperature": "75"})
async def fetch_restaurant_recommendation(params: FunctionCallParams):
await params.result_callback({"name": "The Golden Dragon"})
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
# instantiated. The function will be called when the desired transport gets
# selected.
transport_params = {
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
}
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
### STT ###
stt = OpenAISTTService(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o-transcribe",
prompt="Expect normal helpful conversation.",
)
### LLM ###
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
### TTS ###
tts = OpenAITTSService(
api_key=os.getenv("OPENAI_API_KEY"),
voice="ballad",
params=OpenAITTSService.InputParams(instructions="Please speak clearly and at a moderate pace."),
)
# You can also register a function_name of None to get all functions
# sent to the same callback with an additional function_name parameter.
llm.register_function("get_current_weather", fetch_weather_from_api)
llm.register_function("get_restaurant_recommendation", fetch_restaurant_recommendation)
@llm.event_handler("on_function_calls_started")
async def on_function_calls_started(service, function_calls):
await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
weather_function = FunctionSchema(
name="get_current_weather",
description="Get the current weather",
properties={
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"format": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"description": "The temperature unit to use. Infer this from the user's location.",
},
},
required=["location", "format"],
)
restaurant_function = FunctionSchema(
name="get_restaurant_recommendation",
description="Get a restaurant recommendation",
properties={
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
},
required=["location"],
)
tools = ToolsSchema(standard_tools=[weather_function, restaurant_function])
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = LLMContext(messages, tools)
context_aggregator = LLMContextAggregatorPair(context)
### PIPELINE ###
pipeline = Pipeline(
[
transport.input(), # Transport user input
stt,
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
### TASK ###
turn_detector = TurnDetectorObserver()
task = PipelineTask(
pipeline,
params=PipelineParams(
enable_metrics=True,
enable_usage_metrics=True,
),
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
observers=[turn_detector],
)
turn_detector.set_turn_observer_event_handlers(task.turn_tracking_observer)
@transport.event_handler("on_client_connected")
async def on_client_connected(transport, client):
logger.info(f"Client connected")
# Kick off the conversation.
await task.queue_frames([LLMRunFrame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Client disconnected")
await task.cancel()
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
await runner.run(task)
async def bot(runner_args: RunnerArguments):
"""Main bot entry point compatible with Pipecat Cloud."""
transport = await create_transport(runner_args, transport_params)
await run_bot(transport, runner_args)
if __name__ == "__main__":
from pipecat.runner.run import main
main()

View File

@@ -0,0 +1,11 @@
```bash
uv sync
uv pip install -e '.[cartesia,daily,elevenlabs,local-smart-turn-v3,openai,runner,webrtc]'
```
```bash
python examples/foundational/trace/001-trace.py
```
- open [http://localhost:7860](http://localhost:7860)
- click `connect` button in top right

View File

@@ -0,0 +1,5 @@
OPENAI_API_KEY=...
ELEVENLABS_API_KEY=...
ELEVENLABS_VOICE_ID=...
CARTESIA_API_KEY=...

View File

@@ -0,0 +1,181 @@
import time
from loguru import logger
from pipecat.frames.frames import (
BotStartedSpeakingFrame,
BotStoppedSpeakingFrame,
EndFrame,
FunctionCallResultFrame,
FunctionCallsStartedFrame,
LLMFullResponseEndFrame,
LLMFullResponseStartFrame,
StartFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
)
from pipecat.observers.base_observer import BaseObserver, FramePushed
from pipecat.pipeline.pipeline import Pipeline
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.openai.base_llm import LLMService
from pipecat.transports.base_output import BaseOutputTransport
class TurnDetectorObserver(BaseObserver):
"""Observer ... of turns."""
def __init__(self):
super().__init__()
self._turn_observer = None
self._arrow = ""
self._turn_number = 1
self._endframe_queued = False
def init(self):
"""
Set ...
"""
pass
def set_turn_observer_event_handlers(self, turn_observer):
self._turn_observer = turn_observer
self.set_turn_observer_event_handlers(self._turn_observer)
def get_turn_observer(self):
return self._turn_observer
def set_turn_observer_event_handlers(self, turn_observer):
"""Sets the Turn Observer event handlers `on_turn_started` and `on_turn_ended`.
Args:
turn_observer: The turn tracking observer of the pipeline task
"""
@turn_observer.event_handler("on_turn_started")
async def on_turn_started(observer, turn_number):
self._turn_number = turn_number
current_time = time.time()
logger.info(f"🔄 Turn {turn_number} started")
# 🫆🫆🫆🫆
# code to start conversation turn here
# 🫆🫆🫆🫆
# 🫆🫆🫆🫆
# 🫆🫆🫆🫆
@turn_observer.event_handler("on_turn_ended")
async def on_turn_ended(observer, turn_number, duration, was_interrupted):
current_time = time.time()
if was_interrupted:
logger.info(f"🔄 Turn {turn_number} interrupted after {duration:.2f}s")
else:
logger.info(f"🏁 Turn {turn_number} completed in {duration:.2f}s")
# 🫆🫆🫆🫆
# code to end conversation turn here
# 🫆🫆🫆🫆
# 🫆🫆🫆🫆
# 🫆🫆🫆🫆
########
# everything past here isn't needed, just nice to have logging
########
async def on_push_frame(self, data: FramePushed):
"""Runs when any frame is pushed through pipeline.
Determines based on what type of frame and where it came from
what metrics to update.
Args:
data: the pushed frame
"""
src = data.source
dst = data.destination
frame = data.frame
direction = data.direction
timestamp = data.timestamp
# Convert timestamp to milliseconds for readability
time_sec = timestamp / 1_000_000
# Convert timestamp to seconds for readability
# time_sec = timestamp / 1_000_000_000
# only log downstream frames
if direction == FrameDirection.UPSTREAM:
return
if isinstance(src, Pipeline) or isinstance(dst, Pipeline):
if isinstance(frame, StartFrame):
self._handle_StartFrame(src, dst, frame, time_sec)
elif isinstance(frame, EndFrame):
self._handle_EndFrame(src, dst, frame, time_sec)
if isinstance(src, BaseOutputTransport):
if isinstance(frame, BotStartedSpeakingFrame):
self._handle_BotStartedSpeakingFrame(src, dst, frame, time_sec)
elif isinstance(frame, BotStoppedSpeakingFrame):
self._handle_BotStoppedSpeakingFrame(src, dst, frame, time_sec)
elif isinstance(frame, UserStartedSpeakingFrame):
self._handle_UserStartedSpeakingFrame(src, dst, frame, time_sec)
elif isinstance(frame, UserStoppedSpeakingFrame):
self._handle_UserStoppedSpeakingFrame(src, dst, frame, time_sec)
if isinstance(src, LLMService):
if isinstance(frame, LLMFullResponseStartFrame):
self._handle_LLMFullResponseStartFrame(src, dst, frame, time_sec)
elif isinstance(frame, LLMFullResponseEndFrame):
self._handle_LLMFullResponseEndFrame(src, dst, frame, time_sec)
elif isinstance(frame, FunctionCallsStartedFrame):
self._handle_FunctionCallsStartedFrame(src, dst, frame, time_sec)
elif isinstance(frame, FunctionCallResultFrame):
self._handle_FunctionCallResultFrame(src, dst, frame, time_sec)
# ------------ FRAME HANDLERS ------------
def _handle_StartFrame(self, src, dst, frame, time_sec):
if isinstance(dst, Pipeline):
logger.info(f"🟢🟢🟢 StartFrame: {src} {self._arrow} {dst} at {time_sec:.2f}s")
def _handle_EndFrame(self, src, dst, frame, time_sec):
if isinstance(dst, Pipeline):
logger.info(f"Queueing 🔴🔴🔴 EndFrame: {src} {self._arrow} {dst} at {time_sec:.2f}s")
self._endframe_queued = True
if isinstance(src, Pipeline):
logger.info(f"🔴🔴🔴 EndFrame: {src} {self._arrow} {dst} at {time_sec:.2f}s")
current_time = time.time()
end_state_info = {
"turn_number": self._turn_number,
}
def _handle_BotStartedSpeakingFrame(self, src, dst, frame, time_sec):
logger.info(f"🤖🟢 BotStartedSpeakingFrame: {src} {self._arrow} {dst} at {time_sec:.2f}s")
def _handle_BotStoppedSpeakingFrame(self, src, dst, frame, time_sec):
logger.info(f"🤖🔴 BotStoppedSpeakingFrame: {src} {self._arrow} {dst} at {time_sec:.2f}s")
def _handle_LLMFullResponseStartFrame(self, src, dst, frame, time_sec):
logger.info(f"🧠🟢 LLMFullResponseStartFrame: {src} {self._arrow} {dst} at {time_sec:.2f}s")
def _handle_LLMFullResponseEndFrame(self, src, dst, frame, time_sec):
logger.info(f"🧠🔴 LLMFullResponseEndFrame: {src} {self._arrow} {dst} at {time_sec:.2f}s")
def _handle_UserStartedSpeakingFrame(self, src, dst, frame, time_sec):
logger.info(f"🙂🟢 UserStartedSpeakingFrame: {src} {self._arrow} {dst} at {time_sec:.2f}s")
def _handle_UserStoppedSpeakingFrame(self, src, dst, frame, time_sec):
logger.info(f"🙂🔴 UserStoppedSpeakingFrame: {src} {self._arrow} {dst} at {time_sec:.2f}s")
def _handle_FunctionCallsStartedFrame(self, src, dst, frame, time_sec):
logger.info(
f"📐🟢 {frame.function_calls[0].function_name} FunctionCallsStartedFrame: {src} {self._arrow} {dst} at {time_sec:.2f}s"
)
def _handle_FunctionCallResultFrame(self, src, dst, frame, time_sec):
logger.info(
f"📐🔴 {frame.function_name} FunctionCallResultFrame: {src} {self._arrow} {dst} at {time_sec:.2f}s"
)

View File

@@ -146,6 +146,8 @@ class InworldTTSService(TTSService):
Parameters:
temperature: Voice temperature control for synthesis variability (e.g., 1.1).
Valid range: [0, 2]. Higher values increase variability.
speaking_rate: Speaking speed control (range: [0.5, 1.5]). Defaults to 1.0 when
unset.
Note:
Language is automatically inferred from the input text by Inworld's TTS models,
@@ -153,6 +155,7 @@ class InworldTTSService(TTSService):
"""
temperature: Optional[float] = None # optional temperature control (range: [0, 2])
speaking_rate: Optional[float] = None # optional speaking rate control (range: [0.5, 1.5])
def __init__(
self,
@@ -198,6 +201,7 @@ class InworldTTSService(TTSService):
- Other formats as supported by Inworld API
params: Optional input parameters for additional configuration. Use this to specify:
- temperature: Voice temperature control for variability (range: [0, 2], e.g., 1.1, optional)
- speaking_rate: Set desired speaking speed (range: [0.5, 1.5], optional)
Language is automatically inferred from input text.
**kwargs: Additional arguments passed to the parent TTSService class.
@@ -228,15 +232,18 @@ class InworldTTSService(TTSService):
self._settings = {
"voiceId": voice_id, # Voice selection from direct parameter
"modelId": model, # TTS model selection from direct parameter
"audio_config": { # Audio format configuration
"audio_encoding": encoding, # Format: LINEAR16, MP3, etc.
"sample_rate_hertz": 0, # Will be set in start() from parent service
"audioConfig": { # Audio format configuration
"audioEncoding": encoding, # Format: LINEAR16, MP3, etc.
"sampleRateHertz": 0, # Will be set in start() from parent service
},
}
# Add optional temperature parameter if provided (valid range: [0, 2])
if params and params.temperature is not None:
self._settings["temperature"] = params.temperature
# Add optional speaking rate if provided (valid range: [0.5, 1.5])
if params and params.speaking_rate is not None:
self._settings["audioConfig"]["speakingRate"] = params.speaking_rate
# Register voice and model with parent service for metrics and tracking
self.set_voice(voice_id) # Used for logging and metrics
@@ -257,7 +264,7 @@ class InworldTTSService(TTSService):
frame: The start frame containing initialization parameters.
"""
await super().start(frame)
self._settings["audio_config"]["sample_rate_hertz"] = self.sample_rate
self._settings["audioConfig"]["sampleRateHertz"] = self.sample_rate
async def stop(self, frame: EndFrame):
"""Stop the Inworld TTS service.
@@ -323,9 +330,7 @@ class InworldTTSService(TTSService):
"text": text, # Text to synthesize
"voiceId": self._settings["voiceId"], # Voice selection (Ashley, Hades, etc.)
"modelId": self._settings["modelId"], # TTS model (inworld-tts-1)
"audio_config": self._settings[
"audio_config"
], # Audio format settings (LINEAR16, 48kHz)
"audioConfig": self._settings["audioConfig"], # Audio format settings (LINEAR16, 48kHz)
}
# Add optional temperature parameter if configured (valid range: [0, 2])

View File

@@ -38,7 +38,7 @@ class STTService(AIService):
Event handlers:
on_connected: Called when connected to the STT service.
on_connected: Called when disconnected from the STT service.
on_disconnected: Called when disconnected from the STT service.
on_connection_error: Called when a connection to the STT service error occurs.
Example::