First version of push-to-talk support.

This commit is contained in:
2025-12-16 15:11:55 +08:00
parent 9f05f067a6
commit 1774f550dd
3 changed files with 301 additions and 19 deletions

View File

@@ -31,6 +31,7 @@ from livekit.agents import (
cli, cli,
get_job_context, get_job_context,
metrics, metrics,
RoomIO
) )
from livekit.agents.llm import ImageContent, ToolError, function_tool from livekit.agents.llm import ImageContent, ToolError, function_tool
from typing import Any, List, Optional from typing import Any, List, Optional
@@ -953,6 +954,8 @@ async def entrypoint(ctx: JobContext, avatar_dispatcher_url: str = None, vision_
# Increase the maximum number of function calls per turn to avoid hitting the limit # Increase the maximum number of function calls per turn to avoid hitting the limit
max_tool_steps=15, max_tool_steps=15,
) )
# Wire the session's audio I/O to the room via RoomIO so the agent can
# later gate microphone input and switch the active participant per
# push-to-talk turn (see the start_turn/end_turn RPC handlers).
room_io = RoomIO(session, room=ctx.room)
await room_io.start()
# log metrics as they are emitted, and total usage after session is over # log metrics as they are emitted, and total usage after session is over
usage_collector = metrics.UsageCollector() usage_collector = metrics.UsageCollector()
@@ -1011,6 +1014,45 @@ async def entrypoint(ctx: JobContext, avatar_dispatcher_url: str = None, vision_
room_output_options=RoomOutputOptions(transcription_enabled=True), room_output_options=RoomOutputOptions(transcription_enabled=True),
) )
# Start with the microphone muted: audio flows only while the client is
# actively holding the push-to-talk button.
session.input.set_audio_enabled(False)

@ctx.room.local_participant.register_rpc_method("start_turn")
async def start_turn(data: rtc.RpcInvocationData):
    # Begin a user turn: stop any in-progress agent speech, discard the
    # pending user turn, then open the microphone for the caller.
    try:
        session.interrupt()
    except RuntimeError as e:
        logger.error(f"Failed to interrupt session: {e}")
        # Surface the failure as an RPC error so the client can show an
        # "interrupt not allowed" state instead of silently proceeding.
        raise rtc.RpcError(
            code=13,  # ERROR_INTERNAL
            message="Application error in method handler"
        )
    session.clear_user_turn()
    # In multi-user rooms, only listen to the participant who pressed talk.
    room_io.set_participant(data.caller_identity)
    session.input.set_audio_enabled(True)
@ctx.room.local_participant.register_rpc_method("end_turn")
async def end_turn(data: rtc.RpcInvocationData):
    # Button released: close the microphone and commit whatever the user
    # said as a completed turn so the agent can respond.
    session.input.set_audio_enabled(False)
    session.commit_user_turn(
        # How long to wait for the final STT transcript after committing;
        # raise this if the STT backend is slow to finalize.
        transcript_timeout=10.0,
        # Silence appended to the audio stream to coax the STT engine
        # into emitting its final transcript.
        stt_flush_duration=2.0,
    )
@ctx.room.local_participant.register_rpc_method("cancel_turn")
async def cancel_turn(data: rtc.RpcInvocationData):
    # Abort the turn entirely: mute input and drop any captured speech
    # without generating a response.
    session.input.set_audio_enabled(False)
    session.clear_user_turn()
    logger.info("cancel turn")
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--avatar-url", type=str, default=None, help="Avatar dispatcher URL") parser.add_argument("--avatar-url", type=str, default=None, help="Avatar dispatcher URL")

View File

@@ -5,13 +5,14 @@ import {
BarVisualizer, BarVisualizer,
useConnectionState, useConnectionState,
useLocalParticipant, useLocalParticipant,
useParticipantAttributes,
useRoomContext, useRoomContext,
useTracks, useTracks,
useVoiceAssistant, useVoiceAssistant,
VideoTrack, VideoTrack,
} from "@livekit/components-react"; } from "@livekit/components-react";
import { ConnectionState, Track, LocalParticipant, Room } from "livekit-client"; import { ConnectionState, Track, LocalParticipant, Room } from "livekit-client";
import { useEffect, useMemo, useState, useRef } from "react"; import { useEffect, useMemo, useState, useRef, useCallback } from "react";
import { BatteryIcon, ImageIcon, MicIcon, MicOffIcon, PhoneIcon, PhoneOffIcon, WifiIcon, SwitchCameraIcon, VoiceIcon, CheckIcon } from "./icons"; import { BatteryIcon, ImageIcon, MicIcon, MicOffIcon, PhoneIcon, PhoneOffIcon, WifiIcon, SwitchCameraIcon, VoiceIcon, CheckIcon } from "./icons";
import { useToast } from "@/components/toast/ToasterProvider"; import { useToast } from "@/components/toast/ToasterProvider";
@@ -43,6 +44,9 @@ export function PhoneSimulator({
const { localParticipant, isMicrophoneEnabled: isMicEnabled } = useLocalParticipant(); const { localParticipant, isMicrophoneEnabled: isMicEnabled } = useLocalParticipant();
const tracks = useTracks(); const tracks = useTracks();
const voiceAssistant = useVoiceAssistant(); const voiceAssistant = useVoiceAssistant();
const agentAttributes = useParticipantAttributes({
participant: voiceAssistant.agent,
});
const fileInputRef = useRef<HTMLInputElement>(null); const fileInputRef = useRef<HTMLInputElement>(null);
const phoneContainerRef = useRef<HTMLDivElement>(null); const phoneContainerRef = useRef<HTMLDivElement>(null);
const visualizerRef = useRef<HTMLDivElement>(null); const visualizerRef = useRef<HTMLDivElement>(null);
@@ -59,6 +63,9 @@ export function PhoneSimulator({
const isAgentSpeaking = voiceAssistant.state === "speaking"; const isAgentSpeaking = voiceAssistant.state === "speaking";
const wasMicEnabledRef = useRef(false); const wasMicEnabledRef = useRef(false);
const lastPhoneMode = useRef(phoneMode); const lastPhoneMode = useRef(phoneMode);
const [isPushToTalkActive, setIsPushToTalkActive] = useState(false);
const [interruptRejected, setInterruptRejected] = useState(false);
const pushToTalkButtonRef = useRef<HTMLButtonElement>(null);
useEffect(() => { useEffect(() => {
const voiceAttr = config.settings.attributes?.find(a => a.key === "voice"); const voiceAttr = config.settings.attributes?.find(a => a.key === "voice");
@@ -421,6 +428,202 @@ export function PhoneSimulator({
setShowVoiceMenu(!showVoiceMenu); setShowVoiceMenu(!showVoiceMenu);
}; };
// Whether the connected agent advertises push-to-talk via its participant
// attributes. The button renders regardless; this only affects the tooltip.
const supportsPushToTalk = useMemo(() => {
  const attrs = agentAttributes.attributes;
  if (!voiceAssistant.agent || !attrs) return false;
  return attrs["push-to-talk"] === "1";
}, [voiceAssistant.agent, agentAttributes.attributes]);
// Ask the agent to start a user turn (push-to-talk pressed).
// Two error classes are expected and handled locally rather than surfaced:
//   1. the agent has not registered its RPC methods yet, and
//   2. the agent refused to be interrupted mid-speech.
// Only genuinely unexpected failures produce an error toast.
// Fix vs. previous version: removed `error.preventDefault?.()` /
// `error.stopPropagation?.()` — those are DOM Event methods; errors
// thrown by performRpc never carry them, so the calls were dead code
// (the early `return` alone stops propagation here).
const handlePushToTalkStart = async () => {
  if (!room || !voiceAssistant.agent || isPushToTalkActive) return;
  // Reset any stale rejection indicator before attempting a new turn.
  setInterruptRejected(false);
  try {
    await room.localParticipant.performRpc({
      destinationIdentity: voiceAssistant.agent.identity,
      method: "start_turn",
      payload: "",
    });
    setIsPushToTalkActive(true);
    setInterruptRejected(false);
  } catch (error: any) {
    setIsPushToTalkActive(false);
    const errorMessage = error?.message || "";
    const errorCode = error?.code;
    // "Method not supported at destination" occurs when the agent has not
    // registered start_turn yet (e.g. first call before it is ready).
    if (
      errorMessage.includes("Method not supported at destination") ||
      errorMessage.includes("method not found") ||
      errorCode === 12 // METHOD_NOT_FOUND
    ) {
      console.log("RPC method not ready yet, will be available after first turn");
      return;
    }
    // "Application error in method handler" is raised by the agent when
    // session.interrupt() fails — i.e. the current speech may not be
    // interrupted. Show the rejection state on the button instead of an
    // error popup.
    if (
      errorMessage.includes("Application error in method handler") ||
      errorMessage.includes("Application error") ||
      errorCode === 13 || // ERROR_INTERNAL (RpcErrorCode.ERROR_INTERNAL)
      (isAgentSpeaking && errorMessage.includes("interrupt"))
    ) {
      if (process.env.NODE_ENV === 'development') {
        console.log("Interrupt rejected (expected behavior):", errorMessage);
      }
      setInterruptRejected(true);
      // Auto-clear the "rejected" hint after 3 seconds.
      setTimeout(() => setInterruptRejected(false), 3000);
      return;
    }
    // While the agent is speaking, treat common rejection codes/messages
    // as an expected "no interrupting" response as well.
    if (isAgentSpeaking) {
      if (
        errorMessage.includes("reject") ||
        errorMessage.includes("not allowed") ||
        errorCode === 403 || // Forbidden
        errorCode === 409 // Conflict
      ) {
        if (process.env.NODE_ENV === 'development') {
          console.log("Interrupt rejected:", errorMessage);
        }
        setInterruptRejected(true);
        setTimeout(() => setInterruptRejected(false), 3000);
        return;
      }
    }
    // Anything else is unexpected: log it and tell the user the agent
    // likely lacks the push-to-talk RPC methods.
    console.error("Unexpected error in push-to-talk:", error);
    const defaultErrorMessage = "Agent does not support push-to-talk. Make sure your agent has the push-to-talk RPC methods (start_turn, end_turn, cancel_turn) registered.";
    setToastMessage({ message: defaultErrorMessage, type: "error" });
  }
};
// Button released: tell the agent to mute input and commit the captured
// user turn. Failures are only logged (no toast) because this may fire
// during cleanup.
const handlePushToTalkEnd = useCallback(async () => {
  if (!room || !voiceAssistant.agent || !isPushToTalkActive) return;
  try {
    await room.localParticipant.performRpc({
      destinationIdentity: voiceAssistant.agent.identity,
      method: "end_turn",
      payload: "",
    });
  } catch (error: any) {
    console.error("Failed to end turn:", error);
  }
  // Reset local UI state whether or not the RPC succeeded.
  setIsPushToTalkActive(false);
  setInterruptRejected(false);
}, [room, voiceAssistant.agent, isPushToTalkActive]);
// Abort the current turn (Escape key, window blur): mute input on the
// agent side and discard whatever was captured so far.
const handlePushToTalkCancel = useCallback(async () => {
  if (!room || !voiceAssistant.agent || !isPushToTalkActive) return;
  try {
    await room.localParticipant.performRpc({
      destinationIdentity: voiceAssistant.agent.identity,
      method: "cancel_turn",
      payload: "",
    });
  } catch (error) {
    console.error("Failed to cancel turn:", error);
  }
  // Reset local UI state whether or not the RPC succeeded.
  setIsPushToTalkActive(false);
  setInterruptRejected(false);
}, [room, voiceAssistant.agent, isPushToTalkActive]);
// Pointer bindings for the push-to-talk button: press starts a turn,
// release ends it. preventDefault stops the browser from synthesizing
// duplicate mouse events from touches and from starting drag/selection
// while the button is held.
const handlePushToTalkMouseDown = (e: React.MouseEvent) => {
e.preventDefault();
handlePushToTalkStart();
};
const handlePushToTalkMouseUp = (e: React.MouseEvent) => {
e.preventDefault();
handlePushToTalkEnd();
};
// Touch equivalents of the two mouse handlers.
const handlePushToTalkTouchStart = (e: React.TouchEvent) => {
e.preventDefault();
handlePushToTalkStart();
};
const handlePushToTalkTouchEnd = (e: React.TouchEvent) => {
e.preventDefault();
handlePushToTalkEnd();
};
// While a turn is active, guarantee it always terminates: releasing the
// pointer anywhere on the page ends the turn, while Escape or losing
// window focus cancels it outright.
useEffect(() => {
  if (!isPushToTalkActive) return;
  const cancelOnBlur = () => {
    handlePushToTalkCancel();
  };
  const cancelOnEscape = (e: KeyboardEvent) => {
    if (e.key === "Escape") {
      handlePushToTalkCancel();
    }
  };
  // One shared handler for both release events — ends the turn even if
  // the pointer was released outside the button.
  const endOnRelease = () => {
    handlePushToTalkEnd();
  };
  window.addEventListener("blur", cancelOnBlur);
  window.addEventListener("keydown", cancelOnEscape);
  window.addEventListener("mouseup", endOnRelease);
  window.addEventListener("touchend", endOnRelease);
  return () => {
    window.removeEventListener("blur", cancelOnBlur);
    window.removeEventListener("keydown", cancelOnEscape);
    window.removeEventListener("mouseup", endOnRelease);
    window.removeEventListener("touchend", endOnRelease);
  };
}, [isPushToTalkActive, handlePushToTalkCancel, handlePushToTalkEnd]);
// Drop any in-flight push-to-talk state when the room disconnects.
useEffect(() => {
  if (roomState !== ConnectionState.Disconnected) return;
  if (!isPushToTalkActive) return;
  setIsPushToTalkActive(false);
  setInterruptRejected(false);
}, [roomState, isPushToTalkActive]);
// Once the agent finishes speaking, fade out the "interrupt rejected"
// hint after a short grace period.
useEffect(() => {
  if (isAgentSpeaking || !interruptRejected) return;
  const timer = setTimeout(() => setInterruptRejected(false), 1000);
  return () => clearTimeout(timer);
}, [isAgentSpeaking, interruptRejected]);
const handleFileChange = (event: React.ChangeEvent<HTMLInputElement>) => { const handleFileChange = (event: React.ChangeEvent<HTMLInputElement>) => {
const file = event.target.files?.[0]; const file = event.target.files?.[0];
if (file && onCapture) { if (file && onCapture) {
@@ -807,30 +1010,57 @@ export function PhoneSimulator({
</div> </div>
) : ( ) : (
<div className="absolute bottom-[5%] left-0 w-full px-[8%] z-40"> <div className="absolute bottom-[5%] left-0 w-full px-[8%] z-40">
<div className="w-full flex items-center justify-center gap-8"> <div className="w-full flex flex-col items-center justify-center gap-4">
{phoneMode !== "important_message" && phoneMode !== "hand_off" && ( {/* Push-to-Talk Button - Centered and Bigger */}
{phoneMode !== "important_message" && phoneMode !== "hand_off" && voiceAssistant.agent && (
<button <button
className={`p-4 rounded-full backdrop-blur-md transition-colors ${ ref={pushToTalkButtonRef}
!isMicEnabled className={`w-24 h-24 rounded-full backdrop-blur-md transition-all flex flex-col items-center justify-center gap-2 ${
? "bg-white text-black" interruptRejected
: "bg-gray-600/50 text-white hover:bg-gray-600/70" ? "bg-red-500/70 text-white"
: isPushToTalkActive
? "bg-green-500 text-white scale-110 shadow-lg shadow-green-500/50"
: "bg-blue-500/70 text-white hover:bg-blue-500/90"
}`} }`}
onClick={handleMicToggle} onMouseDown={handlePushToTalkMouseDown}
onMouseUp={handlePushToTalkMouseUp}
onTouchStart={handlePushToTalkTouchStart}
onTouchEnd={handlePushToTalkTouchEnd}
title={supportsPushToTalk ? "Push to Talk" : "Push to Talk (may not be supported by this agent)"}
> >
{isMicEnabled ? ( <MicIcon className="w-8 h-8" />
<MicIcon className="w-6 h-6" /> <span className="text-xs font-medium">
) : ( {interruptRejected ? "不允许打断" : "按住说话"}
<MicOffIcon className="w-6 h-6" /> </span>
)}
</button> </button>
)} )}
<button {/* Other Controls */}
className="p-4 rounded-full bg-red-500 text-white hover:bg-red-600 transition-colors" <div className="w-full flex items-center justify-center gap-8">
onClick={handleDisconnect} {phoneMode !== "important_message" && phoneMode !== "hand_off" && (
> <button
<PhoneOffIcon className="w-6 h-6" /> className={`p-4 rounded-full backdrop-blur-md transition-colors ${
</button> !isMicEnabled
? "bg-white text-black"
: "bg-gray-600/50 text-white hover:bg-gray-600/70"
}`}
onClick={handleMicToggle}
>
{isMicEnabled ? (
<MicIcon className="w-6 h-6" />
) : (
<MicOffIcon className="w-6 h-6" />
)}
</button>
)}
<button
className="p-4 rounded-full bg-red-500 text-white hover:bg-red-600 transition-colors"
onClick={handleDisconnect}
>
<PhoneOffIcon className="w-6 h-6" />
</button>
</div>
</div> </div>
</div> </div>
) )

View File

@@ -108,6 +108,16 @@ export function HomeInner() {
token={token} token={token}
connect={shouldConnect} connect={shouldConnect}
onError={(e) => { onError={(e) => {
// Filter out expected errors from push-to-talk interrupt failures
// These are handled gracefully in the PhoneSimulator component
if (e.message?.includes("Application error in method handler") ||
e.message?.includes("Method not supported at destination")) {
// Silently ignore - these are expected and handled in PhoneSimulator
if (process.env.NODE_ENV === 'development') {
console.log("Filtered expected error:", e.message);
}
return;
}
setToastMessage({ message: e.message, type: "error" }); setToastMessage({ message: e.message, type: "error" });
console.error(e); console.error(e);
}} }}