First version of push-to-talk support

This commit is contained in:
2025-12-16 15:11:55 +08:00
parent 9f05f067a6
commit 1774f550dd
3 changed files with 301 additions and 19 deletions

View File

@@ -31,6 +31,7 @@ from livekit.agents import (
cli,
get_job_context,
metrics,
RoomIO
)
from livekit.agents.llm import ImageContent, ToolError, function_tool
from typing import Any, List, Optional
@@ -953,6 +954,8 @@ async def entrypoint(ctx: JobContext, avatar_dispatcher_url: str = None, vision_
# Increase the maximum number of function calls per turn to avoid hitting the limit
max_tool_steps=15,
)
room_io = RoomIO(session, room=ctx.room)
await room_io.start()
# log metrics as they are emitted, and total usage after session is over
usage_collector = metrics.UsageCollector()
@@ -1011,6 +1014,45 @@ async def entrypoint(ctx: JobContext, avatar_dispatcher_url: str = None, vision_
room_output_options=RoomOutputOptions(transcription_enabled=True),
)
# disable input audio at the start
session.input.set_audio_enabled(False)
@ctx.room.local_participant.register_rpc_method("start_turn")
async def start_turn(data: rtc.RpcInvocationData):
    """RPC handler: begin a push-to-talk turn for the calling participant.

    Interrupts any in-progress agent speech, clears the pending user turn,
    routes room input to the caller, and re-enables audio input.
    """
    try:
        # Stop the agent mid-utterance so the user can speak immediately.
        session.interrupt()
    except RuntimeError as exc:
        logger.error(f"Failed to interrupt session: {exc}")
        # Surface the failure to the client as an RPC error so it can show
        # an "interrupt rejected" state. Code 13 mirrors what the web client
        # matches on; it is this app's convention, not a LiveKit built-in code.
        raise rtc.RpcError(
            code=13,  # ERROR_INTERNAL
            message="Application error in method handler",
        )
    # Drop any partially-captured input left over from a previous turn.
    session.clear_user_turn()
    # listen to the caller if multi-user
    room_io.set_participant(data.caller_identity)
    session.input.set_audio_enabled(True)
@ctx.room.local_participant.register_rpc_method("end_turn")
async def end_turn(data: rtc.RpcInvocationData):
    """RPC handler: finish the push-to-talk turn and hand it to the agent.

    Mutes audio input first, then commits the captured user turn so the
    agent generates its reply.
    """
    session.input.set_audio_enabled(False)
    session.commit_user_turn(
        # How long to wait for the final STT transcript after committing;
        # raise this value if the STT backend is slow to finalize.
        transcript_timeout=10.0,
        # Silence appended to the STT stream to coax out a final transcript.
        # NOTE(review): confirm `stt_flush_duration` is accepted by the
        # installed livekit-agents version.
        stt_flush_duration=2.0,
    )
@ctx.room.local_participant.register_rpc_method("cancel_turn")
async def cancel_turn(data: rtc.RpcInvocationData):
    """RPC handler: abandon the current push-to-talk turn without a reply."""
    session.input.set_audio_enabled(False)
    # Discard whatever was captured; the agent never sees this turn.
    session.clear_user_turn()
    logger.info("cancel turn")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--avatar-url", type=str, default=None, help="Avatar dispatcher URL")

View File

@@ -5,13 +5,14 @@ import {
BarVisualizer,
useConnectionState,
useLocalParticipant,
useParticipantAttributes,
useRoomContext,
useTracks,
useVoiceAssistant,
VideoTrack,
} from "@livekit/components-react";
import { ConnectionState, Track, LocalParticipant, Room } from "livekit-client";
import { useEffect, useMemo, useState, useRef } from "react";
import { useEffect, useMemo, useState, useRef, useCallback } from "react";
import { BatteryIcon, ImageIcon, MicIcon, MicOffIcon, PhoneIcon, PhoneOffIcon, WifiIcon, SwitchCameraIcon, VoiceIcon, CheckIcon } from "./icons";
import { useToast } from "@/components/toast/ToasterProvider";
@@ -43,6 +44,9 @@ export function PhoneSimulator({
const { localParticipant, isMicrophoneEnabled: isMicEnabled } = useLocalParticipant();
const tracks = useTracks();
const voiceAssistant = useVoiceAssistant();
const agentAttributes = useParticipantAttributes({
participant: voiceAssistant.agent,
});
const fileInputRef = useRef<HTMLInputElement>(null);
const phoneContainerRef = useRef<HTMLDivElement>(null);
const visualizerRef = useRef<HTMLDivElement>(null);
@@ -59,6 +63,9 @@ export function PhoneSimulator({
const isAgentSpeaking = voiceAssistant.state === "speaking";
const wasMicEnabledRef = useRef(false);
const lastPhoneMode = useRef(phoneMode);
const [isPushToTalkActive, setIsPushToTalkActive] = useState(false);
const [interruptRejected, setInterruptRejected] = useState(false);
const pushToTalkButtonRef = useRef<HTMLButtonElement>(null);
useEffect(() => {
const voiceAttr = config.settings.attributes?.find(a => a.key === "voice");
@@ -421,6 +428,202 @@ export function PhoneSimulator({
setShowVoiceMenu(!showVoiceMenu);
};
// Check if agent supports push-to-talk (optional check, button will show regardless)
const supportsPushToTalk = useMemo(() => {
  const attrs = agentAttributes.attributes;
  if (!voiceAssistant.agent || !attrs) return false;
  // Agent advertises support via the "push-to-talk" participant attribute.
  return attrs["push-to-talk"] === "1";
}, [voiceAssistant.agent, agentAttributes.attributes]);
const handlePushToTalkStart = async () => {
if (!room || !voiceAssistant.agent || isPushToTalkActive) return;
// Reset interrupt rejection state
setInterruptRejected(false);
try {
await room.localParticipant.performRpc({
destinationIdentity: voiceAssistant.agent.identity,
method: "start_turn",
payload: "",
});
setIsPushToTalkActive(true);
setInterruptRejected(false);
} catch (error: any) {
// Prevent error from propagating to React error boundary
// by handling all expected errors here
setIsPushToTalkActive(false);
const errorMessage = error?.message || "";
const errorCode = error?.code;
// Check for "Method not supported at destination" - this happens when RPC methods aren't registered yet
// This can occur on first call before agent is fully ready, so we silently ignore it
if (errorMessage.includes("Method not supported at destination") ||
errorMessage.includes("method not found") ||
errorCode === 12) { // METHOD_NOT_FOUND
// Silently ignore - the method will be available after first turn
console.log("RPC method not ready yet, will be available after first turn");
return;
}
// Check for "Application error in method handler" - this indicates interrupt failed
// This error is raised when session.interrupt() fails in the agent
// We handle this gracefully by showing "不允许打断" on the button, so we don't log it as an error
if (errorMessage.includes("Application error in method handler") ||
errorMessage.includes("Application error") ||
errorCode === 13 || // ERROR_INTERNAL (RpcErrorCode.ERROR_INTERNAL)
(isAgentSpeaking && errorMessage.includes("interrupt"))) {
// Suppress error logging for expected interrupt failures
// Only log at debug level to avoid error popups
if (process.env.NODE_ENV === 'development') {
console.log("Interrupt rejected (expected behavior):", errorMessage);
}
setInterruptRejected(true);
// Clear the rejection message after 3 seconds
setTimeout(() => setInterruptRejected(false), 3000);
// Explicitly prevent error from propagating
error.preventDefault?.();
error.stopPropagation?.();
return;
}
// Check if agent is speaking and the error suggests interruption was rejected
if (isAgentSpeaking) {
// Check for common rejection indicators
if (errorMessage.includes("reject") ||
errorMessage.includes("not allowed") ||
errorCode === 403 || // Forbidden
errorCode === 409) { // Conflict
// Suppress error logging for expected rejections
if (process.env.NODE_ENV === 'development') {
console.log("Interrupt rejected:", errorMessage);
}
setInterruptRejected(true);
// Clear the rejection message after 3 seconds
setTimeout(() => setInterruptRejected(false), 3000);
return;
}
}
// Only log and show error for unexpected errors
console.error("Unexpected error in push-to-talk:", error);
const defaultErrorMessage = "Agent does not support push-to-talk. Make sure your agent has the push-to-talk RPC methods (start_turn, end_turn, cancel_turn) registered.";
setToastMessage({ message: defaultErrorMessage, type: "error" });
}
};
// Finish the current push-to-talk turn: tell the agent to mute input and
// commit the captured speech so it responds.
const handlePushToTalkEnd = useCallback(async () => {
  const agent = voiceAssistant.agent;
  if (!room || !agent || !isPushToTalkActive) return;
  try {
    await room.localParticipant.performRpc({
      destinationIdentity: agent.identity,
      method: "end_turn",
      payload: "",
    });
  } catch (error: any) {
    // Don't show error toast on end_turn failure as it might be called during cleanup
    console.error("Failed to end turn:", error);
  } finally {
    // Always leave the button idle, success or not.
    setIsPushToTalkActive(false);
    setInterruptRejected(false);
  }
}, [room, voiceAssistant.agent, isPushToTalkActive]);
// Abort the current push-to-talk turn without committing it (blur / Escape).
const handlePushToTalkCancel = useCallback(async () => {
  const agent = voiceAssistant.agent;
  if (!room || !agent || !isPushToTalkActive) return;
  try {
    await room.localParticipant.performRpc({
      destinationIdentity: agent.identity,
      method: "cancel_turn",
      payload: "",
    });
  } catch (error) {
    console.error("Failed to cancel turn:", error);
  } finally {
    // Always leave the button idle, success or not.
    setIsPushToTalkActive(false);
    setInterruptRejected(false);
  }
}, [room, voiceAssistant.agent, isPushToTalkActive]);
// Shared press/release helpers so mouse and touch behave identically:
// preventDefault stops text selection and synthetic mouse events; press
// starts a turn, release ends it.
const pressPushToTalk = (e: React.SyntheticEvent) => {
  e.preventDefault();
  void handlePushToTalkStart();
};
const releasePushToTalk = (e: React.SyntheticEvent) => {
  e.preventDefault();
  void handlePushToTalkEnd();
};
// Handle mouse events for push-to-talk
const handlePushToTalkMouseDown = (e: React.MouseEvent) => pressPushToTalk(e);
const handlePushToTalkMouseUp = (e: React.MouseEvent) => releasePushToTalk(e);
// Handle touch events for push-to-talk
const handlePushToTalkTouchStart = (e: React.TouchEvent) => pressPushToTalk(e);
const handlePushToTalkTouchEnd = (e: React.TouchEvent) => releasePushToTalk(e);
// Handle window blur, escape key, and global mouse/touch events to cancel/end push-to-talk
useEffect(() => {
if (!isPushToTalkActive) return;
const handleBlur = () => {
handlePushToTalkCancel();
};
const handleKeyDown = (e: KeyboardEvent) => {
if (e.key === "Escape") {
handlePushToTalkCancel();
}
};
// Handle global mouseup/touchend to end push-to-talk even if released outside button
const handleGlobalMouseUp = () => {
handlePushToTalkEnd();
};
const handleGlobalTouchEnd = () => {
handlePushToTalkEnd();
};
window.addEventListener("blur", handleBlur);
window.addEventListener("keydown", handleKeyDown);
window.addEventListener("mouseup", handleGlobalMouseUp);
window.addEventListener("touchend", handleGlobalTouchEnd);
return () => {
window.removeEventListener("blur", handleBlur);
window.removeEventListener("keydown", handleKeyDown);
window.removeEventListener("mouseup", handleGlobalMouseUp);
window.removeEventListener("touchend", handleGlobalTouchEnd);
};
}, [isPushToTalkActive, handlePushToTalkCancel, handlePushToTalkEnd]);
// Clean up push-to-talk state on disconnect
useEffect(() => {
  if (roomState !== ConnectionState.Disconnected) return;
  if (!isPushToTalkActive) return;
  setIsPushToTalkActive(false);
  setInterruptRejected(false);
}, [roomState, isPushToTalkActive]);
// Reset interrupt rejection when agent stops speaking
useEffect(() => {
  if (isAgentSpeaking || !interruptRejected) return;
  // Give the UI a beat before clearing the "rejected" badge.
  const timer = setTimeout(() => setInterruptRejected(false), 1000);
  return () => clearTimeout(timer);
}, [isAgentSpeaking, interruptRejected]);
const handleFileChange = (event: React.ChangeEvent<HTMLInputElement>) => {
const file = event.target.files?.[0];
if (file && onCapture) {
@@ -807,6 +1010,32 @@ export function PhoneSimulator({
</div>
) : (
<div className="absolute bottom-[5%] left-0 w-full px-[8%] z-40">
<div className="w-full flex flex-col items-center justify-center gap-4">
{/* Push-to-Talk Button - Centered and Bigger */}
{phoneMode !== "important_message" && phoneMode !== "hand_off" && voiceAssistant.agent && (
<button
ref={pushToTalkButtonRef}
className={`w-24 h-24 rounded-full backdrop-blur-md transition-all flex flex-col items-center justify-center gap-2 ${
interruptRejected
? "bg-red-500/70 text-white"
: isPushToTalkActive
? "bg-green-500 text-white scale-110 shadow-lg shadow-green-500/50"
: "bg-blue-500/70 text-white hover:bg-blue-500/90"
}`}
onMouseDown={handlePushToTalkMouseDown}
onMouseUp={handlePushToTalkMouseUp}
onTouchStart={handlePushToTalkTouchStart}
onTouchEnd={handlePushToTalkTouchEnd}
title={supportsPushToTalk ? "Push to Talk" : "Push to Talk (may not be supported by this agent)"}
>
<MicIcon className="w-8 h-8" />
<span className="text-xs font-medium">
{interruptRejected ? "不允许打断" : "按住说话"}
</span>
</button>
)}
{/* Other Controls */}
<div className="w-full flex items-center justify-center gap-8">
{phoneMode !== "important_message" && phoneMode !== "hand_off" && (
<button
@@ -833,6 +1062,7 @@ export function PhoneSimulator({
</button>
</div>
</div>
</div>
)
)}
</div>

View File

@@ -108,6 +108,16 @@ export function HomeInner() {
token={token}
connect={shouldConnect}
onError={(e) => {
  // Expected errors from push-to-talk interrupt failures are handled
  // gracefully inside PhoneSimulator, so keep them out of the error toast.
  const msg = e.message ?? "";
  const isExpectedPushToTalkError =
    msg.includes("Application error in method handler") ||
    msg.includes("Method not supported at destination");
  if (isExpectedPushToTalkError) {
    if (process.env.NODE_ENV === 'development') {
      console.log("Filtered expected error:", e.message);
    }
    return;
  }
  setToastMessage({ message: e.message, type: "error" });
  console.error(e);
}}