first version push to talk
This commit is contained in:
@@ -31,6 +31,7 @@ from livekit.agents import (
|
||||
cli,
|
||||
get_job_context,
|
||||
metrics,
|
||||
RoomIO
|
||||
)
|
||||
from livekit.agents.llm import ImageContent, ToolError, function_tool
|
||||
from typing import Any, List, Optional
|
||||
@@ -953,6 +954,8 @@ async def entrypoint(ctx: JobContext, avatar_dispatcher_url: str = None, vision_
|
||||
# Increase the maximum number of function calls per turn to avoid hitting the limit
|
||||
max_tool_steps=15,
|
||||
)
|
||||
room_io = RoomIO(session, room=ctx.room)
|
||||
await room_io.start()
|
||||
|
||||
# log metrics as they are emitted, and total usage after session is over
|
||||
usage_collector = metrics.UsageCollector()
|
||||
@@ -1011,6 +1014,45 @@ async def entrypoint(ctx: JobContext, avatar_dispatcher_url: str = None, vision_
|
||||
room_output_options=RoomOutputOptions(transcription_enabled=True),
|
||||
)
|
||||
|
||||
# disable input audio at the start
|
||||
session.input.set_audio_enabled(False)
|
||||
|
||||
@ctx.room.local_participant.register_rpc_method("start_turn")
async def start_turn(data: rtc.RpcInvocationData):
    """RPC handler that begins a push-to-talk turn for the calling participant.

    Interrupts the session, clears the pending user turn, points the room
    input at the caller and then enables input audio.

    Args:
        data: RPC invocation data; only ``caller_identity`` is read.

    Raises:
        rtc.RpcError: with code 13 when ``session.interrupt()`` raises
            ``RuntimeError``.
    """
    try:
        session.interrupt()
    except RuntimeError as e:
        # logger.exception keeps the traceback next to the message.
        logger.exception("Failed to interrupt session: %s", e)
        # Raise an RPC error so the client can detect the interrupt failure.
        # The client-side handler matches on this numeric code and on this
        # exact message text, so both must stay stable.
        raise rtc.RpcError(
            code=13,  # ERROR_INTERNAL
            message="Application error in method handler",
        ) from e  # chain the cause instead of discarding it

    session.clear_user_turn()

    # listen to the caller if multi-user
    room_io.set_participant(data.caller_identity)
    session.input.set_audio_enabled(True)
|
||||
|
||||
@ctx.room.local_participant.register_rpc_method("end_turn")
async def end_turn(data: rtc.RpcInvocationData):
    """RPC handler that closes the push-to-talk turn and commits it.

    Input audio is disabled first, then the user turn is committed with the
    timing options below.
    """
    session.input.set_audio_enabled(False)

    commit_options = {
        # How long to wait for the final transcript after committing the
        # user turn; raise this if the STT is slow to respond.
        "transcript_timeout": 10.0,
        # Length of the silence appended to the STT input so that it emits
        # the final transcript.
        "stt_flush_duration": 2.0,
    }
    session.commit_user_turn(**commit_options)
|
||||
|
||||
@ctx.room.local_participant.register_rpc_method("cancel_turn")
async def cancel_turn(data: rtc.RpcInvocationData):
    """RPC handler that abandons the current push-to-talk turn.

    Disables input audio and clears the user turn without committing it.
    """
    audio_input = session.input
    audio_input.set_audio_enabled(False)

    # Clear the turn without committing it.
    session.clear_user_turn()
    logger.info("cancel turn")
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--avatar-url", type=str, default=None, help="Avatar dispatcher URL")
|
||||
|
||||
@@ -5,13 +5,14 @@ import {
|
||||
BarVisualizer,
|
||||
useConnectionState,
|
||||
useLocalParticipant,
|
||||
useParticipantAttributes,
|
||||
useRoomContext,
|
||||
useTracks,
|
||||
useVoiceAssistant,
|
||||
VideoTrack,
|
||||
} from "@livekit/components-react";
|
||||
import { ConnectionState, Track, LocalParticipant, Room } from "livekit-client";
|
||||
import { useEffect, useMemo, useState, useRef } from "react";
|
||||
import { useEffect, useMemo, useState, useRef, useCallback } from "react";
|
||||
import { BatteryIcon, ImageIcon, MicIcon, MicOffIcon, PhoneIcon, PhoneOffIcon, WifiIcon, SwitchCameraIcon, VoiceIcon, CheckIcon } from "./icons";
|
||||
import { useToast } from "@/components/toast/ToasterProvider";
|
||||
|
||||
@@ -43,6 +44,9 @@ export function PhoneSimulator({
|
||||
const { localParticipant, isMicrophoneEnabled: isMicEnabled } = useLocalParticipant();
|
||||
const tracks = useTracks();
|
||||
const voiceAssistant = useVoiceAssistant();
|
||||
const agentAttributes = useParticipantAttributes({
|
||||
participant: voiceAssistant.agent,
|
||||
});
|
||||
const fileInputRef = useRef<HTMLInputElement>(null);
|
||||
const phoneContainerRef = useRef<HTMLDivElement>(null);
|
||||
const visualizerRef = useRef<HTMLDivElement>(null);
|
||||
@@ -59,6 +63,9 @@ export function PhoneSimulator({
|
||||
const isAgentSpeaking = voiceAssistant.state === "speaking";
|
||||
const wasMicEnabledRef = useRef(false);
|
||||
const lastPhoneMode = useRef(phoneMode);
|
||||
const [isPushToTalkActive, setIsPushToTalkActive] = useState(false);
|
||||
const [interruptRejected, setInterruptRejected] = useState(false);
|
||||
const pushToTalkButtonRef = useRef<HTMLButtonElement>(null);
|
||||
|
||||
useEffect(() => {
|
||||
const voiceAttr = config.settings.attributes?.find(a => a.key === "voice");
|
||||
@@ -421,6 +428,202 @@ export function PhoneSimulator({
|
||||
setShowVoiceMenu(!showVoiceMenu);
|
||||
};
|
||||
|
||||
// Check if agent supports push-to-talk (optional check, button will show regardless)
|
||||
const supportsPushToTalk = useMemo(() => {
  // True only when an agent is present and its "push-to-talk" attribute is "1".
  const agentPresent = Boolean(voiceAssistant.agent);
  const pushToTalkFlag = agentAttributes.attributes?.["push-to-talk"];
  return agentPresent && pushToTalkFlag === "1";
}, [voiceAssistant.agent, agentAttributes.attributes]);
|
||||
|
||||
/**
 * Starts a push-to-talk turn by invoking the agent's "start_turn" RPC.
 *
 * On success the push-to-talk state becomes active. Failures are sorted into
 * three groups:
 *  - the RPC method is not registered yet: logged and ignored;
 *  - the agent refused the interrupt: the rejection label is shown for 3s;
 *  - anything else: logged as an error and surfaced through a toast.
 *
 * Does nothing when there is no room, no agent, or a turn is already active.
 */
const handlePushToTalkStart = async () => {
  if (!room || !voiceAssistant.agent || isPushToTalkActive) return;

  // Reset interrupt rejection state
  setInterruptRejected(false);

  // Shows the rejection label and clears it again after 3 seconds.
  // Logging is limited to development builds to avoid noise for an
  // expected outcome.
  const flagInterruptRejected = (logLabel: string, detail: string) => {
    if (process.env.NODE_ENV === 'development') {
      console.log(logLabel, detail);
    }
    setInterruptRejected(true);
    setTimeout(() => setInterruptRejected(false), 3000);
  };

  try {
    await room.localParticipant.performRpc({
      destinationIdentity: voiceAssistant.agent.identity,
      method: "start_turn",
      payload: "",
    });
    setIsPushToTalkActive(true);
    setInterruptRejected(false);
  } catch (error: any) {
    // All expected errors are handled here so they never reach the React
    // error boundary.
    setIsPushToTalkActive(false);

    // Guard against a non-string message so the substring checks cannot throw.
    const errorMessage: string =
      typeof error?.message === "string" ? error.message : "";
    const errorCode = error?.code;

    // The RPC method is not registered at the destination. This can occur on
    // the first call before the agent is fully ready, so it is ignored.
    const methodMissing =
      errorMessage.includes("Method not supported at destination") ||
      errorMessage.includes("method not found") ||
      errorCode === 12 || // METHOD_NOT_FOUND (kept from the original check)
      errorCode === 1400; // livekit-client RpcError.ErrorCode.UNSUPPORTED_METHOD
    if (methodMissing) {
      console.log("RPC method not ready yet, will be available after first turn");
      return;
    }

    // The agent's start_turn handler failed to interrupt the session.
    // "Application error" also matches "Application error in method handler".
    const interruptFailed =
      errorMessage.includes("Application error") ||
      errorCode === 13 || // code raised by the agent's start_turn handler
      errorCode === 1500 || // livekit-client RpcError.ErrorCode.APPLICATION_ERROR
      (isAgentSpeaking && errorMessage.includes("interrupt"));
    if (interruptFailed) {
      flagInterruptRejected("Interrupt rejected (expected behavior):", errorMessage);
      return;
    }

    // While the agent is speaking, generic rejection indicators are also
    // treated as a refused interrupt.
    const rejectedWhileSpeaking =
      isAgentSpeaking &&
      (errorMessage.includes("reject") ||
        errorMessage.includes("not allowed") ||
        errorCode === 403 || // Forbidden
        errorCode === 409); // Conflict
    if (rejectedWhileSpeaking) {
      flagInterruptRejected("Interrupt rejected:", errorMessage);
      return;
    }

    // Only log and show an error for unexpected failures.
    console.error("Unexpected error in push-to-talk:", error);
    const defaultErrorMessage = "Agent does not support push-to-talk. Make sure your agent has the push-to-talk RPC methods (start_turn, end_turn, cancel_turn) registered.";
    setToastMessage({ message: defaultErrorMessage, type: "error" });
  }
};
|
||||
|
||||
// Tracks an "end_turn" RPC that is still awaiting its response. The button's
// own mouseup/touchend handler and the window-level listener both call
// handlePushToTalkEnd for the same release, before isPushToTalkActive has
// been updated; without this guard the RPC would be sent twice.
const endTurnInFlightRef = useRef(false);

/**
 * Ends the active push-to-talk turn by invoking the agent's "end_turn" RPC.
 *
 * Does nothing when there is no room, no agent, no active turn, or when an
 * earlier end request is still in flight. The push-to-talk and rejection
 * state are reset whether or not the RPC succeeds.
 */
const handlePushToTalkEnd = useCallback(async () => {
  if (!room || !voiceAssistant.agent || !isPushToTalkActive) return;
  if (endTurnInFlightRef.current) return;
  endTurnInFlightRef.current = true;

  try {
    await room.localParticipant.performRpc({
      destinationIdentity: voiceAssistant.agent.identity,
      method: "end_turn",
      payload: "",
    });
  } catch (error: any) {
    // No toast here: end_turn may fail while the component is cleaning up.
    console.error("Failed to end turn:", error);
  } finally {
    setIsPushToTalkActive(false);
    setInterruptRejected(false);
    endTurnInFlightRef.current = false;
  }
}, [room, voiceAssistant.agent, isPushToTalkActive]);
|
||||
|
||||
/**
 * Cancels the active push-to-talk turn through the agent's "cancel_turn" RPC.
 * The local push-to-talk and rejection state are reset on success and on
 * failure alike.
 */
const handlePushToTalkCancel = useCallback(async () => {
  const agent = voiceAssistant.agent;
  if (!room || !agent || !isPushToTalkActive) return;

  try {
    await room.localParticipant.performRpc({
      destinationIdentity: agent.identity,
      method: "cancel_turn",
      payload: "",
    });
  } catch (error) {
    console.error("Failed to cancel turn:", error);
  } finally {
    // Both outcomes leave the UI in the idle state.
    setIsPushToTalkActive(false);
    setInterruptRejected(false);
  }
}, [room, voiceAssistant.agent, isPushToTalkActive]);
|
||||
|
||||
// Handle mouse events for push-to-talk
|
||||
// Mouse press on the push-to-talk button: suppress the default action and
// start a turn.
const handlePushToTalkMouseDown = (event: React.MouseEvent) => {
  event.preventDefault();
  void handlePushToTalkStart();
};
|
||||
|
||||
// Mouse release on the push-to-talk button: suppress the default action and
// end the turn.
const handlePushToTalkMouseUp = (event: React.MouseEvent) => {
  event.preventDefault();
  void handlePushToTalkEnd();
};
|
||||
|
||||
// Handle touch events for push-to-talk
|
||||
// Touch press on the push-to-talk button: suppress the default action and
// start a turn.
const handlePushToTalkTouchStart = (event: React.TouchEvent) => {
  event.preventDefault();
  void handlePushToTalkStart();
};
|
||||
|
||||
// Touch release on the push-to-talk button: suppress the default action and
// end the turn.
const handlePushToTalkTouchEnd = (event: React.TouchEvent) => {
  event.preventDefault();
  void handlePushToTalkEnd();
};
|
||||
|
||||
// Handle window blur, escape key, and global mouse/touch events to cancel/end push-to-talk
|
||||
useEffect(() => {
  // Listeners are only needed while a turn is active.
  if (!isPushToTalkActive) return;

  // Losing window focus cancels the turn.
  const cancelTurn = () => {
    handlePushToTalkCancel();
  };

  // Escape cancels the turn; other keys are ignored.
  const cancelOnEscape = (event: KeyboardEvent) => {
    if (event.key !== "Escape") return;
    handlePushToTalkCancel();
  };

  // Releasing the mouse or finger anywhere ends the turn, even outside the
  // button.
  const endTurn = () => {
    handlePushToTalkEnd();
  };

  window.addEventListener("blur", cancelTurn);
  window.addEventListener("keydown", cancelOnEscape);
  window.addEventListener("mouseup", endTurn);
  window.addEventListener("touchend", endTurn);

  return () => {
    window.removeEventListener("blur", cancelTurn);
    window.removeEventListener("keydown", cancelOnEscape);
    window.removeEventListener("mouseup", endTurn);
    window.removeEventListener("touchend", endTurn);
  };
}, [isPushToTalkActive, handlePushToTalkCancel, handlePushToTalkEnd]);
|
||||
|
||||
// Clean up push-to-talk state on disconnect
|
||||
useEffect(() => {
  // Nothing to reset unless a turn was active when the room disconnected.
  const disconnected = roomState === ConnectionState.Disconnected;
  if (!disconnected || !isPushToTalkActive) return;

  setIsPushToTalkActive(false);
  setInterruptRejected(false);
}, [roomState, isPushToTalkActive]);
|
||||
|
||||
// Reset interrupt rejection when agent stops speaking
|
||||
useEffect(() => {
  // Only act once the agent is silent and a rejection is still showing.
  if (isAgentSpeaking || !interruptRejected) return;

  // Clear the rejection one second after the agent finishes speaking; the
  // cleanup cancels the pending timer if the inputs change first.
  const clearTimer = setTimeout(() => {
    setInterruptRejected(false);
  }, 1000);
  return () => {
    clearTimeout(clearTimer);
  };
}, [isAgentSpeaking, interruptRejected]);
|
||||
|
||||
const handleFileChange = (event: React.ChangeEvent<HTMLInputElement>) => {
|
||||
const file = event.target.files?.[0];
|
||||
if (file && onCapture) {
|
||||
@@ -807,30 +1010,57 @@ export function PhoneSimulator({
|
||||
</div>
|
||||
) : (
|
||||
<div className="absolute bottom-[5%] left-0 w-full px-[8%] z-40">
|
||||
<div className="w-full flex items-center justify-center gap-8">
|
||||
{phoneMode !== "important_message" && phoneMode !== "hand_off" && (
|
||||
<div className="w-full flex flex-col items-center justify-center gap-4">
|
||||
{/* Push-to-Talk Button - Centered and Bigger */}
|
||||
{phoneMode !== "important_message" && phoneMode !== "hand_off" && voiceAssistant.agent && (
|
||||
<button
|
||||
className={`p-4 rounded-full backdrop-blur-md transition-colors ${
|
||||
!isMicEnabled
|
||||
? "bg-white text-black"
|
||||
: "bg-gray-600/50 text-white hover:bg-gray-600/70"
|
||||
ref={pushToTalkButtonRef}
|
||||
className={`w-24 h-24 rounded-full backdrop-blur-md transition-all flex flex-col items-center justify-center gap-2 ${
|
||||
interruptRejected
|
||||
? "bg-red-500/70 text-white"
|
||||
: isPushToTalkActive
|
||||
? "bg-green-500 text-white scale-110 shadow-lg shadow-green-500/50"
|
||||
: "bg-blue-500/70 text-white hover:bg-blue-500/90"
|
||||
}`}
|
||||
onClick={handleMicToggle}
|
||||
onMouseDown={handlePushToTalkMouseDown}
|
||||
onMouseUp={handlePushToTalkMouseUp}
|
||||
onTouchStart={handlePushToTalkTouchStart}
|
||||
onTouchEnd={handlePushToTalkTouchEnd}
|
||||
title={supportsPushToTalk ? "Push to Talk" : "Push to Talk (may not be supported by this agent)"}
|
||||
>
|
||||
{isMicEnabled ? (
|
||||
<MicIcon className="w-6 h-6" />
|
||||
) : (
|
||||
<MicOffIcon className="w-6 h-6" />
|
||||
)}
|
||||
<MicIcon className="w-8 h-8" />
|
||||
<span className="text-xs font-medium">
|
||||
{interruptRejected ? "不允许打断" : "按住说话"}
|
||||
</span>
|
||||
</button>
|
||||
)}
|
||||
|
||||
<button
|
||||
className="p-4 rounded-full bg-red-500 text-white hover:bg-red-600 transition-colors"
|
||||
onClick={handleDisconnect}
|
||||
>
|
||||
<PhoneOffIcon className="w-6 h-6" />
|
||||
</button>
|
||||
{/* Other Controls */}
|
||||
<div className="w-full flex items-center justify-center gap-8">
|
||||
{phoneMode !== "important_message" && phoneMode !== "hand_off" && (
|
||||
<button
|
||||
className={`p-4 rounded-full backdrop-blur-md transition-colors ${
|
||||
!isMicEnabled
|
||||
? "bg-white text-black"
|
||||
: "bg-gray-600/50 text-white hover:bg-gray-600/70"
|
||||
}`}
|
||||
onClick={handleMicToggle}
|
||||
>
|
||||
{isMicEnabled ? (
|
||||
<MicIcon className="w-6 h-6" />
|
||||
) : (
|
||||
<MicOffIcon className="w-6 h-6" />
|
||||
)}
|
||||
</button>
|
||||
)}
|
||||
|
||||
<button
|
||||
className="p-4 rounded-full bg-red-500 text-white hover:bg-red-600 transition-colors"
|
||||
onClick={handleDisconnect}
|
||||
>
|
||||
<PhoneOffIcon className="w-6 h-6" />
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
|
||||
@@ -108,6 +108,16 @@ export function HomeInner() {
|
||||
token={token}
|
||||
connect={shouldConnect}
|
||||
onError={(e) => {
  // Errors caused by push-to-talk interrupt failures are expected and are
  // already handled inside the PhoneSimulator component, so they are
  // filtered out here instead of being shown as a toast.
  const expectedFragments = [
    "Application error in method handler",
    "Method not supported at destination",
  ];
  const isExpected = expectedFragments.some((fragment) =>
    e.message?.includes(fragment)
  );

  if (isExpected) {
    if (process.env.NODE_ENV === 'development') {
      console.log("Filtered expected error:", e.message);
    }
    return;
  }

  setToastMessage({ message: e.message, type: "error" });
  console.error(e);
}}
|
||||
|
||||
Reference in New Issue
Block a user