first version push to talk
This commit is contained in:
@@ -31,6 +31,7 @@ from livekit.agents import (
|
|||||||
cli,
|
cli,
|
||||||
get_job_context,
|
get_job_context,
|
||||||
metrics,
|
metrics,
|
||||||
|
RoomIO
|
||||||
)
|
)
|
||||||
from livekit.agents.llm import ImageContent, ToolError, function_tool
|
from livekit.agents.llm import ImageContent, ToolError, function_tool
|
||||||
from typing import Any, List, Optional
|
from typing import Any, List, Optional
|
||||||
@@ -953,6 +954,8 @@ async def entrypoint(ctx: JobContext, avatar_dispatcher_url: str = None, vision_
|
|||||||
# Increase the maximum number of function calls per turn to avoid hitting the limit
|
# Increase the maximum number of function calls per turn to avoid hitting the limit
|
||||||
max_tool_steps=15,
|
max_tool_steps=15,
|
||||||
)
|
)
|
||||||
|
room_io = RoomIO(session, room=ctx.room)
|
||||||
|
await room_io.start()
|
||||||
|
|
||||||
# log metrics as they are emitted, and total usage after session is over
|
# log metrics as they are emitted, and total usage after session is over
|
||||||
usage_collector = metrics.UsageCollector()
|
usage_collector = metrics.UsageCollector()
|
||||||
@@ -1011,6 +1014,45 @@ async def entrypoint(ctx: JobContext, avatar_dispatcher_url: str = None, vision_
|
|||||||
room_output_options=RoomOutputOptions(transcription_enabled=True),
|
room_output_options=RoomOutputOptions(transcription_enabled=True),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# disable input audio at the start
|
||||||
|
session.input.set_audio_enabled(False)
|
||||||
|
|
||||||
|
@ctx.room.local_participant.register_rpc_method("start_turn")
async def start_turn(data: rtc.RpcInvocationData):
    """RPC handler: begin a push-to-talk turn for the calling participant.

    Interrupts the session, clears the pending user turn, points ``room_io``
    at the caller and enables input audio.

    Raises:
        rtc.RpcError: with code 13 when ``session.interrupt()`` raises
            ``RuntimeError``.
    """
    try:
        session.interrupt()
    except RuntimeError as e:
        logger.error("Failed to interrupt session: %s", e)
        # Raise an RPC error so the client can detect the interrupt failure.
        # The code and message are kept exactly as-is: the caller identifies
        # this failure by matching on them.
        raise rtc.RpcError(
            code=13,  # ERROR_INTERNAL
            message="Application error in method handler",
        ) from e  # keep the original failure attached as __cause__

    session.clear_user_turn()

    # listen to the caller if multi-user
    room_io.set_participant(data.caller_identity)
    session.input.set_audio_enabled(True)
|
||||||
|
|
||||||
|
@ctx.room.local_participant.register_rpc_method("end_turn")
async def end_turn(data: rtc.RpcInvocationData):
    """RPC handler: finish the push-to-talk turn and commit what was said.

    Disables input audio, then commits the user turn.
    """
    session.input.set_audio_enabled(False)

    # How long to wait for the final transcript after committing the user
    # turn; increase this value if the STT is slow to respond.
    final_transcript_timeout = 10.0
    # Duration of the silence appended to the STT so that it emits the final
    # transcript.
    trailing_silence = 2.0

    session.commit_user_turn(
        transcript_timeout=final_transcript_timeout,
        stt_flush_duration=trailing_silence,
    )
|
||||||
|
|
||||||
|
@ctx.room.local_participant.register_rpc_method("cancel_turn")
async def cancel_turn(data: rtc.RpcInvocationData):
    """RPC handler: abandon the current push-to-talk turn.

    Disables input audio and clears the user turn instead of committing it.
    """
    audio_input = session.input
    audio_input.set_audio_enabled(False)
    session.clear_user_turn()
    logger.info("cancel turn")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument("--avatar-url", type=str, default=None, help="Avatar dispatcher URL")
|
parser.add_argument("--avatar-url", type=str, default=None, help="Avatar dispatcher URL")
|
||||||
|
|||||||
@@ -5,13 +5,14 @@ import {
|
|||||||
BarVisualizer,
|
BarVisualizer,
|
||||||
useConnectionState,
|
useConnectionState,
|
||||||
useLocalParticipant,
|
useLocalParticipant,
|
||||||
|
useParticipantAttributes,
|
||||||
useRoomContext,
|
useRoomContext,
|
||||||
useTracks,
|
useTracks,
|
||||||
useVoiceAssistant,
|
useVoiceAssistant,
|
||||||
VideoTrack,
|
VideoTrack,
|
||||||
} from "@livekit/components-react";
|
} from "@livekit/components-react";
|
||||||
import { ConnectionState, Track, LocalParticipant, Room } from "livekit-client";
|
import { ConnectionState, Track, LocalParticipant, Room } from "livekit-client";
|
||||||
import { useEffect, useMemo, useState, useRef } from "react";
|
import { useEffect, useMemo, useState, useRef, useCallback } from "react";
|
||||||
import { BatteryIcon, ImageIcon, MicIcon, MicOffIcon, PhoneIcon, PhoneOffIcon, WifiIcon, SwitchCameraIcon, VoiceIcon, CheckIcon } from "./icons";
|
import { BatteryIcon, ImageIcon, MicIcon, MicOffIcon, PhoneIcon, PhoneOffIcon, WifiIcon, SwitchCameraIcon, VoiceIcon, CheckIcon } from "./icons";
|
||||||
import { useToast } from "@/components/toast/ToasterProvider";
|
import { useToast } from "@/components/toast/ToasterProvider";
|
||||||
|
|
||||||
@@ -43,6 +44,9 @@ export function PhoneSimulator({
|
|||||||
const { localParticipant, isMicrophoneEnabled: isMicEnabled } = useLocalParticipant();
|
const { localParticipant, isMicrophoneEnabled: isMicEnabled } = useLocalParticipant();
|
||||||
const tracks = useTracks();
|
const tracks = useTracks();
|
||||||
const voiceAssistant = useVoiceAssistant();
|
const voiceAssistant = useVoiceAssistant();
|
||||||
|
const agentAttributes = useParticipantAttributes({
|
||||||
|
participant: voiceAssistant.agent,
|
||||||
|
});
|
||||||
const fileInputRef = useRef<HTMLInputElement>(null);
|
const fileInputRef = useRef<HTMLInputElement>(null);
|
||||||
const phoneContainerRef = useRef<HTMLDivElement>(null);
|
const phoneContainerRef = useRef<HTMLDivElement>(null);
|
||||||
const visualizerRef = useRef<HTMLDivElement>(null);
|
const visualizerRef = useRef<HTMLDivElement>(null);
|
||||||
@@ -59,6 +63,9 @@ export function PhoneSimulator({
|
|||||||
const isAgentSpeaking = voiceAssistant.state === "speaking";
|
const isAgentSpeaking = voiceAssistant.state === "speaking";
|
||||||
const wasMicEnabledRef = useRef(false);
|
const wasMicEnabledRef = useRef(false);
|
||||||
const lastPhoneMode = useRef(phoneMode);
|
const lastPhoneMode = useRef(phoneMode);
|
||||||
|
const [isPushToTalkActive, setIsPushToTalkActive] = useState(false);
|
||||||
|
const [interruptRejected, setInterruptRejected] = useState(false);
|
||||||
|
const pushToTalkButtonRef = useRef<HTMLButtonElement>(null);
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
const voiceAttr = config.settings.attributes?.find(a => a.key === "voice");
|
const voiceAttr = config.settings.attributes?.find(a => a.key === "voice");
|
||||||
@@ -421,6 +428,202 @@ export function PhoneSimulator({
|
|||||||
setShowVoiceMenu(!showVoiceMenu);
|
setShowVoiceMenu(!showVoiceMenu);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// True when an agent is present and its "push-to-talk" participant attribute
// is "1". This is an optional capability hint only.
const supportsPushToTalk = useMemo(
  () =>
    Boolean(voiceAssistant.agent) &&
    agentAttributes.attributes?.["push-to-talk"] === "1",
  [voiceAssistant.agent, agentAttributes.attributes],
);
|
||||||
|
|
||||||
|
// Pending timer that hides the "interrupt rejected" hint. Kept in a ref so a
// new rejection replaces the running timer instead of stacking another one
// (a stale timer would otherwise clear a newer hint too early).
const interruptRejectedTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
// True while a start_turn RPC is awaiting its response.
const startTurnInFlightRef = useRef(false);

/**
 * Ask the agent to start a push-to-talk turn via the "start_turn" RPC.
 *
 * On success the push-to-talk state becomes active. Failures are classified:
 *  - method not registered at the destination: logged and ignored;
 *  - interrupt rejected by the agent: the rejection hint is shown for 3 s;
 *  - anything else: logged and surfaced as an error toast.
 *
 * NOTE(review): if the button is released before this RPC resolves,
 * handlePushToTalkEnd still sees isPushToTalkActive === false and returns
 * early, so the turn stays open until the next release. Confirm whether that
 * needs handling.
 */
const handlePushToTalkStart = async () => {
  if (!room || !voiceAssistant.agent || isPushToTalkActive) return;
  // isPushToTalkActive only flips after the RPC resolves, so without this
  // guard a second press arriving meanwhile would send a duplicate start_turn.
  if (startTurnInFlightRef.current) return;
  startTurnInFlightRef.current = true;

  // Show the rejection hint and (re)start the 3 second timer that hides it.
  const flagInterruptRejected = (logLabel: string, detail: string) => {
    // Expected outcome: only log in development to avoid noisy error output.
    if (process.env.NODE_ENV === 'development') {
      console.log(logLabel, detail);
    }
    setInterruptRejected(true);
    if (interruptRejectedTimerRef.current !== null) {
      clearTimeout(interruptRejectedTimerRef.current);
    }
    interruptRejectedTimerRef.current = setTimeout(() => {
      interruptRejectedTimerRef.current = null;
      setInterruptRejected(false);
    }, 3000);
  };

  // Reset interrupt rejection state
  setInterruptRejected(false);

  try {
    await room.localParticipant.performRpc({
      destinationIdentity: voiceAssistant.agent.identity,
      method: "start_turn",
      payload: "",
    });
    setIsPushToTalkActive(true);
  } catch (error: any) {
    // Every expected failure is handled here so nothing propagates to a
    // React error boundary.
    setIsPushToTalkActive(false);

    const errorMessage = error?.message || "";
    const errorCode = error?.code;

    // The RPC method is not registered at the destination (yet). This can
    // happen on the first call before the agent is fully ready, so it is
    // ignored.
    const methodUnavailable =
      errorMessage.includes("Method not supported at destination") ||
      errorMessage.includes("method not found") ||
      errorCode === 1400 || // LiveKit RpcError UNSUPPORTED_METHOD
      errorCode === 12; // legacy check, kept for backward compatibility
    if (methodUnavailable) {
      console.log("RPC method not ready yet, will be available after first turn");
      return;
    }

    // The agent's start_turn handler failed to interrupt the session. It
    // reports this with code 13 and an "Application error ..." message; the
    // button shows the "interruption not allowed" label instead of an error.
    const interruptFailed =
      errorMessage.includes("Application error") ||
      errorCode === 13 || // ERROR_INTERNAL, raised by the agent handler
      (isAgentSpeaking && errorMessage.includes("interrupt"));
    if (interruptFailed) {
      flagInterruptRejected("Interrupt rejected (expected behavior):", errorMessage);
      return;
    }

    // While the agent is speaking, also treat generic rejection indicators
    // as a refused interruption.
    const rejectedWhileSpeaking =
      isAgentSpeaking &&
      (errorMessage.includes("reject") ||
        errorMessage.includes("not allowed") ||
        errorCode === 403 || // Forbidden
        errorCode === 409); // Conflict
    if (rejectedWhileSpeaking) {
      flagInterruptRejected("Interrupt rejected:", errorMessage);
      return;
    }

    // Only log and show a toast for unexpected errors
    console.error("Unexpected error in push-to-talk:", error);
    const defaultErrorMessage = "Agent does not support push-to-talk. Make sure your agent has the push-to-talk RPC methods (start_turn, end_turn, cancel_turn) registered.";
    setToastMessage({ message: defaultErrorMessage, type: "error" });
  } finally {
    startTurnInFlightRef.current = false;
  }
};
|
||||||
|
|
||||||
|
// True while an end_turn RPC is awaiting its response.
const endTurnInFlightRef = useRef(false);

/**
 * Finish the active push-to-talk turn via the "end_turn" RPC and reset the
 * local push-to-talk state, whether or not the RPC succeeds.
 */
const handlePushToTalkEnd = useCallback(async () => {
  if (!room || !voiceAssistant.agent || !isPushToTalkActive) return;
  // A release over the button reaches both the button's own
  // onMouseUp/onTouchEnd handler and the window-level listener. Both calls
  // see isPushToTalkActive === true because state only changes after the
  // await, so without this guard end_turn would be sent twice per release.
  if (endTurnInFlightRef.current) return;
  endTurnInFlightRef.current = true;

  try {
    await room.localParticipant.performRpc({
      destinationIdentity: voiceAssistant.agent.identity,
      method: "end_turn",
      payload: "",
    });
  } catch (error) {
    // Don't show an error toast on end_turn failure as it might be called
    // during cleanup.
    console.error("Failed to end turn:", error);
  } finally {
    endTurnInFlightRef.current = false;
    setIsPushToTalkActive(false);
    setInterruptRejected(false);
  }
}, [room, voiceAssistant.agent, isPushToTalkActive]);
|
||||||
|
|
||||||
|
/**
 * Abort the active push-to-talk turn via the "cancel_turn" RPC. The local
 * push-to-talk state is reset whether or not the RPC succeeds.
 */
const handlePushToTalkCancel = useCallback(async () => {
  const agent = voiceAssistant.agent;
  if (!room || !agent || !isPushToTalkActive) return;

  try {
    await room.localParticipant.performRpc({
      destinationIdentity: agent.identity,
      method: "cancel_turn",
      payload: "",
    });
  } catch (error) {
    console.error("Failed to cancel turn:", error);
  } finally {
    // Same reset on both the success and the failure path.
    setIsPushToTalkActive(false);
    setInterruptRejected(false);
  }
}, [room, voiceAssistant.agent, isPushToTalkActive]);
|
||||||
|
|
||||||
|
// Mouse bindings for push-to-talk: pressing starts a turn, releasing ends it.
// The default action is suppressed on both events.
const handlePushToTalkMouseDown = (event: React.MouseEvent) => {
  event.preventDefault();
  handlePushToTalkStart();
};

const handlePushToTalkMouseUp = (event: React.MouseEvent) => {
  event.preventDefault();
  handlePushToTalkEnd();
};
|
||||||
|
|
||||||
|
// Touch bindings for push-to-talk, mirroring the mouse handlers: touching
// starts a turn, lifting the finger ends it.
const handlePushToTalkTouchStart = (event: React.TouchEvent) => {
  event.preventDefault();
  handlePushToTalkStart();
};

const handlePushToTalkTouchEnd = (event: React.TouchEvent) => {
  event.preventDefault();
  handlePushToTalkEnd();
};
|
||||||
|
|
||||||
|
// While a push-to-talk turn is active, listen on the window so the turn is
// cancelled on blur or Escape, and ended on any mouseup/touchend (covers a
// release that happens outside the button).
useEffect(() => {
  if (!isPushToTalkActive) return;

  const cancelTurn = () => {
    handlePushToTalkCancel();
  };
  const endTurn = () => {
    handlePushToTalkEnd();
  };
  const cancelOnEscape = (event: KeyboardEvent) => {
    if (event.key !== "Escape") return;
    handlePushToTalkCancel();
  };

  window.addEventListener("blur", cancelTurn);
  window.addEventListener("keydown", cancelOnEscape);
  window.addEventListener("mouseup", endTurn);
  window.addEventListener("touchend", endTurn);

  return () => {
    window.removeEventListener("blur", cancelTurn);
    window.removeEventListener("keydown", cancelOnEscape);
    window.removeEventListener("mouseup", endTurn);
    window.removeEventListener("touchend", endTurn);
  };
}, [isPushToTalkActive, handlePushToTalkCancel, handlePushToTalkEnd]);
|
||||||
|
|
||||||
|
// Drop any active push-to-talk state once the room reports Disconnected.
useEffect(() => {
  const disconnected = roomState === ConnectionState.Disconnected;
  if (!disconnected || !isPushToTalkActive) return;

  setIsPushToTalkActive(false);
  setInterruptRejected(false);
}, [roomState, isPushToTalkActive]);
|
||||||
|
|
||||||
|
// When the agent is not speaking and a rejection hint is showing, clear the
// hint after one second. The timer is cancelled if either input changes first.
useEffect(() => {
  if (isAgentSpeaking || !interruptRejected) return;

  const clearRejection = setTimeout(() => setInterruptRejected(false), 1000);
  return () => clearTimeout(clearRejection);
}, [isAgentSpeaking, interruptRejected]);
|
||||||
|
|
||||||
const handleFileChange = (event: React.ChangeEvent<HTMLInputElement>) => {
|
const handleFileChange = (event: React.ChangeEvent<HTMLInputElement>) => {
|
||||||
const file = event.target.files?.[0];
|
const file = event.target.files?.[0];
|
||||||
if (file && onCapture) {
|
if (file && onCapture) {
|
||||||
@@ -807,6 +1010,32 @@ export function PhoneSimulator({
|
|||||||
</div>
|
</div>
|
||||||
) : (
|
) : (
|
||||||
<div className="absolute bottom-[5%] left-0 w-full px-[8%] z-40">
|
<div className="absolute bottom-[5%] left-0 w-full px-[8%] z-40">
|
||||||
|
<div className="w-full flex flex-col items-center justify-center gap-4">
|
||||||
|
{/* Push-to-Talk Button - Centered and Bigger */}
|
||||||
|
{phoneMode !== "important_message" && phoneMode !== "hand_off" && voiceAssistant.agent && (
|
||||||
|
<button
|
||||||
|
ref={pushToTalkButtonRef}
|
||||||
|
className={`w-24 h-24 rounded-full backdrop-blur-md transition-all flex flex-col items-center justify-center gap-2 ${
|
||||||
|
interruptRejected
|
||||||
|
? "bg-red-500/70 text-white"
|
||||||
|
: isPushToTalkActive
|
||||||
|
? "bg-green-500 text-white scale-110 shadow-lg shadow-green-500/50"
|
||||||
|
: "bg-blue-500/70 text-white hover:bg-blue-500/90"
|
||||||
|
}`}
|
||||||
|
onMouseDown={handlePushToTalkMouseDown}
|
||||||
|
onMouseUp={handlePushToTalkMouseUp}
|
||||||
|
onTouchStart={handlePushToTalkTouchStart}
|
||||||
|
onTouchEnd={handlePushToTalkTouchEnd}
|
||||||
|
title={supportsPushToTalk ? "Push to Talk" : "Push to Talk (may not be supported by this agent)"}
|
||||||
|
>
|
||||||
|
<MicIcon className="w-8 h-8" />
|
||||||
|
<span className="text-xs font-medium">
|
||||||
|
{interruptRejected ? "不允许打断" : "按住说话"}
|
||||||
|
</span>
|
||||||
|
</button>
|
||||||
|
)}
|
||||||
|
|
||||||
|
{/* Other Controls */}
|
||||||
<div className="w-full flex items-center justify-center gap-8">
|
<div className="w-full flex items-center justify-center gap-8">
|
||||||
{phoneMode !== "important_message" && phoneMode !== "hand_off" && (
|
{phoneMode !== "important_message" && phoneMode !== "hand_off" && (
|
||||||
<button
|
<button
|
||||||
@@ -833,6 +1062,7 @@ export function PhoneSimulator({
|
|||||||
</button>
|
</button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
</div>
|
||||||
)
|
)
|
||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -108,6 +108,16 @@ export function HomeInner() {
|
|||||||
token={token}
|
token={token}
|
||||||
connect={shouldConnect}
|
connect={shouldConnect}
|
||||||
onError={(e) => {
|
onError={(e) => {
|
||||||
|
// Filter out expected errors from push-to-talk interrupt failures
|
||||||
|
// These are handled gracefully in the PhoneSimulator component
|
||||||
|
if (e.message?.includes("Application error in method handler") ||
|
||||||
|
e.message?.includes("Method not supported at destination")) {
|
||||||
|
// Silently ignore - these are expected and handled in PhoneSimulator
|
||||||
|
if (process.env.NODE_ENV === 'development') {
|
||||||
|
console.log("Filtered expected error:", e.message);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
setToastMessage({ message: e.message, type: "error" });
|
setToastMessage({ message: e.message, type: "error" });
|
||||||
console.error(e);
|
console.error(e);
|
||||||
}}
|
}}
|
||||||
|
|||||||
Reference in New Issue
Block a user