a better push to talk layout

This commit is contained in:
Xin Wang 2025-12-16 15:56:46 +08:00
parent 1774f550dd
commit e09e4b6930
2 changed files with 156 additions and 48 deletions

View File

@ -1016,6 +1016,9 @@ async def entrypoint(ctx: JobContext, avatar_dispatcher_url: str = None, vision_
# disable input audio at the start # disable input audio at the start
session.input.set_audio_enabled(False) session.input.set_audio_enabled(False)
# Track current audio state for mode switching
_audio_enabled_state = False
@ctx.room.local_participant.register_rpc_method("start_turn") @ctx.room.local_participant.register_rpc_method("start_turn")
async def start_turn(data: rtc.RpcInvocationData): async def start_turn(data: rtc.RpcInvocationData):
@ -1053,6 +1056,16 @@ async def entrypoint(ctx: JobContext, avatar_dispatcher_url: str = None, vision_
session.clear_user_turn() session.clear_user_turn()
logger.info("cancel turn") logger.info("cancel turn")
@ctx.room.local_participant.register_rpc_method("switch_ptt_and_rt")
async def switch_ptt_and_rt(data: rtc.RpcInvocationData):
nonlocal _audio_enabled_state
# Toggle audio input state
_audio_enabled_state = not _audio_enabled_state
session.input.set_audio_enabled(_audio_enabled_state)
mode = "push-to-talk" if not _audio_enabled_state else "realtime"
logger.info(f"Switched to {mode} mode (audio enabled: {_audio_enabled_state})")
return json.dumps({"success": True, "mode": mode, "audio_enabled": _audio_enabled_state})
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--avatar-url", type=str, default=None, help="Avatar dispatcher URL") parser.add_argument("--avatar-url", type=str, default=None, help="Avatar dispatcher URL")

View File

@ -65,6 +65,7 @@ export function PhoneSimulator({
const lastPhoneMode = useRef(phoneMode); const lastPhoneMode = useRef(phoneMode);
const [isPushToTalkActive, setIsPushToTalkActive] = useState(false); const [isPushToTalkActive, setIsPushToTalkActive] = useState(false);
const [interruptRejected, setInterruptRejected] = useState(false); const [interruptRejected, setInterruptRejected] = useState(false);
const [isPushToTalkMode, setIsPushToTalkMode] = useState(true); // false = realtime mode, true = PTT mode (default)
const pushToTalkButtonRef = useRef<HTMLButtonElement>(null); const pushToTalkButtonRef = useRef<HTMLButtonElement>(null);
useEffect(() => { useEffect(() => {
@ -428,6 +429,23 @@ export function PhoneSimulator({
setShowVoiceMenu(!showVoiceMenu); setShowVoiceMenu(!showVoiceMenu);
}; };
/**
 * Asks the agent to switch between push-to-talk and realtime input, then
 * mirrors the mode the agent reports back.
 *
 * The agent's reply (a JSON string carrying a `mode` field) is treated as the
 * source of truth so the toggle cannot drift away from the agent's actual
 * audio state. If the reply cannot be interpreted, the local flag is flipped
 * as before.
 */
const handleModeSwitch = async () => {
  if (!room || !voiceAssistant.agent) return;

  // Name the target mode in the payload; an agent that ignores the payload
  // simply toggles, and the reply below still tells us where it ended up.
  const requestedMode = isPushToTalkMode ? "realtime" : "push-to-talk";

  try {
    const response = await room.localParticipant.performRpc({
      destinationIdentity: voiceAssistant.agent.identity,
      method: "switch_ptt_and_rt",
      payload: requestedMode,
    });

    let confirmedMode: unknown;
    try {
      const parsed: unknown = JSON.parse(response);
      if (typeof parsed === "object" && parsed !== null) {
        confirmedMode = (parsed as { mode?: unknown }).mode;
      }
    } catch {
      // Reply was not JSON; handled by the fallback below.
    }

    if (confirmedMode === "push-to-talk" || confirmedMode === "realtime") {
      setIsPushToTalkMode(confirmedMode === "push-to-talk");
    } else {
      // No usable mode in the reply: keep the original optimistic flip.
      setIsPushToTalkMode((prev) => !prev);
    }
  } catch (error: unknown) {
    console.error("Failed to switch mode:", error);
    // Don't show error toast for mode switch failures, just log
  }
};
// Check if agent supports push-to-talk (optional check, button will show regardless) // Check if agent supports push-to-talk (optional check, button will show regardless)
const supportsPushToTalk = useMemo(() => { const supportsPushToTalk = useMemo(() => {
if (!voiceAssistant.agent || !agentAttributes.attributes) return false; if (!voiceAssistant.agent || !agentAttributes.attributes) return false;
@ -513,6 +531,9 @@ export function PhoneSimulator({
}; };
const handlePushToTalkEnd = useCallback(async () => { const handlePushToTalkEnd = useCallback(async () => {
// Always clear interrupt rejection state when button is released
setInterruptRejected(false);
if (!room || !voiceAssistant.agent || !isPushToTalkActive) return; if (!room || !voiceAssistant.agent || !isPushToTalkActive) return;
try { try {
@ -522,16 +543,17 @@ export function PhoneSimulator({
payload: "", payload: "",
}); });
setIsPushToTalkActive(false); setIsPushToTalkActive(false);
setInterruptRejected(false);
} catch (error: any) { } catch (error: any) {
console.error("Failed to end turn:", error); console.error("Failed to end turn:", error);
// Don't show error toast on end_turn failure as it might be called during cleanup // Don't show error toast on end_turn failure as it might be called during cleanup
setIsPushToTalkActive(false); setIsPushToTalkActive(false);
setInterruptRejected(false);
} }
}, [room, voiceAssistant.agent, isPushToTalkActive]); }, [room, voiceAssistant.agent, isPushToTalkActive]);
const handlePushToTalkCancel = useCallback(async () => { const handlePushToTalkCancel = useCallback(async () => {
// Always clear interrupt rejection state when button is cancelled
setInterruptRejected(false);
if (!room || !voiceAssistant.agent || !isPushToTalkActive) return; if (!room || !voiceAssistant.agent || !isPushToTalkActive) return;
try { try {
@ -541,11 +563,9 @@ export function PhoneSimulator({
payload: "", payload: "",
}); });
setIsPushToTalkActive(false); setIsPushToTalkActive(false);
setInterruptRejected(false);
} catch (error) { } catch (error) {
console.error("Failed to cancel turn:", error); console.error("Failed to cancel turn:", error);
setIsPushToTalkActive(false); setIsPushToTalkActive(false);
setInterruptRejected(false);
} }
}, [room, voiceAssistant.agent, isPushToTalkActive]); }, [room, voiceAssistant.agent, isPushToTalkActive]);
@ -587,10 +607,14 @@ export function PhoneSimulator({
// Handle global mouseup/touchend to end push-to-talk even if released outside button // Handle global mouseup/touchend to end push-to-talk even if released outside button
const handleGlobalMouseUp = () => { const handleGlobalMouseUp = () => {
// Clear interrupt rejection state immediately when button is released
setInterruptRejected(false);
handlePushToTalkEnd(); handlePushToTalkEnd();
}; };
const handleGlobalTouchEnd = () => { const handleGlobalTouchEnd = () => {
// Clear interrupt rejection state immediately when button is released
setInterruptRejected(false);
handlePushToTalkEnd(); handlePushToTalkEnd();
}; };
@ -1011,56 +1035,127 @@ export function PhoneSimulator({
) : ( ) : (
<div className="absolute bottom-[5%] left-0 w-full px-[8%] z-40"> <div className="absolute bottom-[5%] left-0 w-full px-[8%] z-40">
<div className="w-full flex flex-col items-center justify-center gap-4"> <div className="w-full flex flex-col items-center justify-center gap-4">
{/* Push-to-Talk Button - Centered and Bigger */} {/* Mode Toggle Switch */}
{phoneMode !== "important_message" && phoneMode !== "hand_off" && voiceAssistant.agent && ( {phoneMode !== "important_message" && phoneMode !== "hand_off" && voiceAssistant.agent && (
<button <div className="flex items-center gap-3 mb-2">
ref={pushToTalkButtonRef} <span className={`text-xs font-medium transition-colors ${isPushToTalkMode ? "text-white" : "text-gray-400"}`}>
className={`w-24 h-24 rounded-full backdrop-blur-md transition-all flex flex-col items-center justify-center gap-2 ${
interruptRejected
? "bg-red-500/70 text-white"
: isPushToTalkActive
? "bg-green-500 text-white scale-110 shadow-lg shadow-green-500/50"
: "bg-blue-500/70 text-white hover:bg-blue-500/90"
}`}
onMouseDown={handlePushToTalkMouseDown}
onMouseUp={handlePushToTalkMouseUp}
onTouchStart={handlePushToTalkTouchStart}
onTouchEnd={handlePushToTalkTouchEnd}
title={supportsPushToTalk ? "Push to Talk" : "Push to Talk (may not be supported by this agent)"}
>
<MicIcon className="w-8 h-8" />
<span className="text-xs font-medium">
{interruptRejected ? "不允许打断" : "按住说话"}
</span> </span>
</button> <button
onClick={handleModeSwitch}
className={`relative inline-flex h-6 w-11 items-center rounded-full transition-colors focus:outline-none focus:ring-2 focus:ring-blue-500 focus:ring-offset-2 ${
!isPushToTalkMode ? "bg-blue-500" : "bg-gray-600"
}`}
role="switch"
aria-checked={!isPushToTalkMode}
title={isPushToTalkMode ? "切换到实时对话模式" : "切换到按下说话模式"}
>
<span
className={`inline-block h-4 w-4 transform rounded-full bg-white transition-transform ${
!isPushToTalkMode ? "translate-x-6" : "translate-x-1"
}`}
/>
</button>
<span className={`text-xs font-medium transition-colors ${!isPushToTalkMode ? "text-white" : "text-gray-400"}`}>
</span>
</div>
)} )}
{/* Other Controls */} {/* Push-to-Talk Mode Layout */}
<div className="w-full flex items-center justify-center gap-8"> {isPushToTalkMode && phoneMode !== "important_message" && phoneMode !== "hand_off" && voiceAssistant.agent && (
{phoneMode !== "important_message" && phoneMode !== "hand_off" && ( <div className="w-full flex items-center justify-center gap-8">
<button {/* Camera Switch Button - Left */}
className={`p-4 rounded-full backdrop-blur-md transition-colors ${ <div className="relative">
!isMicEnabled <button
? "bg-white text-black" className="p-4 rounded-full bg-gray-800/50 text-white hover:bg-gray-800/70 transition-colors"
: "bg-gray-600/50 text-white hover:bg-gray-600/70" onClick={handleSwitchCamera}
}`} >
onClick={handleMicToggle} <SwitchCameraIcon className="w-6 h-6" />
> </button>
{isMicEnabled ? ( {showCameraMenu && (
<MicIcon className="w-6 h-6" /> <div className="absolute bottom-full mb-2 left-0 bg-gray-900 border border-gray-800 rounded-lg shadow-xl py-2 w-48 z-50">
) : ( {cameras.length === 0 ? (
<MicOffIcon className="w-6 h-6" /> <div className="px-4 py-2 text-gray-500 text-sm">
No cameras found
</div>
) : (
cameras.map((device) => (
<button
key={device.deviceId}
onClick={() => handleSelectCamera(device.deviceId)}
className="w-full text-left px-4 py-2 text-sm text-white hover:bg-gray-800 transition-colors truncate"
>
{device.label ||
`Camera ${cameras.indexOf(device) + 1}`}
</button>
))
)}
</div>
)} )}
</button> </div>
)}
<button {/* Large Push-to-Talk Button - Center */}
className="p-4 rounded-full bg-red-500 text-white hover:bg-red-600 transition-colors" <button
onClick={handleDisconnect} ref={pushToTalkButtonRef}
> className={`w-24 h-24 rounded-full backdrop-blur-md transition-all flex flex-col items-center justify-center gap-2 ${
<PhoneOffIcon className="w-6 h-6" /> interruptRejected
</button> ? "bg-red-500/70 text-white"
</div> : isPushToTalkActive
? "bg-green-500 text-white scale-110 shadow-lg shadow-green-500/50"
: "bg-blue-500/70 text-white hover:bg-blue-500/90"
}`}
onMouseDown={handlePushToTalkMouseDown}
onMouseUp={handlePushToTalkMouseUp}
onTouchStart={handlePushToTalkTouchStart}
onTouchEnd={handlePushToTalkTouchEnd}
title={supportsPushToTalk ? "Push to Talk" : "Push to Talk (may not be supported by this agent)"}
>
<MicIcon className="w-8 h-8" />
<span className="text-xs font-medium">
{interruptRejected ? "不允许打断" : "按住说话"}
</span>
</button>
{/* End Call Button - Right */}
<button
className="p-4 rounded-full bg-red-500 text-white hover:bg-red-600 transition-colors"
onClick={handleDisconnect}
>
<PhoneOffIcon className="w-6 h-6" />
</button>
</div>
)}
{/* Realtime Mode Layout */}
{!isPushToTalkMode && (
<div className="w-full flex items-center justify-center gap-8">
{phoneMode !== "important_message" && phoneMode !== "hand_off" && (
<button
className={`p-4 rounded-full backdrop-blur-md transition-colors ${
!isMicEnabled
? "bg-white text-black"
: "bg-gray-600/50 text-white hover:bg-gray-600/70"
}`}
onClick={handleMicToggle}
>
{isMicEnabled ? (
<MicIcon className="w-6 h-6" />
) : (
<MicOffIcon className="w-6 h-6" />
)}
</button>
)}
{/* End Call Button */}
<button
className="p-4 rounded-full bg-red-500 text-white hover:bg-red-600 transition-colors"
onClick={handleDisconnect}
>
<PhoneOffIcon className="w-6 h-6" />
</button>
</div>
)}
</div> </div>
</div> </div>
) )