import asyncio from typing import Any, Dict, List import pytest from runtime.pipeline.duplex import DuplexPipeline class _DummySileroVAD: def __init__(self, *args, **kwargs): pass def process_audio(self, _pcm: bytes) -> float: return 0.0 class _DummyVADProcessor: def __init__(self, *args, **kwargs): pass def process(self, _speech_prob: float): return "Silence", 0.0 class _DummyEouDetector: def __init__(self, *args, **kwargs): self.is_speaking = True def process(self, _vad_status: str, force_eligible: bool = False) -> bool: _ = force_eligible return False def reset(self) -> None: self.is_speaking = False class _FakeTransport: async def send_event(self, _event: Dict[str, Any]) -> None: return None async def send_audio(self, _audio: bytes) -> None: return None class _FakeStreamingASR: mode = "streaming" def __init__(self): self.begin_calls = 0 self.end_calls = 0 self.wait_calls = 0 self.sent_audio: List[bytes] = [] self.wait_text = "" async def connect(self) -> None: return None async def disconnect(self) -> None: return None async def send_audio(self, audio: bytes) -> None: self.sent_audio.append(audio) async def receive_transcripts(self): if False: yield None async def begin_utterance(self) -> None: self.begin_calls += 1 async def end_utterance(self) -> None: self.end_calls += 1 async def wait_for_final_transcription(self, timeout_ms: int = 800) -> str: _ = timeout_ms self.wait_calls += 1 return self.wait_text def clear_utterance(self) -> None: return None class _FakeOfflineASR: mode = "offline" def __init__(self): self.start_interim_calls = 0 self.stop_interim_calls = 0 self.sent_audio: List[bytes] = [] self.final_text = "offline final" async def connect(self) -> None: return None async def disconnect(self) -> None: return None async def send_audio(self, audio: bytes) -> None: self.sent_audio.append(audio) async def receive_transcripts(self): if False: yield None async def start_interim_transcription(self) -> None: self.start_interim_calls += 1 async def stop_interim_transcription(self) -> None: self.stop_interim_calls += 1 async def get_final_transcription(self) -> str: return self.final_text def clear_buffer(self) -> None: return None def get_and_clear_text(self) -> str: return self.final_text def _build_pipeline(monkeypatch, asr_service): monkeypatch.setattr("runtime.pipeline.duplex.SileroVAD", _DummySileroVAD) monkeypatch.setattr("runtime.pipeline.duplex.VADProcessor", _DummyVADProcessor) monkeypatch.setattr("runtime.pipeline.duplex.EouDetector", _DummyEouDetector) return DuplexPipeline( transport=_FakeTransport(), session_id="asr_mode_test", asr_service=asr_service, ) @pytest.mark.asyncio async def test_start_asr_capture_uses_streaming_begin(monkeypatch): asr = _FakeStreamingASR() pipeline = _build_pipeline(monkeypatch, asr) pipeline._asr_mode = "streaming" pipeline._pending_speech_audio = b"\x00" * 320 pipeline._pre_speech_buffer = b"\x00" * 640 await pipeline._start_asr_capture() assert asr.begin_calls == 1 assert asr.sent_audio assert pipeline._asr_capture_active is True @pytest.mark.asyncio async def test_start_asr_capture_uses_offline_interim_control(monkeypatch): asr = _FakeOfflineASR() pipeline = _build_pipeline(monkeypatch, asr) pipeline._asr_mode = "offline" pipeline._pending_speech_audio = b"\x00" * 320 pipeline._pre_speech_buffer = b"\x00" * 640 await pipeline._start_asr_capture() assert asr.start_interim_calls == 1 assert asr.sent_audio assert pipeline._asr_capture_active is True @pytest.mark.asyncio async def test_streaming_eou_falls_back_to_latest_interim(monkeypatch): asr = _FakeStreamingASR() asr.wait_text = "" pipeline = _build_pipeline(monkeypatch, asr) pipeline._asr_mode = "streaming" pipeline._asr_capture_active = True pipeline._latest_asr_interim_text = "fallback interim text" await pipeline.conversation.start_user_turn() captured_events = [] captured_turns = [] async def _capture_event(event: Dict[str, Any], priority: int = 20): _ = priority captured_events.append(event) async def _noop_stop_current_speech() -> None: return None async def _capture_turn(user_text: str, *args, **kwargs) -> None: _ = (args, kwargs) captured_turns.append(user_text) monkeypatch.setattr(pipeline, "_send_event", _capture_event) monkeypatch.setattr(pipeline, "_stop_current_speech", _noop_stop_current_speech) monkeypatch.setattr(pipeline, "_handle_turn", _capture_turn) await pipeline._on_end_of_utterance() await asyncio.sleep(0.05) assert asr.end_calls == 1 assert asr.wait_calls == 1 assert captured_turns == ["fallback interim text"] assert any(event.get("type") == "transcript.final" for event in captured_events)