fix long-run session bug: discard stale audio after barge-in interrupt; add TTFB latency tracking

Xin Wang
2026-02-03 12:05:09 +08:00
parent a2e341b433
commit 8bc24ded59
6 changed files with 343 additions and 11 deletions


@@ -5,3 +5,21 @@ Python Active-Call: real-time audio streaming with WebSocket and WebRTC.
This repo contains a Python 3.11+ codebase for building low-latency voice
pipelines (capture, stream, and process audio) using WebRTC and WebSockets.
It is currently in an early, experimental stage.
# Usage
Start the server:
```
uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
```
Test with the scripted WebSocket client:
```
python examples/test_websocket.py
```
Or with the live microphone client:
```
python mic_client.py
```


@@ -113,6 +113,10 @@ class DuplexPipeline:
# Interruption handling
self._interrupt_event = asyncio.Event()
# Latency tracking - TTFB (Time to First Byte)
self._turn_start_time: Optional[float] = None
self._first_audio_sent: bool = False
# Barge-in filtering - require minimum speech duration to interrupt
self._barge_in_speech_start_time: Optional[float] = None
self._barge_in_min_duration_ms: int = settings.barge_in_min_duration_ms if hasattr(settings, 'barge_in_min_duration_ms') else 50
@@ -396,6 +400,10 @@ class DuplexPipeline:
user_text: User's transcribed text user_text: User's transcribed text
""" """
try: try:
# Start latency tracking
self._turn_start_time = time.time()
self._first_audio_sent = False
# Get AI response (streaming)
messages = self.conversation.get_messages()
full_response = ""
@@ -495,10 +503,33 @@ class DuplexPipeline:
try:
async for chunk in self.tts_service.synthesize_stream(text):
# Check interrupt at the start of each iteration
if self._interrupt_event.is_set():
logger.debug("TTS sentence interrupted")
break
# Track and log first audio packet latency (TTFB)
if not self._first_audio_sent and self._turn_start_time:
ttfb_ms = (time.time() - self._turn_start_time) * 1000
self._first_audio_sent = True
logger.info(f"[TTFB] Server first audio packet latency: {ttfb_ms:.0f}ms (session {self.session_id})")
# Send TTFB event to client
await self.transport.send_event({
"event": "ttfb",
"trackId": self.session_id,
"timestamp": self._get_timestamp_ms(),
"latencyMs": round(ttfb_ms)
})
# Double-check interrupt right before sending audio
if self._interrupt_event.is_set():
break
await self.transport.send_audio(chunk.audio)
await asyncio.sleep(0.005)  # Small delay to prevent flooding
except asyncio.CancelledError:
logger.debug("TTS sentence cancelled")
except Exception as e:
logger.error(f"TTS sentence error: {e}")
@@ -513,6 +544,10 @@ class DuplexPipeline:
return
try:
# Start latency tracking for greeting
speak_start_time = time.time()
first_audio_sent = False
# Send track start event
await self.transport.send_event({
"event": "trackStart",
@@ -528,6 +563,20 @@ class DuplexPipeline:
logger.info("TTS interrupted by barge-in")
break
# Track and log first audio packet latency (TTFB)
if not first_audio_sent:
ttfb_ms = (time.time() - speak_start_time) * 1000
first_audio_sent = True
logger.info(f"[TTFB] Greeting first audio packet latency: {ttfb_ms:.0f}ms (session {self.session_id})")
# Send TTFB event to client
await self.transport.send_event({
"event": "ttfb",
"trackId": self.session_id,
"timestamp": self._get_timestamp_ms(),
"latencyMs": round(ttfb_ms)
})
# Send audio to client
await self.transport.send_audio(chunk.audio)
@@ -561,8 +610,17 @@ class DuplexPipeline:
self._barge_in_speech_frames = 0
self._barge_in_silence_frames = 0
# IMPORTANT: Signal interruption FIRST to stop audio sending
self._interrupt_event.set()
self._is_bot_speaking = False
# Send interrupt event to client IMMEDIATELY
# This must happen BEFORE canceling services, so client knows to discard in-flight audio
await self.transport.send_event({
"event": "interrupt",
"trackId": self.session_id,
"timestamp": self._get_timestamp_ms()
})
# Cancel TTS
if self.tts_service:
@@ -575,15 +633,7 @@ class DuplexPipeline:
# Interrupt conversation
await self.conversation.interrupt()
# Send interrupt event to client
await self.transport.send_event({
"event": "interrupt",
"trackId": self.session_id,
"timestamp": self._get_timestamp_ms()
})
# Reset for new user turn
self._is_bot_speaking = False
await self.conversation.start_user_turn()
self._audio_buffer = b""
self.eou_detector.reset()

docs/proejct_todo.md (new file, 187 lines)

@@ -0,0 +1,187 @@
# OmniSense: 12-Week Sprint Board + Tech Stack (Python Backend) — TODO
## Scope
- [ ] Build a realtime AI SaaS (OmniSense) focused on web-first audio + video with WebSocket + WebRTC endpoints
- [ ] Deliver assistant builder, tool execution, observability, evals, optional telephony later
- [ ] Keep scope aligned to 2-person team, self-hosted services
---
## Sprint Board (12 weeks, 2-week sprints)
Team assumption: 2 engineers. Scope prioritized to web-first audio + video, with BYO-SFU adapters.
### Sprint 1 (Weeks 1–2) — Realtime Core MVP (WebSocket + WebRTC Audio)
- Deliverables
- [ ] WebSocket transport: audio in/out streaming (1:1)
- [ ] WebRTC transport: audio in/out streaming (1:1)
- [ ] Adapter contract wired into runtime (transport-agnostic session core)
- [ ] ASR → LLM → TTS pipeline, streaming both directions
- [ ] Basic session state (start/stop, silence timeout)
- [ ] Transcript persistence
- Acceptance criteria
- [ ] < 1.5s median round-trip for short responses
- [ ] Stable streaming for 10+ minute session
### Sprint 2 (Weeks 3–4) — Video + Realtime UX
- Deliverables
- [ ] WebRTC video capture + streaming (assistant can “see” frames)
- [ ] WebSocket video streaming for local/dev mode
- [ ] Low-latency UI: push-to-talk, live captions, speaking indicator
- [ ] Recording + transcript storage (web sessions)
- Acceptance criteria
- [ ] Video < 2.5s end-to-end latency for analysis
- [ ] Audio quality acceptable (no clipping, jitter handling)
### Sprint 3 (Weeks 5–6) — Assistant Builder v1
- Deliverables
- [ ] Assistant schema + versioning
- [ ] UI: Model/Voice/Transcriber/Tools/Video/Transport tabs
- [ ] “Test/Chat/Talk to Assistant” (web)
- Acceptance criteria
- [ ] Create/publish assistant and run a live web session
- [ ] All config changes tracked by version
### Sprint 4 (Weeks 7–8) — Tooling + Structured Outputs
- Deliverables
- [ ] Tool registry + custom HTTP tools
- [ ] Tool auth secrets management
- [ ] Structured outputs (JSON extraction)
- Acceptance criteria
- [ ] Tool calls executed with retries/timeouts
- [ ] Structured JSON stored per call/session
### Sprint 5 (Weeks 9–10) — Observability + QA + Dev Platform
- Deliverables
- [ ] Session logs + chat logs + media logs
- [ ] Evals engine + test suites
- [ ] Basic analytics dashboard
- [ ] Public WebSocket API spec + message schema
- [ ] JS/TS SDK (connect, send audio/video, receive transcripts)
- Acceptance criteria
- [ ] Reproducible test suite runs
- [ ] Log filters by assistant/time/status
- [ ] SDK demo app runs end-to-end
### Sprint 6 (Weeks 11–12) — SaaS Hardening
- Deliverables
- [ ] Org/RBAC + API keys + rate limits
- [ ] Usage metering + credits
- [ ] Stripe billing integration
- [ ] Self-hosted DB ops (migrations, backup/restore, monitoring)
- Acceptance criteria
- [ ] Metered usage per org
- [ ] Credits decrement correctly
- [ ] Optional telephony spike documented (defer build)
- [ ] Enterprise adapter guide published (BYO-SFU)
---
## Tech Stack by Service (Self-Hosted, Web-First)
### 1) Transport Gateway (Realtime)
- [ ] WebRTC (browser) + WebSocket (lightweight/dev) protocols
- [ ] BYO-SFU adapter (enterprise) + LiveKit optional adapter + WS transport server
- [ ] Python core (FastAPI + asyncio) + Node.js mediasoup adapters when needed
- [ ] Media: Opus/VP8, jitter buffer, VAD, echo cancellation
- [ ] Storage: S3-compatible (MinIO) for recordings
### 2) ASR Service
- [ ] Whisper (self-hosted) baseline
- [ ] gRPC/WebSocket streaming transport
- [ ] Python native service
- [ ] Optional cloud provider fallback (later)
### 3) TTS Service
- [ ] Piper or Coqui TTS (self-hosted)
- [ ] gRPC/WebSocket streaming transport
- [ ] Python native service
- [ ] Redis cache for common phrases
### 4) LLM Orchestrator
- [ ] Self-hosted (vLLM + open model)
- [ ] Python (FastAPI + asyncio)
- [ ] Streaming, tool calling, JSON mode
- [ ] Safety filters + prompt templates
### 5) Assistant Config Service
- [ ] PostgreSQL
- [ ] Python (SQLAlchemy or SQLModel)
- [ ] Versioning, publish/rollback
### 6) Session Service
- [ ] PostgreSQL + Redis
- [ ] Python
- [ ] State machine, timeouts, events
### 7) Tool Execution Layer
- [ ] PostgreSQL
- [ ] Python
- [ ] Auth secret vault, retry policies, tool schemas
### 8) Observability + Logs
- [ ] Postgres (metadata), ClickHouse (logs/metrics)
- [ ] OpenSearch for search
- [ ] Prometheus + Grafana metrics
- [ ] OpenTelemetry tracing
### 9) Billing + Usage Metering
- [ ] Stripe billing
- [ ] PostgreSQL
- [ ] NATS JetStream (events) + Redis counters
### 10) Web App (Dashboard)
- [ ] React + Next.js
- [ ] Tailwind or Radix UI
- [ ] WebRTC client + WS client; adapter-based RTC integration
- [ ] ECharts/Recharts
### 11) Auth + RBAC
- [ ] Keycloak (self-hosted) or custom JWT
- [ ] Org/user/role tables in Postgres
### 12) Public WebSocket API + SDK
- [ ] WS API: versioned schema, binary audio frames + JSON control messages
- [ ] SDKs: JS/TS first, optional Python/Go clients
- [ ] Docs: quickstart, auth flow, session lifecycle, examples
---
## Infrastructure (Self-Hosted)
- [ ] Docker Compose → k3s (later)
- [ ] Redis Streams or NATS
- [ ] MinIO object store
- [ ] GitHub Actions + Helm or kustomize
- [ ] Self-hosted Postgres + pgbackrest backups
- [ ] Vault for secrets
---
## Suggested MVP Sequence
- [ ] WebRTC demo + ASR/LLM/TTS streaming
- [ ] Assistant schema + versioning (web-first)
- [ ] Video capture + multimodal analysis
- [ ] Tool execution + structured outputs
- [ ] Logs + evals + public WS API + SDK
- [ ] Telephony (optional, later)
---
## Public WebSocket API (Minimum Spec)
- [ ] Auth: API key or JWT in initial `hello` message
- [ ] Core messages: `session.start`, `session.stop`, `audio.append`, `audio.commit`, `video.append`, `transcript.delta`, `assistant.response`, `tool.call`, `tool.result`, `error`
- [ ] Binary payloads: PCM/Opus frames with metadata in control channel
- [ ] Versioning: `v1` schema with backward compatibility rules (see the client sketch below)
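To make the handshake concrete, here is a minimal client sketch for the spec above. Only the message names (`hello`, `session.start`, `audio.append`, `audio.commit`, `session.stop`, `transcript.delta`) come from the spec; the endpoint URL, port, and field names such as `type` and `apiKey` are placeholder assumptions, not a finalized schema.
```
import asyncio
import json

import websockets  # pip install websockets


async def run_session() -> None:
    # Endpoint is an assumption for local dev, not a published URL
    async with websockets.connect("ws://localhost:8000/ws") as ws:
        # Auth: API key (or JWT) rides in the initial `hello` message
        await ws.send(json.dumps({"type": "hello", "version": "v1", "apiKey": "sk-..."}))
        await ws.send(json.dumps({"type": "session.start"}))

        # Binary payload: a raw PCM frame; metadata stays on the JSON control channel
        pcm_frame = b"\x00\x00" * 320  # 20ms of 16kHz 16-bit mono silence
        await ws.send(json.dumps({"type": "audio.append"}))
        await ws.send(pcm_frame)
        await ws.send(json.dumps({"type": "audio.commit"}))

        # Server replies interleave JSON events (transcript.delta, error, ...) and audio bytes
        msg = await ws.recv()
        print(msg if isinstance(msg, str) else f"<- {len(msg)} audio bytes")

        await ws.send(json.dumps({"type": "session.stop"}))


asyncio.run(run_session())
```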
---
## Self-Hosted DB Ops Checklist
- [ ] Postgres in Docker/k3s with persistent volumes
- [ ] Migrations: `alembic` or `atlas` (see the sketch after this list)
- [ ] Backups: `pgbackrest` nightly + on-demand
- [ ] Monitoring: postgres_exporter + alerts
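For the migrations item, a minimal programmatic hook can apply pending Alembic migrations at container start-up. This sketch assumes the standard `alembic.ini` layout produced by `alembic init`; it is not tied to any file in this repo.
```
from alembic import command
from alembic.config import Config


def migrate_to_head(ini_path: str = "alembic.ini") -> None:
    """Apply all pending migrations, e.g. before the app starts serving."""
    command.upgrade(Config(ini_path), "head")


if __name__ == "__main__":
    migrate_to_head()
```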
---
## RTC Adapter Contract (BYO-SFU First)
- [ ] Keep RTC pluggable; LiveKit optional, not core dependency
- [ ] Define adapter interface (TypeScript sketch planned; a rough Python equivalent follows)
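The TODO plans a TypeScript sketch of this interface; since the codebase is Python, here is a rough Python `Protocol` equivalent of what such a contract might look like. All method names are illustrative assumptions, not an existing interface in this repo.
```
from typing import AsyncIterator, Protocol


class RTCAdapter(Protocol):
    """Transport-agnostic contract: the session core talks only to this,
    so LiveKit, mediasoup, or a BYO-SFU can be swapped in behind it."""

    async def connect(self, room: str, token: str) -> None:
        ...

    async def publish_audio(self, pcm: bytes) -> None:
        ...

    def subscribe_audio(self) -> AsyncIterator[bytes]:
        ...

    async def close(self) -> None:
        ...
```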


@@ -17,6 +17,7 @@ import argparse
import asyncio
import json
import sys
import time
import threading
import queue
from pathlib import Path
@@ -92,6 +93,14 @@ class MicrophoneClient:
# State
self.is_recording = True
self.is_playing = True
# TTFB tracking (Time to First Byte)
self.request_start_time = None
self.first_audio_received = False
# Interrupt handling - discard audio until next trackStart
self._discard_audio = False
self._audio_sequence = 0 # Track audio sequence to detect stale chunks
async def connect(self) -> None:
"""Connect to WebSocket server."""
@@ -117,6 +126,10 @@ class MicrophoneClient:
async def send_chat(self, text: str) -> None:
"""Send chat message (text input)."""
# Reset TTFB tracking for new request
self.request_start_time = time.time()
self.first_audio_received = False
await self.send_command({
"command": "chat",
"text": text
@@ -236,9 +249,21 @@ class MicrophoneClient:
# Audio data received
self.bytes_received += len(message)
# Check if we should discard this audio (after interrupt)
if self._discard_audio:
duration_ms = len(message) / (self.sample_rate * 2) * 1000
print(f"← Audio: {duration_ms:.0f}ms (DISCARDED - waiting for new track)")
continue
if self.is_playing:
self._add_audio_to_buffer(message)
# Calculate and display TTFB for first audio packet
if not self.first_audio_received and self.request_start_time:
client_ttfb_ms = (time.time() - self.request_start_time) * 1000
self.first_audio_received = True
print(f"← [TTFB] Client first audio latency: {client_ttfb_ms:.0f}ms")
# Show progress (less verbose)
with self.audio_output_lock:
buffer_ms = len(self.audio_output_buffer) / (self.sample_rate * 2) * 1000
@@ -285,20 +310,36 @@ class MicrophoneClient:
# Interim result - show with indicator (overwrite same line)
display_text = text[:60] + "..." if len(text) > 60 else text
print(f" [listening] {display_text}".ljust(80), end="\r")
elif event_type == "ttfb":
# Server-side TTFB event
latency_ms = event.get("latencyMs", 0)
print(f"← [TTFB] Server reported latency: {latency_ms}ms")
elif event_type == "trackStart":
print("← Bot started speaking")
# IMPORTANT: Accept audio again after trackStart
self._discard_audio = False
self._audio_sequence += 1
# Reset TTFB tracking for voice responses (when no chat was sent)
if self.request_start_time is None:
self.request_start_time = time.time()
self.first_audio_received = False
# Clear any old audio in buffer
with self.audio_output_lock:
self.audio_output_buffer = b""
elif event_type == "trackEnd":
print("← Bot finished speaking")
# Reset TTFB tracking after response completes
self.request_start_time = None
self.first_audio_received = False
elif event_type == "interrupt":
print("← Bot interrupted!")
# IMPORTANT: Discard all audio until next trackStart
self._discard_audio = True
# Clear audio buffer immediately
with self.audio_output_lock:
buffer_ms = len(self.audio_output_buffer) / (self.sample_rate * 2) * 1000
self.audio_output_buffer = b""
print(f" (cleared {buffer_ms:.0f}ms, discarding audio until new track)")
elif event_type == "error":
print(f"← Error: {event.get('error')}")
elif event_type == "hangup":


@@ -12,6 +12,7 @@ import argparse
import asyncio
import json
import sys
import time
import wave
import io
@@ -67,6 +68,13 @@ class SimpleVoiceClient:
# Stats
self.bytes_received = 0
# TTFB tracking (Time to First Byte)
self.request_start_time = None
self.first_audio_received = False
# Interrupt handling - discard audio until next trackStart
self._discard_audio = False
async def connect(self):
"""Connect to server."""
@@ -84,6 +92,10 @@ class SimpleVoiceClient:
async def send_chat(self, text: str):
"""Send chat message."""
# Reset TTFB tracking for new request
self.request_start_time = time.time()
self.first_audio_received = False
await self.ws.send(json.dumps({"command": "chat", "text": text}))
print(f"-> chat: {text}")
@@ -120,6 +132,18 @@ class SimpleVoiceClient:
# Audio data
self.bytes_received += len(msg)
duration_ms = len(msg) / (self.sample_rate * 2) * 1000
# Check if we should discard this audio (after interrupt)
if self._discard_audio:
print(f"<- audio: {len(msg)} bytes ({duration_ms:.0f}ms) [DISCARDED]")
continue
# Calculate and display TTFB for first audio packet
if not self.first_audio_received and self.request_start_time:
client_ttfb_ms = (time.time() - self.request_start_time) * 1000
self.first_audio_received = True
print(f"<- [TTFB] Client first audio latency: {client_ttfb_ms:.0f}ms")
print(f"<- audio: {len(msg)} bytes ({duration_ms:.0f}ms)") print(f"<- audio: {len(msg)} bytes ({duration_ms:.0f}ms)")
# Play immediately in executor to not block # Play immediately in executor to not block
@@ -138,6 +162,18 @@ class SimpleVoiceClient:
print(f"<- You said: {text}") print(f"<- You said: {text}")
else: else:
print(f"<- [listening] {text}", end="\r") print(f"<- [listening] {text}", end="\r")
elif etype == "ttfb":
# Server-side TTFB event
latency_ms = event.get("latencyMs", 0)
print(f"<- [TTFB] Server reported latency: {latency_ms}ms")
elif etype == "trackStart":
# New track starting - accept audio again
self._discard_audio = False
print(f"<- {etype}")
elif etype == "interrupt":
# Interrupt - discard audio until next trackStart
self._discard_audio = True
print(f"<- {etype} (discarding audio until new track)")
elif etype == "hangup":
print(f"<- {etype}")
self.running = False