Integrate EOU and VAD

This commit is contained in:
Xin Wang
2026-01-29 13:57:12 +08:00
parent 4cb267a288
commit cd90b4fb37
25 changed files with 2592 additions and 297 deletions

1
app/__init__.py Normal file

@@ -0,0 +1 @@
"""Active-Call Application Package"""

81
app/config.py Normal file

@@ -0,0 +1,81 @@
"""Configuration management using Pydantic settings."""
from typing import List, Optional
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
import json
class Settings(BaseSettings):
"""Application settings loaded from environment variables."""
model_config = SettingsConfigDict(
env_file=".env",
env_file_encoding="utf-8",
case_sensitive=False,
extra="ignore"
)
# Server Configuration
host: str = Field(default="0.0.0.0", description="Server host address")
port: int = Field(default=8000, description="Server port")
external_ip: Optional[str] = Field(default=None, description="External IP for NAT traversal")
# Audio Configuration
sample_rate: int = Field(default=16000, description="Audio sample rate in Hz")
chunk_size_ms: int = Field(default=20, description="Audio chunk duration in milliseconds")
default_codec: str = Field(default="pcm", description="Default audio codec")
# VAD Configuration
vad_type: str = Field(default="silero", description="VAD algorithm type")
vad_model_path: str = Field(default="data/vad/silero_vad.onnx", description="Path to VAD model")
vad_threshold: float = Field(default=0.5, description="VAD detection threshold")
vad_min_speech_duration_ms: int = Field(default=250, description="Minimum speech duration in milliseconds")
vad_eou_threshold_ms: int = Field(default=400, description="End of utterance (silence) threshold in milliseconds")
# Logging
log_level: str = Field(default="INFO", description="Logging level")
log_format: str = Field(default="json", description="Log format (json or text)")
# CORS
cors_origins: str = Field(
default='["http://localhost:3000", "http://localhost:8080"]',
description="CORS allowed origins"
)
# ICE Servers (WebRTC)
ice_servers: str = Field(
default='[{"urls": "stun:stun.l.google.com:19302"}]',
description="ICE servers configuration"
)
@property
def chunk_size_bytes(self) -> int:
"""Calculate chunk size in bytes based on sample rate and duration."""
# 16-bit (2 bytes) per sample, mono channel
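# e.g. with defaults: 16000 Hz * 2 bytes * 0.020 s = 640 bytes per chunk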
return int(self.sample_rate * 2 * (self.chunk_size_ms / 1000.0))
@property
def cors_origins_list(self) -> List[str]:
"""Parse CORS origins from JSON string."""
try:
return json.loads(self.cors_origins)
except json.JSONDecodeError:
return ["http://localhost:3000", "http://localhost:8080"]
@property
def ice_servers_list(self) -> List[dict]:
"""Parse ICE servers from JSON string."""
try:
return json.loads(self.ice_servers)
except json.JSONDecodeError:
return [{"urls": "stun:stun.l.google.com:19302"}]
# Global settings instance
settings = Settings()
def get_settings() -> Settings:
"""Get application settings instance."""
return settings
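A quick usage sketch (illustrative, not part of the commit): because Settings is a pydantic BaseSettings with case_sensitive=False, every field can be overridden via environment variables or a .env file, and the derived properties follow automatically.

from app.config import get_settings

settings = get_settings()
print(settings.sample_rate)        # 16000 by default, or whatever SAMPLE_RATE is set to
print(settings.chunk_size_bytes)   # 640 bytes for 16 kHz, 20 ms chunks
print(settings.cors_origins_list)  # parsed from the CORS_ORIGINS JSON string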

290
app/main.py Normal file

@@ -0,0 +1,290 @@
"""FastAPI application with WebSocket and WebRTC endpoints."""
import asyncio
import json
import time
import uuid
from typing import Dict
from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from loguru import logger
# Try to import aiortc (optional for WebRTC functionality)
try:
from aiortc import RTCPeerConnection, RTCSessionDescription
AIORTC_AVAILABLE = True
except ImportError:
AIORTC_AVAILABLE = False
logger.warning("aiortc not available - WebRTC endpoint will be disabled")
from app.config import settings
from core.transports import SocketTransport, WebRtcTransport
from core.session import Session
from processors.tracks import Resampled16kTrack
from core.events import get_event_bus, reset_event_bus
# Initialize FastAPI
app = FastAPI(title="Python Active-Call", version="0.1.0")
# Configure CORS
app.add_middleware(
CORSMiddleware,
allow_origins=settings.cors_origins_list,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Active sessions storage
active_sessions: Dict[str, Session] = {}
# Configure logging
logger.remove()
logger.add(
"../logs/active_call_{time}.log",
rotation="1 day",
retention="7 days",
level=settings.log_level,
format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}"
)
logger.add(
lambda msg: print(msg, end=""),
level=settings.log_level,
format="{time:HH:mm:ss} | {level: <8} | {message}"
)
@app.get("/health")
async def health_check():
"""Health check endpoint."""
return {"status": "healthy", "sessions": len(active_sessions)}
@app.get("/iceservers")
async def get_ice_servers():
"""Get ICE servers configuration for WebRTC."""
return settings.ice_servers_list
@app.get("/call/lists")
async def list_calls():
"""List all active calls."""
return {
"calls": [
{
"id": session_id,
"state": session.state,
"created_at": session.created_at
}
for session_id, session in active_sessions.items()
]
}
@app.post("/call/kill/{session_id}")
async def kill_call(session_id: str):
"""Kill a specific active call."""
if session_id not in active_sessions:
raise HTTPException(status_code=404, detail="Session not found")
session = active_sessions[session_id]
await session.cleanup()
del active_sessions[session_id]
return True
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
"""
WebSocket endpoint for raw audio streaming.
Accepts mixed text/binary frames:
- Text frames: JSON commands
- Binary frames: PCM audio data (16kHz, 16-bit, mono)
"""
await websocket.accept()
session_id = str(uuid.uuid4())
# Create transport and session
transport = SocketTransport(websocket)
session = Session(session_id, transport)
active_sessions[session_id] = session
logger.info(f"WebSocket connection established: {session_id}")
try:
# Receive loop
while True:
message = await websocket.receive()
# Handle binary audio data
if "bytes" in message:
await session.handle_audio(message["bytes"])
# Handle text commands
elif "text" in message:
await session.handle_text(message["text"])
except WebSocketDisconnect:
logger.info(f"WebSocket disconnected: {session_id}")
except Exception as e:
logger.exception(f"WebSocket error: {e}")
finally:
# Cleanup session
if session_id in active_sessions:
await session.cleanup()
del active_sessions[session_id]
logger.info(f"Session {session_id} removed")
@app.websocket("/webrtc")
async def webrtc_endpoint(websocket: WebSocket):
"""
WebRTC endpoint for WebRTC audio streaming.
Uses WebSocket for signaling (SDP exchange) and WebRTC for media transport.
"""
# Check if aiortc is available
if not AIORTC_AVAILABLE:
await websocket.close(code=1011, reason="WebRTC not available - aiortc/av not installed")
logger.warning("WebRTC connection attempted but aiortc is not available")
return
await websocket.accept()
session_id = str(uuid.uuid4())
# Create WebRTC peer connection
pc = RTCPeerConnection()
# Create transport and session
transport = WebRtcTransport(websocket, pc)
session = Session(session_id, transport)
active_sessions[session_id] = session
logger.info(f"WebRTC connection established: {session_id}")
# Track handler for incoming audio
@pc.on("track")
def on_track(track):
logger.info(f"Track received: {track.kind}")
if track.kind == "audio":
# Wrap track with resampler
wrapped_track = Resampled16kTrack(track)
# Create task to pull audio from track
async def pull_audio():
try:
while True:
frame = await wrapped_track.recv()
# Convert frame to bytes
pcm_bytes = frame.to_ndarray().tobytes()
# Feed to session
await session.handle_audio(pcm_bytes)
except Exception as e:
logger.error(f"Error pulling audio from track: {e}")
asyncio.create_task(pull_audio())
@pc.on("connectionstatechange")
async def on_connectionstatechange():
logger.info(f"Connection state: {pc.connectionState}")
if pc.connectionState in ("failed", "closed"):
await session.cleanup()
try:
# Signaling loop
while True:
message = await websocket.receive()
if "text" not in message:
continue
data = json.loads(message["text"])
# Handle SDP offer/answer
if "sdp" in data and "type" in data:
logger.info(f"Received SDP {data['type']}")
# Set remote description
offer = RTCSessionDescription(sdp=data["sdp"], type=data["type"])
await pc.setRemoteDescription(offer)
# Create and set local description
if data["type"] == "offer":
answer = await pc.createAnswer()
await pc.setLocalDescription(answer)
# Send answer back
await websocket.send_text(json.dumps({
"event": "answer",
"trackId": session_id,
"timestamp": int(asyncio.get_event_loop().time() * 1000),
"sdp": pc.localDescription.sdp
}))
logger.info(f"Sent SDP answer")
else:
# Handle other commands
await session.handle_text(message["text"])
except WebSocketDisconnect:
logger.info(f"WebRTC WebSocket disconnected: {session_id}")
except Exception as e:
logger.exception(f"WebRTC error: {e}")
finally:
# Cleanup
await pc.close()
if session_id in active_sessions:
await session.cleanup()
del active_sessions[session_id]
logger.info(f"WebRTC session {session_id} removed")
@app.on_event("startup")
async def startup_event():
"""Run on application startup."""
logger.info("Starting Python Active-Call server")
logger.info(f"Server: {settings.host}:{settings.port}")
logger.info(f"Sample rate: {settings.sample_rate} Hz")
logger.info(f"VAD model: {settings.vad_model_path}")
@app.on_event("shutdown")
async def shutdown_event():
"""Run on application shutdown."""
logger.info("Shutting down Python Active-Call server")
# Cleanup all sessions
for session in list(active_sessions.values()):
await session.cleanup()
active_sessions.clear()
# Close event bus
event_bus = get_event_bus()
await event_bus.close()
reset_event_bus()
logger.info("Server shutdown complete")
if __name__ == "__main__":
import os
import uvicorn
# Ensure the logs directory exists for the file sink
os.makedirs("logs", exist_ok=True)
# Run server
uvicorn.run(
"app.main:app",
host=settings.host,
port=settings.port,
reload=True,
log_level=settings.log_level.lower()
)
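A quick way to exercise these endpoints by hand (a sketch, assuming the server is running locally on the default port; aiohttp is already used by the example client further down):

import asyncio
import aiohttp

async def smoke_test():
    async with aiohttp.ClientSession() as http:
        async with http.get("http://localhost:8000/health") as resp:
            print(await resp.json())  # e.g. {"status": "healthy", "sessions": 0}
        async with http.ws_connect("ws://localhost:8000/ws") as ws:
            await ws.send_json({"command": "invite", "option": {"codec": "pcm"}})
            print(await ws.receive_json())  # the "answer" event from the invite handler

asyncio.run(smoke_test())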

1
core/__init__.py Normal file

@@ -0,0 +1 @@
"""Core Components Package"""

134
core/events.py Normal file

@@ -0,0 +1,134 @@
"""Event bus for pub/sub communication between components."""
import asyncio
from typing import Callable, Dict, List, Any, Optional
from collections import defaultdict
from loguru import logger
class EventBus:
"""
Async event bus for pub/sub communication.
Similar to the original Rust implementation's broadcast channel.
Components can subscribe to specific event types and receive events asynchronously.
"""
def __init__(self):
"""Initialize the event bus."""
self._subscribers: Dict[str, List[Callable]] = defaultdict(list)
self._lock = asyncio.Lock()
self._running = True
def subscribe(self, event_type: str, callback: Callable[[Dict[str, Any]], None]) -> None:
"""
Subscribe to an event type.
Args:
event_type: Type of event to subscribe to (e.g., "speaking", "silence")
callback: Async callback function that receives event data
"""
if not self._running:
logger.warning(f"Event bus is shut down, ignoring subscription to {event_type}")
return
self._subscribers[event_type].append(callback)
logger.debug(f"Subscribed to event type: {event_type}")
def unsubscribe(self, event_type: str, callback: Callable[[Dict[str, Any]], None]) -> None:
"""
Unsubscribe from an event type.
Args:
event_type: Type of event to unsubscribe from
callback: Callback function to remove
"""
if callback in self._subscribers[event_type]:
self._subscribers[event_type].remove(callback)
logger.debug(f"Unsubscribed from event type: {event_type}")
async def publish(self, event_type: str, event_data: Dict[str, Any]) -> None:
"""
Publish an event to all subscribers.
Args:
event_type: Type of event to publish
event_data: Event data to send to subscribers
"""
if not self._running:
logger.warning(f"Event bus is shut down, ignoring event: {event_type}")
return
# Get subscribers for this event type
subscribers = self._subscribers.get(event_type, [])
if not subscribers:
logger.debug(f"No subscribers for event type: {event_type}")
return
# Notify all subscribers concurrently
tasks = []
for callback in subscribers:
try:
# Create task for each subscriber
task = asyncio.create_task(self._call_subscriber(callback, event_data))
tasks.append(task)
except Exception as e:
logger.error(f"Error creating task for subscriber: {e}")
# Wait for all subscribers to complete
if tasks:
await asyncio.gather(*tasks, return_exceptions=True)
logger.debug(f"Published event '{event_type}' to {len(tasks)} subscribers")
async def _call_subscriber(self, callback: Callable[[Dict[str, Any]], None], event_data: Dict[str, Any]) -> None:
"""
Call a subscriber callback with error handling.
Args:
callback: Subscriber callback function
event_data: Event data to pass to callback
"""
try:
# Check if callback is a coroutine function
if asyncio.iscoroutinefunction(callback):
await callback(event_data)
else:
callback(event_data)
except Exception as e:
logger.exception(f"Error in subscriber callback: {e}")
async def close(self) -> None:
"""Close the event bus and stop processing events."""
self._running = False
self._subscribers.clear()
logger.info("Event bus closed")
@property
def is_running(self) -> bool:
"""Check if the event bus is running."""
return self._running
# Global event bus instance
_event_bus: Optional[EventBus] = None
def get_event_bus() -> EventBus:
"""
Get the global event bus instance.
Returns:
EventBus instance
"""
global _event_bus
if _event_bus is None:
_event_bus = EventBus()
return _event_bus
def reset_event_bus() -> None:
"""Reset the global event bus (mainly for testing)."""
global _event_bus
_event_bus = None
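A minimal pub/sub sketch (illustrative, not part of the commit) showing how a component would consume VAD events from the bus; coroutine callbacks are awaited, plain callables are invoked directly:

import asyncio
from core.events import get_event_bus, reset_event_bus

async def demo():
    bus = get_event_bus()

    async def on_speaking(event: dict) -> None:
        print("speaking on track", event["trackId"])

    bus.subscribe("speaking", on_speaking)
    await bus.publish("speaking", {"trackId": "demo", "probability": 0.92})
    await bus.close()
    reset_event_bus()

asyncio.run(demo())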

131
core/pipeline.py Normal file

@@ -0,0 +1,131 @@
"""Audio processing pipeline."""
import asyncio
import time
from loguru import logger
from core.transports import BaseTransport
from core.events import get_event_bus
from processors.vad import VADProcessor, SileroVAD
from app.config import settings
class AudioPipeline:
"""
Audio processing pipeline.
Processes incoming audio through VAD and emits events.
"""
def __init__(self, transport: BaseTransport, session_id: str):
"""
Initialize audio pipeline.
Args:
transport: Transport instance for sending events/audio
session_id: Session identifier for event tracking
"""
self.transport = transport
self.session_id = session_id
self.event_bus = get_event_bus()
# Initialize VAD
self.vad_model = SileroVAD(
model_path=settings.vad_model_path,
sample_rate=settings.sample_rate
)
self.vad_processor = VADProcessor(
vad_model=self.vad_model,
threshold=settings.vad_threshold,
silence_threshold_ms=settings.vad_eou_threshold_ms,
min_speech_duration_ms=settings.vad_min_speech_duration_ms
)
# State
self.is_bot_speaking = False
self.interrupt_signal = asyncio.Event()
self._running = True
logger.info(f"Audio pipeline initialized for session {session_id}")
async def process_input(self, pcm_bytes: bytes) -> None:
"""
Process incoming audio chunk.
Args:
pcm_bytes: PCM audio data (16-bit, mono, 16kHz)
"""
if not self._running:
return
try:
# Process through VAD
result = self.vad_processor.process(pcm_bytes, settings.chunk_size_ms)
if result:
event_type, probability = result
# Emit event through event bus
await self.event_bus.publish(event_type, {
"trackId": self.session_id,
"probability": probability
})
# Send event to client
if event_type == "speaking":
logger.info(f"User speaking started (session {self.session_id})")
await self.transport.send_event({
"event": "speaking",
"trackId": self.session_id,
"timestamp": self._get_timestamp_ms(),
"startTime": self._get_timestamp_ms()
})
elif event_type == "silence":
logger.info(f"User speaking stopped (session {self.session_id})")
await self.transport.send_event({
"event": "silence",
"trackId": self.session_id,
"timestamp": self._get_timestamp_ms(),
"startTime": self._get_timestamp_ms(),
"duration": 0 # TODO: Calculate actual duration
})
elif event_type == "eou":
logger.info(f"EOU detected (session {self.session_id})")
await self.transport.send_event({
"event": "eou",
"trackId": self.session_id,
"timestamp": self._get_timestamp_ms()
})
except Exception as e:
logger.exception(f"Pipeline processing error: {e}")
async def process_text_input(self, text: str) -> None:
"""
Process text input (chat command).
Args:
text: Text input
"""
logger.info(f"Processing text input: {text[:50]}...")
# TODO: Implement text processing (LLM integration, etc.)
# For now, just log it
async def interrupt(self) -> None:
"""Interrupt current audio playback."""
if self.is_bot_speaking:
self.interrupt_signal.set()
logger.info(f"Pipeline interrupted for session {self.session_id}")
async def cleanup(self) -> None:
"""Cleanup pipeline resources."""
logger.info(f"Cleaning up pipeline for session {self.session_id}")
self._running = False
self.interrupt_signal.set()
def _get_timestamp_ms(self) -> int:
"""Get current timestamp in milliseconds."""
return int(time.time() * 1000)

266
core/session.py Normal file

@@ -0,0 +1,266 @@
"""Session management for active calls."""
import uuid
import json
import time
from typing import Optional, Dict, Any
from loguru import logger
from core.transports import BaseTransport
from core.pipeline import AudioPipeline
from models.commands import parse_command, TTSCommand, ChatCommand, InterruptCommand, HangupCommand
class Session:
"""
Manages a single call session.
Handles command routing, audio processing, and session lifecycle.
"""
def __init__(self, session_id: str, transport: BaseTransport):
"""
Initialize session.
Args:
session_id: Unique session identifier
transport: Transport instance for communication
"""
self.id = session_id
self.transport = transport
self.pipeline = AudioPipeline(transport, session_id)
# Session state
self.created_at = self._get_timestamp_ms()
self.state = "created" # created, invited, accepted, rejected, ringing, hungup
# Track IDs
self.current_track_id: Optional[str] = str(uuid.uuid4())
logger.info(f"Session {self.id} created")
async def handle_text(self, text_data: str) -> None:
"""
Handle incoming text data (JSON commands).
Args:
text_data: JSON text data
"""
try:
data = json.loads(text_data)
command = parse_command(data)
command_type = command.command
logger.info(f"Session {self.id} received command: {command_type}")
# Route command to appropriate handler
if command_type == "invite":
await self._handle_invite(data)
elif command_type == "accept":
await self._handle_accept(data)
elif command_type == "reject":
await self._handle_reject(data)
elif command_type == "ringing":
await self._handle_ringing(data)
elif command_type == "tts":
await self._handle_tts(command)
elif command_type == "play":
await self._handle_play(data)
elif command_type == "interrupt":
await self._handle_interrupt(command)
elif command_type == "pause":
await self._handle_pause()
elif command_type == "resume":
await self._handle_resume()
elif command_type == "hangup":
await self._handle_hangup(command)
elif command_type == "history":
await self._handle_history(data)
elif command_type == "chat":
await self._handle_chat(command)
else:
logger.warning(f"Session {self.id} unknown command: {command_type}")
except json.JSONDecodeError as e:
logger.error(f"Session {self.id} JSON decode error: {e}")
await self._send_error("client", f"Invalid JSON: {e}")
except ValueError as e:
logger.error(f"Session {self.id} command parse error: {e}")
await self._send_error("client", f"Invalid command: {e}")
except Exception as e:
logger.error(f"Session {self.id} handle_text error: {e}", exc_info=True)
await self._send_error("server", f"Internal error: {e}")
async def handle_audio(self, audio_bytes: bytes) -> None:
"""
Handle incoming audio data.
Args:
audio_bytes: PCM audio data
"""
try:
await self.pipeline.process_input(audio_bytes)
except Exception as e:
logger.exception(f"Session {self.id} handle_audio error: {e}")
async def _handle_invite(self, data: Dict[str, Any]) -> None:
"""Handle invite command."""
self.state = "invited"
option = data.get("option", {})
# Send answer event
await self.transport.send_event({
"event": "answer",
"trackId": self.current_track_id,
"timestamp": self._get_timestamp_ms()
})
logger.info(f"Session {self.id} invited with codec: {option.get('codec', 'pcm')}")
async def _handle_accept(self, data: Dict[str, Any]) -> None:
"""Handle accept command."""
self.state = "accepted"
logger.info(f"Session {self.id} accepted")
async def _handle_reject(self, data: Dict[str, Any]) -> None:
"""Handle reject command."""
self.state = "rejected"
reason = data.get("reason", "Rejected")
logger.info(f"Session {self.id} rejected: {reason}")
async def _handle_ringing(self, data: Dict[str, Any]) -> None:
"""Handle ringing command."""
self.state = "ringing"
logger.info(f"Session {self.id} ringing")
async def _handle_tts(self, command: TTSCommand) -> None:
"""Handle TTS command."""
logger.info(f"Session {self.id} TTS: {command.text[:50]}...")
# Send track start event
await self.transport.send_event({
"event": "trackStart",
"trackId": self.current_track_id,
"timestamp": self._get_timestamp_ms(),
"playId": command.play_id
})
# TODO: Implement actual TTS synthesis
# For now, just send track end event
await self.transport.send_event({
"event": "trackEnd",
"trackId": self.current_track_id,
"timestamp": self._get_timestamp_ms(),
"duration": 1000,
"ssrc": 0,
"playId": command.play_id
})
async def _handle_play(self, data: Dict[str, Any]) -> None:
"""Handle play command."""
url = data.get("url", "")
logger.info(f"Session {self.id} play: {url}")
# Send track start event
await self.transport.send_event({
"event": "trackStart",
"trackId": self.current_track_id,
"timestamp": self._get_timestamp_ms(),
"playId": url
})
# TODO: Implement actual audio playback
# For now, just send track end event
await self.transport.send_event({
"event": "trackEnd",
"trackId": self.current_track_id,
"timestamp": self._get_timestamp_ms(),
"duration": 1000,
"ssrc": 0,
"playId": url
})
async def _handle_interrupt(self, command: InterruptCommand) -> None:
"""Handle interrupt command."""
if command.graceful:
logger.info(f"Session {self.id} graceful interrupt")
else:
logger.info(f"Session {self.id} immediate interrupt")
await self.pipeline.interrupt()
async def _handle_pause(self) -> None:
"""Handle pause command."""
logger.info(f"Session {self.id} paused")
async def _handle_resume(self) -> None:
"""Handle resume command."""
logger.info(f"Session {self.id} resumed")
async def _handle_hangup(self, command: HangupCommand) -> None:
"""Handle hangup command."""
self.state = "hungup"
reason = command.reason or "User requested"
logger.info(f"Session {self.id} hung up: {reason}")
# Send hangup event
await self.transport.send_event({
"event": "hangup",
"timestamp": self._get_timestamp_ms(),
"reason": reason,
"initiator": command.initiator or "user"
})
# Close transport
await self.transport.close()
async def _handle_history(self, data: Dict[str, Any]) -> None:
"""Handle history command."""
speaker = data.get("speaker", "unknown")
text = data.get("text", "")
logger.info(f"Session {self.id} history [{speaker}]: {text[:50]}...")
async def _handle_chat(self, command: ChatCommand) -> None:
"""Handle chat command."""
logger.info(f"Session {self.id} chat: {command.text[:50]}...")
# Process text input through pipeline
await self.pipeline.process_text_input(command.text)
async def _send_error(self, sender: str, error_message: str) -> None:
"""
Send error event to client.
Args:
sender: Component that generated the error
error_message: Error message
"""
await self.transport.send_event({
"event": "error",
"trackId": self.current_track_id,
"timestamp": self._get_timestamp_ms(),
"sender": sender,
"error": error_message
})
def _get_timestamp_ms(self) -> int:
"""Get current timestamp in milliseconds."""
return int(time.time() * 1000)
async def cleanup(self) -> None:
"""Cleanup session resources."""
logger.info(f"Session {self.id} cleaning up")
await self.pipeline.cleanup()
await self.transport.close()

207
core/transports.py Normal file

@@ -0,0 +1,207 @@
"""Transport layer for WebSocket and WebRTC communication."""
import asyncio
import json
from abc import ABC, abstractmethod
from typing import Optional
from fastapi import WebSocket
from loguru import logger
# Try to import aiortc (optional for WebRTC functionality)
try:
from aiortc import RTCPeerConnection
AIORTC_AVAILABLE = True
except ImportError:
AIORTC_AVAILABLE = False
RTCPeerConnection = None # Type hint placeholder
class BaseTransport(ABC):
"""
Abstract base class for transports.
All transports must implement send_event and send_audio methods.
"""
@abstractmethod
async def send_event(self, event: dict) -> None:
"""
Send a JSON event to the client.
Args:
event: Event data as dictionary
"""
pass
@abstractmethod
async def send_audio(self, pcm_bytes: bytes) -> None:
"""
Send audio data to the client.
Args:
pcm_bytes: PCM audio data (16-bit, mono, 16kHz)
"""
pass
@abstractmethod
async def close(self) -> None:
"""Close the transport and cleanup resources."""
pass
class SocketTransport(BaseTransport):
"""
WebSocket transport for raw audio streaming.
Handles mixed text/binary frames over WebSocket connection.
Uses asyncio.Lock to prevent frame interleaving.
"""
def __init__(self, websocket: WebSocket):
"""
Initialize WebSocket transport.
Args:
websocket: FastAPI WebSocket connection
"""
self.ws = websocket
self.lock = asyncio.Lock() # Prevent frame interleaving
self._closed = False
async def send_event(self, event: dict) -> None:
"""
Send a JSON event via WebSocket.
Args:
event: Event data as dictionary
"""
if self._closed:
logger.warning("Attempted to send event on closed transport")
return
async with self.lock:
try:
await self.ws.send_text(json.dumps(event))
logger.debug(f"Sent event: {event.get('event', 'unknown')}")
except Exception as e:
logger.error(f"Error sending event: {e}")
self._closed = True
async def send_audio(self, pcm_bytes: bytes) -> None:
"""
Send PCM audio data via WebSocket.
Args:
pcm_bytes: PCM audio data (16-bit, mono, 16kHz)
"""
if self._closed:
logger.warning("Attempted to send audio on closed transport")
return
async with self.lock:
try:
await self.ws.send_bytes(pcm_bytes)
except Exception as e:
logger.error(f"Error sending audio: {e}")
self._closed = True
async def close(self) -> None:
"""Close the WebSocket connection."""
self._closed = True
try:
await self.ws.close()
except Exception as e:
logger.error(f"Error closing WebSocket: {e}")
@property
def is_closed(self) -> bool:
"""Check if the transport is closed."""
return self._closed
class WebRtcTransport(BaseTransport):
"""
WebRTC transport for WebRTC audio streaming.
Uses WebSocket for signaling and RTCPeerConnection for media.
"""
def __init__(self, websocket: WebSocket, pc):
"""
Initialize WebRTC transport.
Args:
websocket: FastAPI WebSocket connection for signaling
pc: RTCPeerConnection for media transport
"""
if not AIORTC_AVAILABLE:
raise RuntimeError("aiortc is not available - WebRTC transport cannot be used")
self.ws = websocket
self.pc = pc
self.outbound_track = None # MediaStreamTrack for outbound audio
self._closed = False
async def send_event(self, event: dict) -> None:
"""
Send a JSON event via WebSocket signaling.
Args:
event: Event data as dictionary
"""
if self._closed:
logger.warning("Attempted to send event on closed transport")
return
try:
await self.ws.send_text(json.dumps(event))
logger.debug(f"Sent event: {event.get('event', 'unknown')}")
except Exception as e:
logger.error(f"Error sending event: {e}")
self._closed = True
async def send_audio(self, pcm_bytes: bytes) -> None:
"""
Send audio data via WebRTC track.
Note: In WebRTC, you don't send bytes directly. You push frames
to a MediaStreamTrack that the peer connection is reading.
Args:
pcm_bytes: PCM audio data (16-bit, mono, 16kHz)
"""
if self._closed:
logger.warning("Attempted to send audio on closed transport")
return
# This would require a custom MediaStreamTrack implementation
# For now, we'll log this as a placeholder
logger.debug(f"Audio bytes queued for WebRTC track: {len(pcm_bytes)} bytes")
# TODO: Implement outbound audio track if needed
# if self.outbound_track:
# await self.outbound_track.add_frame(pcm_bytes)
async def close(self) -> None:
"""Close the WebRTC connection."""
self._closed = True
try:
await self.pc.close()
await self.ws.close()
except Exception as e:
logger.error(f"Error closing WebRTC transport: {e}")
@property
def is_closed(self) -> bool:
"""Check if the transport is closed."""
return self._closed
def set_outbound_track(self, track):
"""
Set the outbound audio track for sending audio to client.
Args:
track: MediaStreamTrack for outbound audio
"""
self.outbound_track = track
logger.debug("Set outbound track for WebRTC transport")

BIN
data/vad/silero_vad.onnx Normal file

Binary file not shown.

137
examples/mic_client.py Normal file

@@ -0,0 +1,137 @@
"""
Microphone WebSocket Client
Connects to the backend WebSocket endpoint and streams audio from the microphone.
Used to test VAD and EOU detection.
Dependencies:
pip install pyaudio aiohttp
"""
import asyncio
import aiohttp
import pyaudio
import json
from datetime import datetime
# Configuration
SERVER_URL = "ws://localhost:8000/ws"
SAMPLE_RATE = 16000
CHANNELS = 1
CHUNK_DURATION_MS = 20
CHUNK_SIZE = int(SAMPLE_RATE * (CHUNK_DURATION_MS / 1000.0)) # 320 samples for 20ms
FORMAT = pyaudio.paInt16
async def send_audio_loop(ws, stream):
"""Read from microphone and send to WebSocket."""
print("🎙️ Microphone streaming started...")
try:
while True:
# PyAudio's read() is blocking, so run it in an executor
# to avoid stalling the event loop between 20 ms chunks.
data = await asyncio.get_event_loop().run_in_executor(
None, lambda: stream.read(CHUNK_SIZE, exception_on_overflow=False)
)
await ws.send_bytes(data)
# No sleep needed here as microphone dictates the timing
except Exception as e:
print(f"❌ Error in send loop: {e}")
async def receive_loop(ws):
"""Listen for VAD/EOU events."""
print("👂 Listening for server events...")
async for msg in ws:
timestamp = datetime.now().strftime("%H:%M:%S.%f")[:-3]
if msg.type == aiohttp.WSMsgType.TEXT:
try:
data = json.loads(msg.data)
event = data.get('event')
# Highlight VAD/EOU events
if event == 'speaking':
print(f"[{timestamp}] 🗣️ SPEAKING STARTED")
elif event == 'silence':
print(f"[{timestamp}] 🤫 SILENCE DETECTED")
elif event == 'eou':
print(f"[{timestamp}] ✅ END OF UTTERANCE (EOU)")
elif event == 'error':
print(f"[{timestamp}] ❌ ERROR: {data.get('error')}")
else:
print(f"[{timestamp}] 📩 {event}: {str(data)[:100]}")
except json.JSONDecodeError:
print(f"[{timestamp}] 📄 Text: {msg.data}")
elif msg.type == aiohttp.WSMsgType.CLOSED:
print("❌ Connection closed")
break
elif msg.type == aiohttp.WSMsgType.ERROR:
print("❌ Connection error")
break
async def main():
p = pyaudio.PyAudio()
# Check for input devices
info = p.get_host_api_info_by_index(0)
numdevices = info.get('deviceCount')
if numdevices == 0:
print("❌ No audio input devices found")
return
# Open microphone stream
try:
stream = p.open(format=FORMAT,
channels=CHANNELS,
rate=SAMPLE_RATE,
input=True,
frames_per_buffer=CHUNK_SIZE)
except Exception as e:
print(f"❌ Failed to open microphone: {e}")
return
session = aiohttp.ClientSession()
try:
print(f"🔌 Connecting to {SERVER_URL}...")
async with session.ws_connect(SERVER_URL) as ws:
print("✅ Connected!")
# 1. Send Invite
invite_msg = {
"command": "invite",
"option": {
"codec": "pcm",
"samplerate": SAMPLE_RATE
}
}
await ws.send_json(invite_msg)
print("📤 Sent Invite")
# 2. Run loops
await asyncio.gather(
receive_loop(ws),
send_audio_loop(ws, stream)
)
except aiohttp.ClientConnectorError:
print(f"❌ Failed to connect to {SERVER_URL}. Is the server running?")
except KeyboardInterrupt:
print("\n👋 Stopping...")
finally:
stream.stop_stream()
stream.close()
p.terminate()
await session.close()
if __name__ == "__main__":
try:
asyncio.run(main())
except KeyboardInterrupt:
pass

137
main.py

@@ -1,137 +0,0 @@
"""
Step 1: Minimal WebSocket Echo Server
This is the simplest possible WebSocket audio server.
It accepts connections and echoes back events.
What you'll learn:
- How to create a FastAPI WebSocket endpoint
- How to handle mixed text/binary frames
- Basic event sending
Test with:
python main.py
python test_client.py
"""
import asyncio
import json
import uuid
from fastapi import FastAPI, WebSocket
from loguru import logger
# Configure logging
logger.remove()
logger.add(lambda msg: print(msg, end=""), level="INFO", format="<green>{time:HH:mm:ss}</green> | {level} | {message}")
# Create FastAPI app
app = FastAPI(title="Voice Gateway - Step 1")
@app.get("/health")
async def health_check():
"""Health check endpoint."""
return {"status": "healthy", "step": "1_minimal_echo"}
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
"""
WebSocket endpoint for audio streaming.
This is a minimal echo server that:
1. Accepts WebSocket connections
2. Sends a welcome event
3. Receives text commands and binary audio
4. Echoes speaking events back
"""
await websocket.accept()
# Generate unique session ID
session_id = str(uuid.uuid4())
logger.info(f"[{session_id}] Client connected")
try:
# Send welcome event (answer)
await websocket.send_json({
"event": "answer",
"trackId": session_id,
"timestamp": _get_timestamp_ms()
})
logger.info(f"[{session_id}] Sent answer event")
# Message receive loop
while True:
message = await websocket.receive()
# Handle binary audio data
if "bytes" in message:
audio_bytes = message["bytes"]
logger.info(f"[{session_id}] Received audio: {len(audio_bytes)} bytes")
# Send speaking event (echo back)
await websocket.send_json({
"event": "speaking",
"trackId": session_id,
"timestamp": _get_timestamp_ms(),
"startTime": _get_timestamp_ms()
})
# Handle text commands
elif "text" in message:
text_data = message["text"]
logger.info(f"[{session_id}] Received text: {text_data[:100]}...")
try:
data = json.loads(text_data)
command = data.get("command", "unknown")
logger.info(f"[{session_id}] Command: {command}")
# Handle basic commands
if command == "invite":
await websocket.send_json({
"event": "answer",
"trackId": session_id,
"timestamp": _get_timestamp_ms()
})
logger.info(f"[{session_id}] Responded to invite")
elif command == "hangup":
logger.info(f"[{session_id}] Hangup requested")
break
elif command == "ping":
await websocket.send_json({
"event": "pong",
"timestamp": _get_timestamp_ms()
})
except json.JSONDecodeError as e:
logger.error(f"[{session_id}] Invalid JSON: {e}")
except Exception as e:
logger.error(f"[{session_id}] Error: {e}")
finally:
logger.info(f"[{session_id}] Connection closed")
def _get_timestamp_ms() -> int:
"""Get current timestamp in milliseconds."""
import time
return int(time.time() * 1000)
if __name__ == "__main__":
import uvicorn
logger.info("🚀 Starting Step 1: Minimal WebSocket Echo Server")
logger.info("📡 Server: ws://localhost:8000/ws")
logger.info("🩺 Health: http://localhost:8000/health")
uvicorn.run(
app,
host="0.0.0.0",
port=8000,
log_level="info"
)

1
models/__init__.py Normal file

@@ -0,0 +1 @@
"""Data Models Package"""

143
models/commands.py Normal file

@@ -0,0 +1,143 @@
"""Protocol command models matching the original active-call API."""
from typing import Optional, Dict, Any
from pydantic import BaseModel, Field
class InviteCommand(BaseModel):
"""Invite command to initiate a call."""
command: str = Field(default="invite", description="Command type")
option: Optional[Dict[str, Any]] = Field(default=None, description="Call configuration options")
class AcceptCommand(BaseModel):
"""Accept command to accept an incoming call."""
command: str = Field(default="accept", description="Command type")
option: Optional[Dict[str, Any]] = Field(default=None, description="Call configuration options")
class RejectCommand(BaseModel):
"""Reject command to reject an incoming call."""
command: str = Field(default="reject", description="Command type")
reason: str = Field(default="", description="Reason for rejection")
code: Optional[int] = Field(default=None, description="SIP response code")
class RingingCommand(BaseModel):
"""Ringing command to send ringing response."""
command: str = Field(default="ringing", description="Command type")
recorder: Optional[Dict[str, Any]] = Field(default=None, description="Call recording configuration")
early_media: bool = Field(default=False, description="Enable early media")
ringtone: Optional[str] = Field(default=None, description="Custom ringtone URL")
class TTSCommand(BaseModel):
"""TTS command to convert text to speech."""
command: str = Field(default="tts", description="Command type")
text: str = Field(..., description="Text to synthesize")
speaker: Optional[str] = Field(default=None, description="Speaker voice name")
play_id: Optional[str] = Field(default=None, description="Unique identifier for this TTS session")
auto_hangup: bool = Field(default=False, description="Auto hangup after TTS completion")
streaming: bool = Field(default=False, description="Streaming text input")
end_of_stream: bool = Field(default=False, description="End of streaming input")
wait_input_timeout: Optional[int] = Field(default=None, description="Max time to wait for input (seconds)")
option: Optional[Dict[str, Any]] = Field(default=None, description="TTS provider specific options")
class PlayCommand(BaseModel):
"""Play command to play audio from URL."""
command: str = Field(default="play", description="Command type")
url: str = Field(..., description="URL of audio file to play")
auto_hangup: bool = Field(default=False, description="Auto hangup after playback")
wait_input_timeout: Optional[int] = Field(default=None, description="Max time to wait for input (seconds)")
class InterruptCommand(BaseModel):
"""Interrupt command to interrupt current playback."""
command: str = Field(default="interrupt", description="Command type")
graceful: bool = Field(default=False, description="Wait for current TTS to complete")
class PauseCommand(BaseModel):
"""Pause command to pause current playback."""
command: str = Field(default="pause", description="Command type")
class ResumeCommand(BaseModel):
"""Resume command to resume paused playback."""
command: str = Field(default="resume", description="Command type")
class HangupCommand(BaseModel):
"""Hangup command to end the call."""
command: str = Field(default="hangup", description="Command type")
reason: Optional[str] = Field(default=None, description="Reason for hangup")
initiator: Optional[str] = Field(default=None, description="Who initiated the hangup")
class HistoryCommand(BaseModel):
"""History command to add conversation history."""
command: str = Field(default="history", description="Command type")
speaker: str = Field(..., description="Speaker identifier")
text: str = Field(..., description="Conversation text")
class ChatCommand(BaseModel):
"""Chat command for text-based conversation."""
command: str = Field(default="chat", description="Command type")
text: str = Field(..., description="Chat text message")
# Command type mapping
COMMAND_TYPES = {
"invite": InviteCommand,
"accept": AcceptCommand,
"reject": RejectCommand,
"ringing": RingingCommand,
"tts": TTSCommand,
"play": PlayCommand,
"interrupt": InterruptCommand,
"pause": PauseCommand,
"resume": ResumeCommand,
"hangup": HangupCommand,
"history": HistoryCommand,
"chat": ChatCommand,
}
def parse_command(data: Dict[str, Any]) -> BaseModel:
"""
Parse a command from JSON data.
Args:
data: JSON data as dictionary
Returns:
Parsed command model
Raises:
ValueError: If command type is unknown
"""
command_type = data.get("command")
if not command_type:
raise ValueError("Missing 'command' field")
command_class = COMMAND_TYPES.get(command_type)
if not command_class:
raise ValueError(f"Unknown command type: {command_type}")
return command_class(**data)
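Round-tripping a command through the parser (a sketch): known commands come back as their typed models, anything else raises ValueError for the session to report as an error event.

from models.commands import parse_command, TTSCommand

cmd = parse_command({"command": "tts", "text": "Hello there", "play_id": "p1"})
assert isinstance(cmd, TTSCommand)
print(cmd.text, cmd.play_id)  # Hello there p1

try:
    parse_command({"command": "teleport"})
except ValueError as exc:
    print(exc)  # Unknown command type: teleport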

126
models/config.py Normal file

@@ -0,0 +1,126 @@
"""Configuration models for call options."""
from typing import Optional, Dict, Any, List
from pydantic import BaseModel, ConfigDict, Field
class VADOption(BaseModel):
"""Voice Activity Detection configuration."""
type: str = Field(default="silero", description="VAD algorithm type (silero, webrtc)")
samplerate: int = Field(default=16000, description="Audio sample rate for VAD")
speech_padding: int = Field(default=250, description="Speech padding in milliseconds")
silence_padding: int = Field(default=100, description="Silence padding in milliseconds")
ratio: float = Field(default=0.5, description="Voice detection ratio threshold")
voice_threshold: float = Field(default=0.5, description="Voice energy threshold")
max_buffer_duration_secs: int = Field(default=50, description="Maximum buffer duration in seconds")
silence_timeout: Optional[int] = Field(default=None, description="Silence timeout in milliseconds")
endpoint: Optional[str] = Field(default=None, description="Custom VAD service endpoint")
secret_key: Optional[str] = Field(default=None, description="VAD service secret key")
secret_id: Optional[str] = Field(default=None, description="VAD service secret ID")
class ASROption(BaseModel):
"""Automatic Speech Recognition configuration."""
provider: str = Field(..., description="ASR provider (tencent, aliyun, openai, etc.)")
language: Optional[str] = Field(default=None, description="Language code (zh-CN, en-US)")
app_id: Optional[str] = Field(default=None, description="Application ID")
secret_id: Optional[str] = Field(default=None, description="Secret ID for authentication")
secret_key: Optional[str] = Field(default=None, description="Secret key for authentication")
model_type: Optional[str] = Field(default=None, description="ASR model type (16k_zh, 8k_en)")
buffer_size: Optional[int] = Field(default=None, description="Audio buffer size in bytes")
samplerate: Optional[int] = Field(default=None, description="Audio sample rate")
endpoint: Optional[str] = Field(default=None, description="Custom ASR service endpoint")
extra: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters")
start_when_answer: bool = Field(default=False, description="Start ASR when call is answered")
class TTSOption(BaseModel):
"""Text-to-Speech configuration."""
samplerate: Optional[int] = Field(default=None, description="TTS output sample rate")
provider: str = Field(default="msedge", description="TTS provider (tencent, aliyun, deepgram, msedge)")
speed: float = Field(default=1.0, description="Speech speed multiplier")
app_id: Optional[str] = Field(default=None, description="Application ID")
secret_id: Optional[str] = Field(default=None, description="Secret ID for authentication")
secret_key: Optional[str] = Field(default=None, description="Secret key for authentication")
volume: Optional[int] = Field(default=None, description="Speech volume level (1-10)")
speaker: Optional[str] = Field(default=None, description="Voice speaker name")
codec: Optional[str] = Field(default=None, description="Audio codec")
subtitle: bool = Field(default=False, description="Enable subtitle generation")
emotion: Optional[str] = Field(default=None, description="Speech emotion")
endpoint: Optional[str] = Field(default=None, description="Custom TTS service endpoint")
extra: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters")
max_concurrent_tasks: Optional[int] = Field(default=None, description="Max concurrent tasks")
class RecorderOption(BaseModel):
"""Call recording configuration."""
recorder_file: str = Field(..., description="Path to recording file")
samplerate: int = Field(default=16000, description="Recording sample rate")
ptime: int = Field(default=200, description="Packet time in milliseconds")
class MediaPassOption(BaseModel):
"""Media pass-through configuration for external audio processing."""
url: str = Field(..., description="WebSocket URL for media streaming")
input_sample_rate: int = Field(default=16000, description="Sample rate of audio received from WebSocket")
output_sample_rate: int = Field(default=16000, description="Sample rate of audio sent to WebSocket")
packet_size: int = Field(default=2560, description="Packet size in bytes")
ptime: Optional[int] = Field(default=None, description="Buffered playback period in milliseconds")
class SipOption(BaseModel):
"""SIP protocol configuration."""
username: Optional[str] = Field(default=None, description="SIP username")
password: Optional[str] = Field(default=None, description="SIP password")
realm: Optional[str] = Field(default=None, description="SIP realm/domain")
headers: Optional[Dict[str, str]] = Field(default=None, description="Additional SIP headers")
class HandlerRule(BaseModel):
"""Handler routing rule."""
caller: Optional[str] = Field(default=None, description="Caller pattern (regex)")
callee: Optional[str] = Field(default=None, description="Callee pattern (regex)")
playbook: Optional[str] = Field(default=None, description="Playbook file path")
webhook: Optional[str] = Field(default=None, description="Webhook URL")
class CallOption(BaseModel):
"""Comprehensive call configuration options."""
# Basic options
denoise: bool = Field(default=False, description="Enable noise reduction")
offer: Optional[str] = Field(default=None, description="SDP offer string")
callee: Optional[str] = Field(default=None, description="Callee SIP URI or phone number")
caller: Optional[str] = Field(default=None, description="Caller SIP URI or phone number")
# Audio codec
codec: str = Field(default="pcm", description="Audio codec (pcm, pcma, pcmu, g722)")
# Component configurations
recorder: Optional[RecorderOption] = Field(default=None, description="Call recording config")
asr: Optional[ASROption] = Field(default=None, description="ASR configuration")
vad: Optional[VADOption] = Field(default=None, description="VAD configuration")
tts: Optional[TTSOption] = Field(default=None, description="TTS configuration")
media_pass: Optional[MediaPassOption] = Field(default=None, description="Media pass-through config")
sip: Optional[SipOption] = Field(default=None, description="SIP configuration")
# Timeouts and networking
handshake_timeout: Optional[int] = Field(default=None, description="Handshake timeout in seconds")
enable_ipv6: bool = Field(default=False, description="Enable IPv6 support")
inactivity_timeout: Optional[int] = Field(default=None, description="Inactivity timeout in seconds")
# EOU configuration
eou: Optional[Dict[str, Any]] = Field(default=None, description="End of utterance detection config")
# Extra parameters
extra: Optional[Dict[str, Any]] = Field(default=None, description="Additional custom parameters")
model_config = ConfigDict(populate_by_name=True)
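Nested options validate in one step, since pydantic coerces plain dicts into the sub-models. A small sketch:

from models.config import CallOption

option = CallOption(**{
    "codec": "pcm",
    "vad": {"type": "silero", "ratio": 0.5},
    "tts": {"provider": "msedge", "speed": 1.2},
})
print(option.vad.type, option.tts.speed)  # silero 1.2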

223
models/events.py Normal file

@@ -0,0 +1,223 @@
"""Protocol event models matching the original active-call API."""
from typing import Optional, Dict, Any
from pydantic import BaseModel, ConfigDict, Field
from datetime import datetime
def current_timestamp_ms() -> int:
"""Get current timestamp in milliseconds."""
return int(datetime.now().timestamp() * 1000)
# Base Event Model
class BaseEvent(BaseModel):
"""Base event model."""
event: str = Field(..., description="Event type")
track_id: str = Field(..., description="Unique track identifier")
timestamp: int = Field(default_factory=current_timestamp_ms, description="Event timestamp in milliseconds")
# Lifecycle Events
class IncomingEvent(BaseEvent):
"""Incoming call event (SIP only)."""
event: str = Field(default="incoming", description="Event type")
caller: Optional[str] = Field(default=None, description="Caller's SIP URI")
callee: Optional[str] = Field(default=None, description="Callee's SIP URI")
sdp: Optional[str] = Field(default=None, description="SDP offer from caller")
class AnswerEvent(BaseEvent):
"""Call answered event."""
event: str = Field(default="answer", description="Event type")
sdp: Optional[str] = Field(default=None, description="SDP answer from server")
class RejectEvent(BaseEvent):
"""Call rejected event."""
event: str = Field(default="reject", description="Event type")
reason: Optional[str] = Field(default=None, description="Rejection reason")
code: Optional[int] = Field(default=None, description="SIP response code")
class RingingEvent(BaseEvent):
"""Call ringing event."""
event: str = Field(default="ringing", description="Event type")
early_media: bool = Field(default=False, description="Early media available")
class HangupEvent(BaseModel):
"""Call hangup event."""
event: str = Field(default="hangup", description="Event type")
timestamp: int = Field(default_factory=current_timestamp_ms, description="Event timestamp")
reason: Optional[str] = Field(default=None, description="Hangup reason")
initiator: Optional[str] = Field(default=None, description="Who initiated hangup")
start_time: Optional[str] = Field(default=None, description="Call start time (ISO 8601)")
hangup_time: Optional[str] = Field(default=None, description="Hangup time (ISO 8601)")
answer_time: Optional[str] = Field(default=None, description="Answer time (ISO 8601)")
ringing_time: Optional[str] = Field(default=None, description="Ringing time (ISO 8601)")
from_: Optional[Dict[str, Any]] = Field(default=None, alias="from", description="Caller info")
to: Optional[Dict[str, Any]] = Field(default=None, description="Callee info")
extra: Optional[Dict[str, Any]] = Field(default=None, description="Additional metadata")
model_config = ConfigDict(populate_by_name=True)
# VAD Events
class SpeakingEvent(BaseEvent):
"""Speech detected event."""
event: str = Field(default="speaking", description="Event type")
start_time: int = Field(default_factory=current_timestamp_ms, description="Speech start time")
class SilenceEvent(BaseEvent):
"""Silence detected event."""
event: str = Field(default="silence", description="Event type")
start_time: int = Field(default_factory=current_timestamp_ms, description="Silence start time")
duration: int = Field(default=0, description="Silence duration in milliseconds")
# AI/ASR Events
class AsrFinalEvent(BaseEvent):
"""ASR final transcription event."""
event: str = Field(default="asrFinal", description="Event type")
index: int = Field(..., description="ASR result sequence number")
start_time: Optional[int] = Field(default=None, description="Speech start time")
end_time: Optional[int] = Field(default=None, description="Speech end time")
text: str = Field(..., description="Transcribed text")
class AsrDeltaEvent(BaseEvent):
"""ASR partial transcription event (streaming)."""
event: str = Field(default="asrDelta", description="Event type")
index: int = Field(..., description="ASR result sequence number")
start_time: Optional[int] = Field(default=None, description="Speech start time")
end_time: Optional[int] = Field(default=None, description="Speech end time")
text: str = Field(..., description="Partial transcribed text")
class EouEvent(BaseEvent):
"""End of utterance detection event."""
event: str = Field(default="eou", description="Event type")
completed: bool = Field(default=True, description="Whether utterance was completed")
# Audio Track Events
class TrackStartEvent(BaseEvent):
"""Audio track start event."""
event: str = Field(default="trackStart", description="Event type")
play_id: Optional[str] = Field(default=None, description="Play ID from TTS/Play command")
class TrackEndEvent(BaseEvent):
"""Audio track end event."""
event: str = Field(default="trackEnd", description="Event type")
duration: int = Field(..., description="Track duration in milliseconds")
ssrc: int = Field(..., description="RTP SSRC identifier")
play_id: Optional[str] = Field(default=None, description="Play ID from TTS/Play command")
class InterruptionEvent(BaseEvent):
"""Playback interruption event."""
event: str = Field(default="interruption", description="Event type")
play_id: Optional[str] = Field(default=None, description="Play ID that was interrupted")
subtitle: Optional[str] = Field(default=None, description="TTS text being played")
position: Optional[int] = Field(default=None, description="Word index position")
total_duration: Optional[int] = Field(default=None, description="Total TTS duration")
current: Optional[int] = Field(default=None, description="Elapsed time when interrupted")
# System Events
class ErrorEvent(BaseEvent):
"""Error event."""
event: str = Field(default="error", description="Event type")
sender: str = Field(..., description="Component that generated the error")
error: str = Field(..., description="Error message")
code: Optional[int] = Field(default=None, description="Error code")
class MetricsEvent(BaseModel):
"""Performance metrics event."""
event: str = Field(default="metrics", description="Event type")
timestamp: int = Field(default_factory=current_timestamp_ms, description="Event timestamp")
key: str = Field(..., description="Metric key")
duration: int = Field(..., description="Duration in milliseconds")
data: Optional[Dict[str, Any]] = Field(default=None, description="Additional metric data")
class AddHistoryEvent(BaseModel):
"""Conversation history entry added event."""
event: str = Field(default="addHistory", description="Event type")
timestamp: int = Field(default_factory=current_timestamp_ms, description="Event timestamp")
sender: Optional[str] = Field(default=None, description="Component that added history")
speaker: str = Field(..., description="Speaker identifier")
text: str = Field(..., description="Conversation text")
class DTMFEvent(BaseEvent):
"""DTMF tone detected event."""
event: str = Field(default="dtmf", description="Event type")
digit: str = Field(..., description="DTMF digit (0-9, *, #, A-D)")
# Event type mapping
EVENT_TYPES = {
"incoming": IncomingEvent,
"answer": AnswerEvent,
"reject": RejectEvent,
"ringing": RingingEvent,
"hangup": HangupEvent,
"speaking": SpeakingEvent,
"silence": SilenceEvent,
"asrFinal": AsrFinalEvent,
"asrDelta": AsrDeltaEvent,
"eou": EouEvent,
"trackStart": TrackStartEvent,
"trackEnd": TrackEndEvent,
"interruption": InterruptionEvent,
"error": ErrorEvent,
"metrics": MetricsEvent,
"addHistory": AddHistoryEvent,
"dtmf": DTMFEvent,
}
def create_event(event_type: str, **kwargs) -> BaseModel:
"""
Create an event model.
Args:
event_type: Type of event to create
**kwargs: Event fields
Returns:
Event model instance
Raises:
ValueError: If event type is unknown
"""
event_class = EVENT_TYPES.get(event_type)
if not event_class:
raise ValueError(f"Unknown event type: {event_type}")
return event_class(event=event_type, **kwargs)
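Example use of the factory (a sketch, assuming pydantic v2's model_dump):

from models.events import create_event, SpeakingEvent

evt = create_event("speaking", track_id="track-1")
assert isinstance(evt, SpeakingEvent)
print(evt.model_dump())  # event, track_id, timestamp, start_time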

6
processors/__init__.py Normal file

@@ -0,0 +1,6 @@
"""Audio Processors Package"""
from processors.eou import EouDetector
from processors.vad import SileroVAD, VADProcessor
__all__ = ["EouDetector", "SileroVAD", "VADProcessor"]

80
processors/eou.py Normal file

@@ -0,0 +1,80 @@
"""End-of-Utterance Detection."""
import time
from typing import Optional
class EouDetector:
"""
End-of-utterance detector. Fires EOU only after continuous silence for
silence_threshold_ms. Short pauses between sentences do not trigger EOU
because speech resets the silence timer (one EOU per turn).
"""
def __init__(self, silence_threshold_ms: int = 1000, min_speech_duration_ms: int = 250):
"""
Initialize EOU detector.
Args:
silence_threshold_ms: How long silence must last to trigger EOU (default 1000ms)
min_speech_duration_ms: Minimum speech duration to consider valid (default 250ms)
"""
self.threshold = silence_threshold_ms / 1000.0
self.min_speech = min_speech_duration_ms / 1000.0
self._silence_threshold_ms = silence_threshold_ms
self._min_speech_duration_ms = min_speech_duration_ms
# State
self.is_speaking = False
self.speech_start_time = 0.0
self.silence_start_time: Optional[float] = None
self.triggered = False
def process(self, vad_status: str) -> bool:
"""
Process VAD status and detect end of utterance.
Input: "Speech" or "Silence" (from VAD).
Output: True if EOU detected, False otherwise.
Short breaks between phrases reset the silence clock when speech
resumes, so only one EOU is emitted after the user truly stops.
"""
now = time.time()
if vad_status == "Speech":
if not self.is_speaking:
self.is_speaking = True
self.speech_start_time = now
self.triggered = False
# Any speech resets silence timer — short pause + more speech = one utterance
self.silence_start_time = None
return False
if vad_status == "Silence":
if not self.is_speaking:
return False
if self.silence_start_time is None:
self.silence_start_time = now
speech_duration = self.silence_start_time - self.speech_start_time
if speech_duration < self.min_speech:
self.is_speaking = False
self.silence_start_time = None
return False
silence_duration = now - self.silence_start_time
if silence_duration >= self.threshold and not self.triggered:
self.triggered = True
self.is_speaking = False
self.silence_start_time = None
return True
return False
def reset(self) -> None:
"""Reset EOU detector state."""
self.is_speaking = False
self.speech_start_time = 0.0
self.silence_start_time = None
self.triggered = False
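A small real-time driver for the detector (a sketch under assumed usage, not part of the commit); the sleeps pace 20 ms labels so the wall-clock silence timer behaves as it would on a live stream:

import time

from processors.eou import EouDetector

det = EouDetector(silence_threshold_ms=400, min_speech_duration_ms=250)

for _ in range(25):                  # 25 x 20 ms = 500 ms of "Speech"
    assert det.process("Speech") is False
    time.sleep(0.02)

ticks = 0
while not det.process("Silence"):    # fires after ~400 ms of continuous silence
    time.sleep(0.02)
    ticks += 1
print(f"EOU after ~{ticks * 20} ms of silence")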

168
processors/tracks.py Normal file
View File

@@ -0,0 +1,168 @@
"""Audio track processing for WebRTC."""
import asyncio
import fractions
from loguru import logger
# Try to import aiortc (optional for WebRTC functionality)
try:
from aiortc import AudioStreamTrack
AIORTC_AVAILABLE = True
except ImportError:
AIORTC_AVAILABLE = False
AudioStreamTrack = object # Dummy class for type hints
# Try to import PyAV (optional for audio resampling)
try:
from av import AudioFrame, AudioResampler
AV_AVAILABLE = True
except ImportError:
AV_AVAILABLE = False
# Create dummy classes for type hints
class AudioFrame:
pass
class AudioResampler:
pass
import numpy as np
class Resampled16kTrack(AudioStreamTrack if AIORTC_AVAILABLE else object):
"""
Audio track that resamples input to 16kHz mono PCM.
Wraps an existing MediaStreamTrack and converts its output
to 16kHz mono 16-bit PCM format for the pipeline.
"""
def __init__(self, track, target_sample_rate: int = 16000):
"""
Initialize resampled track.
Args:
track: Source MediaStreamTrack
target_sample_rate: Target sample rate (default: 16000)
"""
if not AIORTC_AVAILABLE:
raise RuntimeError("aiortc not available - Resampled16kTrack cannot be used")
super().__init__()
self.track = track
self.target_sample_rate = target_sample_rate
if AV_AVAILABLE:
self.resampler = AudioResampler(
format="s16",
layout="mono",
rate=target_sample_rate
)
else:
logger.warning("PyAV not available, audio resampling disabled")
self.resampler = None
self._closed = False
async def recv(self):
"""
Receive and resample next audio frame.
Returns:
Resampled AudioFrame at 16kHz mono
"""
if self._closed:
raise RuntimeError("Track is closed")
# Get frame from source track
frame = await self.track.recv()
        # Resample the frame if AV is available
        if AV_AVAILABLE and self.resampler:
            # PyAV >= 9 returns a list of frames; older releases return one frame
            resampled = self.resampler.resample(frame)
            if isinstance(resampled, list):
                if not resampled:
                    # Resampler buffered the input; pass the source frame through
                    return frame
                resampled = resampled[0]
            return resampled
else:
# Return frame as-is if AV is not available
return frame
async def stop(self) -> None:
"""Stop the track and cleanup resources."""
self._closed = True
if hasattr(self, 'resampler') and self.resampler:
del self.resampler
logger.debug("Resampled track stopped")
class SineWaveTrack(AudioStreamTrack if AIORTC_AVAILABLE else object):
"""
Synthetic audio track that generates a sine wave.
Useful for testing without requiring real audio input.
"""
def __init__(self, sample_rate: int = 16000, frequency: int = 440):
"""
Initialize sine wave track.
Args:
sample_rate: Audio sample rate (default: 16000)
frequency: Sine wave frequency in Hz (default: 440)
"""
if not AIORTC_AVAILABLE:
raise RuntimeError("aiortc not available - SineWaveTrack cannot be used")
super().__init__()
self.sample_rate = sample_rate
self.frequency = frequency
self.counter = 0
self._stopped = False
async def recv(self):
"""
Generate next audio frame with sine wave.
Returns:
AudioFrame with sine wave data
"""
if self._stopped:
raise RuntimeError("Track is stopped")
# Generate 20ms of audio
samples = int(self.sample_rate * 0.02)
pts = self.counter
time_base = fractions.Fraction(1, self.sample_rate)
# Generate sine wave
t = np.linspace(
self.counter / self.sample_rate,
(self.counter + samples) / self.sample_rate,
samples,
endpoint=False
)
# Generate sine wave (Int16 PCM)
data = (0.5 * np.sin(2 * np.pi * self.frequency * t) * 32767).astype(np.int16)
# Update counter
self.counter += samples
# Create AudioFrame if AV is available
if AV_AVAILABLE:
frame = AudioFrame.from_ndarray(data.reshape(1, -1), format='s16', layout='mono')
frame.pts = pts
frame.time_base = time_base
frame.sample_rate = self.sample_rate
return frame
else:
# Return simple data structure if AV is not available
return {
'data': data,
'sample_rate': self.sample_rate,
'pts': pts,
'time_base': time_base
}
def stop(self) -> None:
"""Stop the track."""
self._stopped = True
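A throwaway consumer for the synthetic track (a sketch assuming aiortc and PyAV are installed; without them the constructor raises or recv() returns a plain dict):

import asyncio

from processors.tracks import SineWaveTrack

async def main():
    track = SineWaveTrack(sample_rate=16000, frequency=440)
    for _ in range(5):
        frame = await track.recv()               # one 20 ms AudioFrame
        print(frame.sample_rate, frame.samples)  # 16000 320
    track.stop()

asyncio.run(main())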

213
processors/vad.py Normal file
View File

@@ -0,0 +1,213 @@
"""Voice Activity Detection using Silero VAD."""
import time
import os
from typing import Tuple, Optional
import numpy as np
from loguru import logger
from processors.eou import EouDetector
# Try to import onnxruntime (optional for VAD functionality)
try:
import onnxruntime as ort
ONNX_AVAILABLE = True
except ImportError:
ONNX_AVAILABLE = False
ort = None
logger.warning("onnxruntime not available - VAD will be disabled")
class SileroVAD:
"""
Voice Activity Detection using Silero VAD model.
Detects speech in audio chunks using the Silero VAD ONNX model.
Returns "Speech" or "Silence" for each audio chunk.
"""
def __init__(self, model_path: str = "data/vad/silero_vad.onnx", sample_rate: int = 16000):
"""
Initialize Silero VAD.
Args:
model_path: Path to Silero VAD ONNX model
sample_rate: Audio sample rate (must be 16kHz for Silero VAD)
"""
self.sample_rate = sample_rate
self.model_path = model_path
# Check if model exists
if not os.path.exists(model_path):
logger.warning(f"VAD model not found at {model_path}. VAD will be disabled.")
self.session = None
return
# Check if onnxruntime is available
if not ONNX_AVAILABLE:
logger.warning("onnxruntime not available - VAD will be disabled")
self.session = None
return
# Load ONNX model
try:
self.session = ort.InferenceSession(model_path)
logger.info(f"Loaded Silero VAD model from {model_path}")
except Exception as e:
logger.error(f"Failed to load VAD model: {e}")
self.session = None
return
# Internal state for VAD
self._reset_state()
self.buffer = np.array([], dtype=np.float32)
self.min_chunk_size = 512
self.last_label = "Silence"
self.last_probability = 0.0
def _reset_state(self):
# Silero VAD V4+ expects state shape [2, 1, 128]
self._state = np.zeros((2, 1, 128), dtype=np.float32)
self._sr = np.array([self.sample_rate], dtype=np.int64)
def process_audio(self, pcm_bytes: bytes, chunk_size_ms: int = 20) -> Tuple[str, float]:
"""
Process audio chunk and detect speech.
Args:
pcm_bytes: PCM audio data (16-bit, mono, 16kHz)
            chunk_size_ms: Chunk duration in milliseconds (unused; buffering is driven by the fixed 512-sample window)
Returns:
Tuple of (label, probability) where label is "Speech" or "Silence"
"""
if self.session is None or not ONNX_AVAILABLE:
# If model not loaded or onnxruntime not available, assume speech
return "Speech", 1.0
# Convert bytes to numpy array of int16
audio_int16 = np.frombuffer(pcm_bytes, dtype=np.int16)
# Normalize to float32 (-1.0 to 1.0)
audio_float = audio_int16.astype(np.float32) / 32768.0
# Add to buffer
self.buffer = np.concatenate((self.buffer, audio_float))
# Process all complete chunks in the buffer
processed_any = False
while len(self.buffer) >= self.min_chunk_size:
# Slice exactly 512 samples
chunk = self.buffer[:self.min_chunk_size]
self.buffer = self.buffer[self.min_chunk_size:]
# Prepare inputs
# Input tensor shape: [batch, samples] -> [1, 512]
input_tensor = chunk.reshape(1, -1)
# Run inference
try:
ort_inputs = {
'input': input_tensor,
'state': self._state,
'sr': self._sr
}
# Outputs: probability, state
out, self._state = self.session.run(None, ort_inputs)
# Get probability
self.last_probability = float(out[0][0])
self.last_label = "Speech" if self.last_probability >= 0.5 else "Silence"
processed_any = True
except Exception as e:
logger.error(f"VAD inference error: {e}")
# Try to determine if it's an input name issue
try:
inputs = [x.name for x in self.session.get_inputs()]
logger.error(f"Model expects inputs: {inputs}")
                except Exception:
pass
return "Speech", 1.0
return self.last_label, self.last_probability
def reset(self) -> None:
"""Reset VAD internal state."""
self._reset_state()
self.buffer = np.array([], dtype=np.float32)
self.last_label = "Silence"
self.last_probability = 0.0
class VADProcessor:
"""
High-level VAD processor with state management.
Tracks speech/silence state and emits events on transitions.
"""
def __init__(self, vad_model: SileroVAD, threshold: float = 0.5,
silence_threshold_ms: int = 1000, min_speech_duration_ms: int = 250):
"""
Initialize VAD processor.
Args:
vad_model: Silero VAD model instance
threshold: Speech detection threshold
silence_threshold_ms: EOU silence threshold in ms (longer = one EOU across short pauses)
min_speech_duration_ms: EOU min speech duration in ms (ignore very short noises)
"""
self.vad = vad_model
self.threshold = threshold
self._eou_silence_ms = silence_threshold_ms
self._eou_min_speech_ms = min_speech_duration_ms
self.is_speaking = False
self.speech_start_time: Optional[float] = None
self.silence_start_time: Optional[float] = None
self.eou_detector = EouDetector(silence_threshold_ms, min_speech_duration_ms)
def process(self, pcm_bytes: bytes, chunk_size_ms: int = 20) -> Optional[Tuple[str, float]]:
"""
Process audio chunk and detect state changes.
Args:
pcm_bytes: PCM audio data
chunk_size_ms: Chunk duration in milliseconds
Returns:
Tuple of (event_type, probability) if state changed, None otherwise
"""
label, probability = self.vad.process_audio(pcm_bytes, chunk_size_ms)
# Check if this is speech based on threshold
is_speech = probability >= self.threshold
# Check EOU
if self.eou_detector.process("Speech" if is_speech else "Silence"):
return ("eou", probability)
# State transition: Silence -> Speech
if is_speech and not self.is_speaking:
self.is_speaking = True
            self.speech_start_time = time.monotonic()
self.silence_start_time = None
return ("speaking", probability)
# State transition: Speech -> Silence
elif not is_speech and self.is_speaking:
self.is_speaking = False
            self.silence_start_time = time.monotonic()
self.speech_start_time = None
return ("silence", probability)
return None
def reset(self) -> None:
"""Reset VAD state."""
self.vad.reset()
self.is_speaking = False
self.speech_start_time = None
self.silence_start_time = None
self.eou_detector = EouDetector(self._eou_silence_ms, self._eou_min_speech_ms)
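A synchronous end-to-end sketch of the detection pipeline (hypothetical: `utterance.pcm` stands in for a raw 16 kHz mono s16 capture; with no model file the VAD falls back to always-"Speech"):

import time

from processors.vad import SileroVAD, VADProcessor

vad = SileroVAD("data/vad/silero_vad.onnx")     # VAD disabled if the model is absent
proc = VADProcessor(vad, threshold=0.5, silence_threshold_ms=400)

with open("utterance.pcm", "rb") as f:          # hypothetical raw PCM capture
    while chunk := f.read(640):                 # 640 bytes = 20 ms at 16 kHz, 16-bit mono
        if result := proc.process(chunk):
            event, prob = result
            print(event, round(prob, 3))        # "speaking" / "silence" / "eou"
        time.sleep(0.02)                        # pace at real time so the EOU wall clock is meaningful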

134
pyproject.toml Normal file
View File

@@ -0,0 +1,134 @@
[build-system]
requires = ["setuptools>=68.0"]
build-backend = "setuptools.build_meta"
[project]
name = "py-active-call-cc"
version = "0.1.0"
description = "Python Active-Call: Real-time audio streaming with WebSocket and WebRTC"
readme = "README.md"
requires-python = ">=3.11"
license = {text = "MIT"}
authors = [
{name = "Your Name", email = "your.email@example.com"}
]
keywords = ["webrtc", "websocket", "audio", "voip", "real-time"]
classifiers = [
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"Topic :: Communications :: Telephony",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
]
[project.urls]
Homepage = "https://github.com/yourusername/py-active-call-cc"
Documentation = "https://github.com/yourusername/py-active-call-cc/blob/main/README.md"
Repository = "https://github.com/yourusername/py-active-call-cc.git"
Issues = "https://github.com/yourusername/py-active-call-cc/issues"
[tool.setuptools.packages.find]
where = ["."]
include = ["app*"]
exclude = ["tests*", "scripts*", "reference*"]
[tool.black]
line-length = 100
target-version = ['py311']
include = '\.pyi?$'
extend-exclude = '''
/(
# directories
\.eggs
| \.git
| \.hg
| \.mypy_cache
| \.tox
| \.venv
| build
| dist
| reference
)/
'''
[tool.ruff]
line-length = 100
target-version = "py311"
select = [
"E", # pycodestyle errors
"W", # pycodestyle warnings
"F", # pyflakes
"I", # isort
"B", # flake8-bugbear
"C4", # flake8-comprehensions
"UP", # pyupgrade
]
ignore = [
"E501", # line too long (handled by black)
"B008", # do not perform function calls in argument defaults
]
exclude = [
".bzr",
".direnv",
".eggs",
".git",
".hg",
".mypy_cache",
".nox",
".pants.d",
".ruff_cache",
".svn",
".tox",
".venv",
"__pypackages__",
"_build",
"buck-out",
"build",
"dist",
"node_modules",
"venv",
"reference",
]
[tool.ruff.per-file-ignores]
"__init__.py" = ["F401"] # unused imports
[tool.mypy]
python_version = "3.11"
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = false
disallow_incomplete_defs = false
check_untyped_defs = true
no_implicit_optional = true
warn_redundant_casts = true
warn_unused_ignores = true
warn_no_return = true
strict_equality = true
exclude = [
"venv",
"reference",
"build",
"dist",
]
[[tool.mypy.overrides]]
module = [
"aiortc.*",
"av.*",
"onnxruntime.*",
]
ignore_missing_imports = true
[tool.pytest.ini_options]
minversion = "7.0"
addopts = "-ra -q --strict-markers --strict-config"
testpaths = ["tests"]
pythonpath = ["."]
asyncio_mode = "auto"
markers = [
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
"integration: marks tests as integration tests",
]

0
requirements.txt Normal file
View File

166
scripts/test_websocket.py Normal file
View File

@@ -0,0 +1,166 @@
"""WebSocket endpoint test client.
Tests the /ws endpoint with sine wave or file audio streaming.
Based on reference/py-active-call/exec/test_ws_endpoint/test_ws.py
"""
import asyncio
import aiohttp
import json
import struct
import math
import argparse
import os
from datetime import datetime
# Configuration
SERVER_URL = "ws://localhost:8000/ws"
SAMPLE_RATE = 16000
FREQUENCY = 440 # 440Hz Sine Wave
CHUNK_DURATION_MS = 20
# 16kHz * 16-bit (2 bytes) * 20ms = 640 bytes per chunk
CHUNK_SIZE_BYTES = int(SAMPLE_RATE * 2 * (CHUNK_DURATION_MS / 1000.0))
def generate_sine_wave(duration_ms=1000):
"""Generates sine wave audio (16kHz mono PCM 16-bit)."""
num_samples = int(SAMPLE_RATE * (duration_ms / 1000.0))
audio_data = bytearray()
for x in range(num_samples):
# Generate sine wave sample
value = int(32767.0 * math.sin(2 * math.pi * FREQUENCY * x / SAMPLE_RATE))
# Pack as little-endian 16-bit integer
audio_data.extend(struct.pack('<h', value))
return audio_data
async def receive_loop(ws):
"""Listen for incoming messages from the server."""
print("👂 Listening for server responses...")
async for msg in ws:
timestamp = datetime.now().strftime("%H:%M:%S")
if msg.type == aiohttp.WSMsgType.TEXT:
try:
data = json.loads(msg.data)
event_type = data.get('event', 'Unknown')
print(f"[{timestamp}] 📨 Event: {event_type} | {msg.data[:150]}...")
except json.JSONDecodeError:
print(f"[{timestamp}] 📨 Text: {msg.data[:100]}...")
elif msg.type == aiohttp.WSMsgType.BINARY:
# Received audio chunk back (e.g., TTS or echo)
print(f"[{timestamp}] 🔊 Audio: {len(msg.data)} bytes", end="\r")
elif msg.type == aiohttp.WSMsgType.CLOSED:
print(f"\n[{timestamp}] ❌ Socket Closed")
break
elif msg.type == aiohttp.WSMsgType.ERROR:
print(f"\n[{timestamp}] ⚠️ Socket Error")
break
async def send_file_loop(ws, file_path):
"""Stream a raw PCM/WAV file to the server."""
if not os.path.exists(file_path):
print(f"❌ Error: File '{file_path}' not found.")
return
print(f"📂 Streaming file: {file_path} ...")
with open(file_path, "rb") as f:
        # Skip the canonical 44-byte WAV header (naive: assumes plain PCM with no extra chunks)
if file_path.endswith('.wav'):
f.read(44)
while True:
chunk = f.read(CHUNK_SIZE_BYTES)
if not chunk:
break
# Send binary frame
await ws.send_bytes(chunk)
# Sleep to simulate real-time playback
await asyncio.sleep(CHUNK_DURATION_MS / 1000.0)
print(f"\n✅ Finished streaming {file_path}")
async def send_sine_loop(ws):
"""Stream generated sine wave to the server."""
print("🎙️ Starting Audio Stream (Sine Wave)...")
    # Generate 5 seconds of audio buffer
audio_buffer = generate_sine_wave(5000)
cursor = 0
while cursor < len(audio_buffer):
chunk = audio_buffer[cursor:cursor + CHUNK_SIZE_BYTES]
if not chunk:
break
await ws.send_bytes(chunk)
cursor += len(chunk)
await asyncio.sleep(CHUNK_DURATION_MS / 1000.0)
print("\n✅ Finished streaming test audio.")
async def run_client(url, file_path=None, use_sine=False):
"""Run the WebSocket test client."""
session = aiohttp.ClientSession()
try:
print(f"🔌 Connecting to {url}...")
async with session.ws_connect(url) as ws:
print("✅ Connected!")
# Send initial invite command
init_cmd = {
"command": "invite",
"option": {
"codec": "pcm",
"samplerate": SAMPLE_RATE
}
}
await ws.send_json(init_cmd)
print("📤 Sent Invite Command")
# Select sender based on args
if use_sine:
sender_task = send_sine_loop(ws)
elif file_path:
sender_task = send_file_loop(ws, file_path)
else:
# Default to sine wave
sender_task = send_sine_loop(ws)
# Run send and receive loops in parallel
await asyncio.gather(
receive_loop(ws),
sender_task
)
except aiohttp.ClientConnectorError:
print(f"❌ Connection Failed. Is the server running at {url}?")
except Exception as e:
print(f"❌ Error: {e}")
finally:
await session.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="WebSocket Audio Test Client")
parser.add_argument("--url", default=SERVER_URL, help="WebSocket endpoint URL")
parser.add_argument("--file", help="Path to PCM/WAV file to stream")
parser.add_argument("--sine", action="store_true", help="Use sine wave generation (default)")
args = parser.parse_args()
try:
asyncio.run(run_client(args.url, args.file, args.sine))
except KeyboardInterrupt:
print("\n👋 Client stopped.")

View File

@@ -1,160 +0,0 @@
"""
WebSocket Test Client
Tests the WebSocket server with sine wave audio generation.
Usage:
python test_client.py
python test_client.py --url ws://localhost:8000/ws
"""
import asyncio
import aiohttp
import json
import struct
import math
import argparse
from datetime import datetime
# Configuration
SERVER_URL = "ws://localhost:8000/ws"
SAMPLE_RATE = 16000
FREQUENCY = 440 # 440Hz sine wave
CHUNK_DURATION_MS = 20
CHUNK_SIZE_BYTES = int(SAMPLE_RATE * 2 * (CHUNK_DURATION_MS / 1000.0)) # 640 bytes
def generate_sine_wave(duration_ms=1000):
"""
Generate sine wave audio data.
Format: 16kHz, mono, 16-bit PCM
"""
num_samples = int(SAMPLE_RATE * (duration_ms / 1000.0))
audio_data = bytearray()
for x in range(num_samples):
# Generate sine wave sample
value = int(32767.0 * math.sin(2 * math.pi * FREQUENCY * x / SAMPLE_RATE))
# Pack as little-endian 16-bit signed integer
audio_data.extend(struct.pack('<h', value))
return audio_data
async def receive_loop(ws, session_id):
"""
Listen for incoming messages from the server.
"""
print("👂 Listening for server responses...")
async for msg in ws:
timestamp = datetime.now().strftime("%H:%M:%S")
if msg.type == aiohttp.WSMsgType.TEXT:
try:
data = json.loads(msg.data)
event_type = data.get('event', 'Unknown')
print(f"[{timestamp}] 📨 Event: {event_type}")
print(f" {json.dumps(data, indent=2)}")
except json.JSONDecodeError:
print(f"[{timestamp}] 📨 Text: {msg.data[:100]}...")
elif msg.type == aiohttp.WSMsgType.BINARY:
# Received audio chunk back
print(f"[{timestamp}] 🔊 Audio: {len(msg.data)} bytes")
elif msg.type == aiohttp.WSMsgType.CLOSED:
print(f"\n[{timestamp}] ❌ Connection closed")
break
elif msg.type == aiohttp.WSMsgType.ERROR:
print(f"\n[{timestamp}] ⚠️ WebSocket error")
break
async def send_audio_loop(ws):
"""
Stream sine wave audio to the server.
"""
print("🎙️ Starting audio stream (sine wave)...")
# Generate 5 seconds of audio
audio_buffer = generate_sine_wave(5000)
cursor = 0
while cursor < len(audio_buffer):
chunk = audio_buffer[cursor:cursor + CHUNK_SIZE_BYTES]
if not chunk:
break
await ws.send_bytes(chunk)
print(f"📤 Sent audio chunk: {len(chunk)} bytes", end="\r")
cursor += len(chunk)
# Sleep to simulate real-time (20ms per chunk)
await asyncio.sleep(CHUNK_DURATION_MS / 1000.0)
print("\n✅ Finished streaming audio")
async def run_client(url):
"""
Run the WebSocket test client.
"""
session = aiohttp.ClientSession()
try:
print(f"🔌 Connecting to {url}...")
async with session.ws_connect(url) as ws:
print("✅ Connected!")
print()
# Send invite command
invite_cmd = {
"command": "invite",
"option": {
"codec": "pcm",
"samplerate": SAMPLE_RATE
}
}
await ws.send_json(invite_cmd)
print("📤 Sent invite command")
print()
# Send a ping command
ping_cmd = {"command": "ping"}
await ws.send_json(ping_cmd)
print("📤 Sent ping command")
print()
# Wait a moment for responses
await asyncio.sleep(1)
# Run audio streaming and receiving in parallel
await asyncio.gather(
receive_loop(ws, None),
send_audio_loop(ws)
)
except aiohttp.ClientConnectorError:
print(f"❌ Connection failed. Is the server running at {url}?")
print(f" Start server with: python main.py")
except Exception as e:
print(f"❌ Error: {e}")
finally:
await session.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="WebSocket Audio Test Client")
parser.add_argument("--url", default=SERVER_URL, help="WebSocket URL")
args = parser.parse_args()
try:
asyncio.run(run_client(args.url))
except KeyboardInterrupt:
print("\n👋 Client stopped.")

1
utils/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""Utilities Package"""

83
utils/logging.py Normal file
View File

@@ -0,0 +1,83 @@
"""Logging configuration utilities."""
import sys
from pathlib import Path
from typing import Optional

from loguru import logger
def setup_logging(
log_level: str = "INFO",
log_format: str = "text",
log_to_file: bool = True,
log_dir: str = "logs"
):
"""
Configure structured logging with loguru.
Args:
log_level: Logging level (DEBUG, INFO, WARNING, ERROR)
log_format: Format type (json or text)
log_to_file: Whether to log to file
log_dir: Directory for log files
"""
# Remove default handler
logger.remove()
# Console handler
if log_format == "json":
logger.add(
sys.stdout,
format="{message}",
level=log_level,
serialize=True,
colorize=False
)
else:
logger.add(
sys.stdout,
format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>",
level=log_level,
colorize=True
)
# File handler
if log_to_file:
log_path = Path(log_dir)
log_path.mkdir(exist_ok=True)
if log_format == "json":
logger.add(
log_path / "active_call_{time:YYYY-MM-DD}.log",
format="{message}",
level=log_level,
rotation="1 day",
retention="7 days",
compression="zip",
serialize=True
)
else:
logger.add(
log_path / "active_call_{time:YYYY-MM-DD}.log",
format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}",
level=log_level,
rotation="1 day",
retention="7 days",
compression="zip"
)
return logger
def get_logger(name: Optional[str] = None):
"""
Get a logger instance.
Args:
name: Logger name (optional)
Returns:
Logger instance
"""
if name:
return logger.bind(name=name)
return logger
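Finally, a minimal bootstrap sketch (assumed call site; a server would typically run this once at startup):

from utils.logging import setup_logging, get_logger

setup_logging(log_level="DEBUG", log_format="text", log_to_file=False)
log = get_logger("startup")
log.info("logging configured")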