- Introduced Volcengine as a new provider for both TTS and ASR services.
- Updated configuration files to include Volcengine-specific parameters such as app_id, resource_id, and uid.
- Enhanced the ASR service to support streaming mode with Volcengine's API.
- Modified existing tests to validate the integration of Volcengine services.
- Updated documentation to reflect the addition of Volcengine as a supported provider for TTS and ASR.
- Refactored the service factory to accommodate Volcengine alongside existing providers.
91 lines
2.6 KiB
Python
91 lines
2.6 KiB
Python
"""ASR extension port contracts."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
from typing import Any, AsyncIterator, Awaitable, Callable, Dict, Literal, Optional, Protocol
|
|
|
|
from providers.common.base import ASRResult
|
|
|
|
TranscriptCallback = Callable[[str, bool], Awaitable[None]]
|
|
ASRMode = Literal["offline", "streaming"]
|
|
|
|
|
|
@dataclass(frozen=True)
class ASRServiceSpec:
    """Immutable, fully-resolved configuration used to construct an ASR service.

    Bundles provider selection with provider-specific credentials and tuning
    knobs; fields left as ``None`` defer to the provider's own defaults.
    """

    # --- required core settings ---
    provider: str       # provider key, e.g. "volcengine"
    sample_rate: int    # input PCM sample rate in Hz

    # --- recognition behaviour ---
    mode: ASRMode | None = None   # "offline" or "streaming"; None lets the factory decide
    language: str = "auto"        # language hint passed to the recogniser

    # --- provider credentials / endpoints ---
    api_key: str | None = None
    api_url: str | None = None
    model: str | None = None
    app_id: str | None = None        # presumably Volcengine-specific -- confirm against factory
    resource_id: str | None = None
    cluster: str | None = None
    uid: str | None = None

    # --- provider-specific request overrides ---
    request_params: dict[str, Any] | None = None

    # --- interim (partial) transcript emission ---
    enable_interim: bool = False
    interim_interval_ms: int = 500
    min_audio_for_interim_ms: int = 300

    # --- async callback invoked with (text, is_final) ---
    on_transcript: TranscriptCallback | None = None
|
|
|
|
|
|
class ASRPort(Protocol):
    """Structural interface that every speech-recognition provider satisfies."""

    # Whether the provider buffers whole utterances ("offline") or
    # transcribes audio as it arrives ("streaming").
    mode: ASRMode

    async def connect(self) -> None:
        """Open the connection to the ASR provider."""
        ...

    async def disconnect(self) -> None:
        """Tear down the connection and free provider resources."""
        ...

    async def send_audio(self, audio: bytes) -> None:
        """Feed a single PCM audio chunk to the recogniser."""
        ...

    async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
        """Yield partial and final recognition results as they arrive.

        NOTE(review): declared ``async def`` returning ``AsyncIterator``, so
        callers must ``await`` the call to obtain the iterator. If concrete
        implementations are async generators, a plain ``def`` returning the
        iterator would type-match better -- confirm against the providers.
        """
        ...
|
|
|
class OfflineASRPort(ASRPort, Protocol):
    """Contract for providers that buffer audio and transcribe in batches."""

    mode: Literal["offline"]  # narrows ASRPort.mode for this flavour

    async def start_interim_transcription(self) -> None:
        """Begin the loop that periodically emits interim (partial) transcripts."""
        ...

    async def stop_interim_transcription(self) -> None:
        """Halt the interim transcription loop."""
        ...

    def clear_buffer(self) -> None:
        """Drop any audio buffered on the provider side."""
        ...

    async def get_final_transcription(self) -> str:
        """Produce the final transcript for the current utterance."""
        ...

    def get_and_clear_text(self) -> str:
        """Hand back the accumulated text and reset internal state."""
        ...
|
|
|
|
|
|
class StreamingASRPort(ASRPort, Protocol):
    """Contract for providers that recognise speech as the audio streams in."""

    mode: Literal["streaming"]  # narrows ASRPort.mode for this flavour

    async def begin_utterance(self) -> None:
        """Open a fresh utterance stream."""
        ...

    async def end_utterance(self) -> None:
        """Mark the current utterance stream as finished."""
        ...

    async def wait_for_final_transcription(self, timeout_ms: int = 800) -> str:
        """Block until the final transcript arrives (or *timeout_ms* elapses)."""
        ...

    def clear_utterance(self) -> None:
        """Discard utterance-local state."""
        ...