- Introduced `asr_interim_enabled` field in the Assistant model to control interim ASR results.
- Updated AssistantBase and AssistantUpdate schemas to include the new field.
- Modified the database schema to add the `asr_interim_enabled` column.
- Enhanced runtime metadata to reflect interim ASR settings.
- Updated API endpoints and tests to validate the new functionality.
- Adjusted documentation to include details about interim ASR results configuration.
364 lines
12 KiB
Python
364 lines
12 KiB
Python
"""OpenAI-compatible ASR (Automatic Speech Recognition) Service.
|
|
|
|
Uses the SiliconFlow API for speech-to-text transcription.
|
|
API: https://docs.siliconflow.cn/cn/api-reference/audio/create-audio-transcriptions
|
|
"""
|
|
|
|
import asyncio
|
|
import io
|
|
import os
|
|
import wave
|
|
from typing import AsyncIterator, Optional, Callable, Awaitable
|
|
from urllib.parse import urlparse, urlunparse
|
|
from loguru import logger
|
|
|
|
try:
|
|
import aiohttp
|
|
AIOHTTP_AVAILABLE = True
|
|
except ImportError:
|
|
AIOHTTP_AVAILABLE = False
|
|
logger.warning("aiohttp not available - OpenAICompatibleASRService will not work")
|
|
|
|
from providers.common.base import BaseASRService, ASRResult, ServiceState
|
|
|
|
|
|
class OpenAICompatibleASRService(BaseASRService):
    """
    OpenAI-compatible ASR service for speech-to-text transcription.

    Incoming PCM audio is accumulated in memory and uploaded as one WAV file
    to an OpenAI-style ``/audio/transcriptions`` endpoint each time a
    transcription is requested.

    Features:
    - Buffers incoming audio chunks
    - Provides interim transcriptions periodically (for streaming to client)
    - Final transcription on EOU

    API Details:
    - Endpoint: POST https://api.siliconflow.cn/v1/audio/transcriptions
    - Models: FunAudioLLM/SenseVoiceSmall (default), TeleAI/TeleSpeechASR
    - Input: Audio file (multipart/form-data)
    - Output: {"text": "transcribed text"}
    """

    # Short aliases accepted for the ``model`` argument (matched
    # case-insensitively); any other value is passed to the API unchanged.
    MODELS = {
        "sensevoice": "FunAudioLLM/SenseVoiceSmall",
        "telespeech": "TeleAI/TeleSpeechASR",
    }

    # Endpoint used when neither ``api_url`` nor ASR_API_URL is provided.
    API_URL = "https://api.siliconflow.cn/v1/audio/transcriptions"

    def __init__(
        self,
        api_key: Optional[str] = None,
        api_url: Optional[str] = None,
        model: str = "FunAudioLLM/SenseVoiceSmall",
        sample_rate: int = 16000,
        language: str = "auto",
        enable_interim: bool = False,
        interim_interval_ms: int = 500,  # How often to send interim results
        min_audio_for_interim_ms: int = 300,  # Min audio before first interim
        on_transcript: Optional[Callable[[str, bool], Awaitable[None]]] = None
    ):
        """
        Initialize OpenAI-compatible ASR service.

        Args:
            api_key: Provider API key
            api_url: Provider API URL (base URL or full transcriptions
                endpoint). Falls back to the ASR_API_URL environment
                variable, then to the SiliconFlow endpoint.
            model: ASR model name or alias (see ``MODELS``)
            sample_rate: Audio sample rate (16000 recommended)
            language: Language code (auto for automatic detection)
            enable_interim: Whether to generate interim transcriptions in offline mode
            interim_interval_ms: How often to generate interim transcriptions
            min_audio_for_interim_ms: Minimum audio duration before first interim
            on_transcript: Callback for transcription results (text, is_final)

        Raises:
            RuntimeError: If aiohttp could not be imported.
        """
        super().__init__(sample_rate=sample_rate, language=language)
        self.mode = "offline"

        if not AIOHTTP_AVAILABLE:
            raise RuntimeError("aiohttp is required for OpenAICompatibleASRService")

        self.api_key = api_key
        raw_api_url = api_url or os.getenv("ASR_API_URL") or self.API_URL
        self.api_url = self._resolve_transcriptions_endpoint(raw_api_url)
        self.model = self.MODELS.get(model.lower(), model)
        self.enable_interim = bool(enable_interim)
        self.interim_interval_ms = interim_interval_ms
        self.min_audio_for_interim_ms = min_audio_for_interim_ms
        self.on_transcript = on_transcript

        # Session
        self._session: Optional[aiohttp.ClientSession] = None

        # Audio buffer
        self._audio_buffer: bytes = b""
        self._current_text: str = ""
        # Monotonic timestamp of the last interim request. -inf means "never",
        # so the first interval check always passes.
        self._last_interim_time: float = float("-inf")
        # Buffer length at the last interim request; lets the interim loop
        # skip uploads when no audio arrived since then.
        self._last_interim_audio_len: int = 0

        # Transcript queue for async iteration
        self._transcript_queue: asyncio.Queue[ASRResult] = asyncio.Queue()

        # Background task for interim results
        self._interim_task: Optional[asyncio.Task] = None
        self._running = False

        logger.info(f"OpenAICompatibleASRService initialized with model: {self.model}")

    @staticmethod
    def _resolve_transcriptions_endpoint(api_url: str) -> str:
        """
        Accept either:
        - base URL: https://host/v1
        - full endpoint: https://host/v1/audio/transcriptions
        and always return the final transcriptions endpoint URL.

        An empty or blank value yields the class default ``API_URL``.
        """
        raw = str(api_url or "").strip()
        if not raw:
            return OpenAICompatibleASRService.API_URL

        parsed = urlparse(raw)
        path = (parsed.path or "").rstrip("/")
        if path.endswith("/audio/transcriptions"):
            # Already a full endpoint: return it exactly as given.
            return raw

        # ``path`` has no trailing slash here, so this also covers the
        # empty-path case ("https://host" -> "/audio/transcriptions").
        return urlunparse(parsed._replace(path=f"{path}/audio/transcriptions"))

    async def connect(self) -> None:
        """
        Connect to the service.

        Creates the HTTP session carrying the bearer token. A session left
        open by an earlier ``connect()`` call is closed first so that
        reconnecting does not leak it.

        Raises:
            ValueError: If no API key was configured.
        """
        if not self.api_key:
            raise ValueError("ASR API key not provided. Configure agent.asr.api_key in YAML.")

        if self._session is not None and not self._session.closed:
            await self._session.close()

        self._session = aiohttp.ClientSession(
            headers={
                "Authorization": f"Bearer {self.api_key}"
            }
        )
        self._running = True
        self.state = ServiceState.CONNECTED
        logger.info("OpenAICompatibleASRService connected")

    async def disconnect(self) -> None:
        """Disconnect and cleanup: stop the interim task, close the session, drop buffers."""
        # Cleared first so the interim loop and receive_transcripts() exit.
        self._running = False

        await self.stop_interim_transcription()

        if self._session:
            await self._session.close()
            self._session = None

        self.clear_buffer()
        self.state = ServiceState.DISCONNECTED
        logger.info("OpenAICompatibleASRService disconnected")

    async def send_audio(self, audio: bytes) -> None:
        """
        Buffer incoming audio data.

        Args:
            audio: PCM audio data (16-bit, mono)
        """
        self._audio_buffer += audio

    def _pcm_to_wav(self, pcm: bytes) -> bytes:
        """Wrap raw 16-bit mono PCM in an in-memory WAV container at ``self.sample_rate``."""
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, 'wb') as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(self.sample_rate)
            wav_file.writeframes(pcm)
        return wav_buffer.getvalue()

    async def _publish_transcript(self, text: str, is_final: bool) -> None:
        """Deliver one transcript to the optional callback and to the result queue."""
        if self.on_transcript:
            try:
                await self.on_transcript(text, is_final)
            except Exception as e:
                # A failing listener must not turn a successful transcription
                # into a failure or keep the result out of the queue.
                logger.error(f"ASR on_transcript callback error: {e}")

        await self._transcript_queue.put(
            ASRResult(text=text, is_final=is_final)
        )

    async def transcribe_buffer(self, is_final: bool = False) -> Optional[str]:
        """
        Transcribe current audio buffer.

        The buffer is not cleared; the whole buffer is uploaded on each call.

        Args:
            is_final: Whether this is the final transcription

        Returns:
            Transcribed text (possibly empty) on a successful API call, or
            None if not connected, interim results are disabled, there is
            not enough audio, or the request failed.
        """
        if not self._session:
            logger.warning("ASR session not connected")
            return None

        if not is_final and not self.enable_interim:
            return None

        # Check minimum audio duration
        audio_duration_ms = self.get_audio_duration_ms()

        if not is_final and audio_duration_ms < self.min_audio_for_interim_ms:
            return None

        if audio_duration_ms < 100:  # Less than 100ms - too short
            return None

        # bytes are immutable, so this reference is a stable snapshot even if
        # send_audio() rebinds the buffer while the request is in flight.
        pcm = self._audio_buffer

        try:
            # Convert PCM to WAV in memory and send to API
            form_data = aiohttp.FormData()
            form_data.add_field(
                'file',
                self._pcm_to_wav(pcm),
                filename='audio.wav',
                content_type='audio/wav'
            )
            form_data.add_field('model', self.model)

            async with self._session.post(self.api_url, data=form_data) as response:
                if response.status != 200:
                    error_text = await response.text()
                    logger.error(f"ASR API error {response.status}: {error_text}")
                    return None
                result = await response.json()

            # ``or ""`` tolerates a JSON null for "text".
            text = (result.get("text") or "").strip()
        except Exception as e:
            logger.error(f"ASR transcription error: {e}")
            return None

        if text:
            self._current_text = text
            await self._publish_transcript(text, is_final)

        logger.debug(f"ASR {'final' if is_final else 'interim'}: {text[:50]}...")
        return text

    async def get_final_transcription(self) -> str:
        """
        Get final transcription and clear buffer.

        Call this when EOU is detected.

        Returns:
            Final transcribed text. If the final request yields nothing, the
            most recent stored transcript is returned instead.
        """
        # Transcribe full buffer as final
        text = await self.transcribe_buffer(is_final=True)

        # Pick the result before clear_buffer() wipes the stored text.
        result = text or self._current_text
        self.clear_buffer()

        return result

    def get_and_clear_text(self) -> str:
        """
        Get accumulated text and clear buffer.

        Compatible with BufferedASRService interface.
        """
        text = self._current_text
        self.clear_buffer()
        return text

    def get_audio_buffer(self) -> bytes:
        """Get current audio buffer."""
        return self._audio_buffer

    def get_audio_duration_ms(self) -> float:
        """Get current audio buffer duration in milliseconds."""
        # 2 bytes per sample: the buffer holds 16-bit mono PCM.
        return len(self._audio_buffer) / (self.sample_rate * 2) * 1000

    def clear_buffer(self) -> None:
        """Clear audio and text buffers."""
        self._audio_buffer = b""
        self._current_text = ""

    async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
        """
        Async iterator for transcription results.

        Polls the result queue with a short timeout so the loop notices
        when the service stops running.

        Yields:
            ASRResult with text and is_final flag
        """
        while self._running:
            try:
                result = await asyncio.wait_for(
                    self._transcript_queue.get(),
                    timeout=0.1
                )
                yield result
            except asyncio.TimeoutError:
                continue
            except asyncio.CancelledError:
                break

    async def start_interim_transcription(self) -> None:
        """
        Start background task for interim transcriptions.

        This periodically transcribes buffered audio for
        real-time feedback to the user. Does nothing when interim results
        are disabled or a task is already running.
        """
        if not self.enable_interim:
            return

        if self._interim_task and not self._interim_task.done():
            return

        self._interim_task = asyncio.create_task(self._interim_loop())

    async def stop_interim_transcription(self) -> None:
        """Stop interim transcription task."""
        if self._interim_task:
            self._interim_task.cancel()
            try:
                await self._interim_task
            except asyncio.CancelledError:
                pass
            self._interim_task = None

    async def _interim_loop(self) -> None:
        """Background loop for interim transcriptions."""
        import time

        while self._running:
            try:
                await asyncio.sleep(self.interim_interval_ms / 1000)

                # Monotonic clock: wall-clock adjustments must not stretch or
                # shrink the interval.
                current_time = time.monotonic()
                time_since_last = (current_time - self._last_interim_time) * 1000
                if time_since_last < self.interim_interval_ms:
                    continue

                # Check if we have enough new audio: re-uploading an unchanged
                # buffer would only repeat the previous interim result.
                buffered_len = len(self._audio_buffer)
                if buffered_len == self._last_interim_audio_len:
                    continue

                if self.get_audio_duration_ms() >= self.min_audio_for_interim_ms:
                    await self.transcribe_buffer(is_final=False)
                    self._last_interim_time = current_time
                    self._last_interim_audio_len = buffered_len

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Interim transcription error: {e}")
|
|
|
|
|
|
# Backward-compatible alias
|
|
# Second name bound to the same class object (no subclass or wrapper is
# created), so code referring to SiliconFlowASRService gets
# OpenAICompatibleASRService itself.
SiliconFlowASRService = OpenAICompatibleASRService
|