Add Volcengine support for TTS and ASR services

- Introduced Volcengine as a new provider for both TTS and ASR services.
- Updated configuration files to include Volcengine-specific parameters such as app_id, resource_id, and uid.
- Enhanced the ASR service to support streaming mode with Volcengine's API.
- Modified existing tests to validate the integration of Volcengine services.
- Updated documentation to reflect the addition of Volcengine as a supported provider for TTS and ASR.
- Refactored service factory to accommodate Volcengine alongside existing providers.
This commit is contained in:
Xin Wang
2026-03-08 23:09:50 +08:00
parent 3604db21eb
commit aeeeee20d1
18 changed files with 1256 additions and 12 deletions

View File

@@ -793,6 +793,23 @@ class DuplexPipeline:
return False
return None
@staticmethod
def _coerce_json_object(value: Any) -> Optional[Dict[str, Any]]:
    """Normalize a config value into a JSON object (dict), or ``None``.

    Args:
        value: Either a dict, or a string holding JSON text. Any other
            type is treated as "no value".

    Returns:
        A shallow copy of ``value`` when it is already a dict; the decoded
        dict when ``value`` is a string containing a JSON object; ``None``
        for blank strings, undecodable JSON, JSON that decodes to something
        other than an object, and non-dict/non-str inputs.
    """
    if isinstance(value, dict):
        # Copy so the caller's mapping is never shared with the result.
        return dict(value)
    if not isinstance(value, str):
        return None

    text = value.strip()
    if not text:
        return None

    try:
        decoded = json.loads(text)
    except json.JSONDecodeError:
        # Only the first 120 characters are logged to keep the entry short.
        logger.warning("Ignoring invalid JSON object config: {}", text[:120])
        return None

    # Valid JSON that is not an object (list, number, ...) is rejected.
    return decoded if isinstance(decoded, dict) else None
@staticmethod
def _is_dashscope_tts_provider(provider: Any) -> bool:
normalized = str(provider or "").strip().lower()
@@ -804,7 +821,7 @@ class DuplexPipeline:
if normalized_mode in {"offline", "streaming"}:
return normalized_mode # type: ignore[return-value]
normalized_provider = str(provider or "").strip().lower()
if normalized_provider == "dashscope":
if normalized_provider in {"dashscope", "volcengine"}:
return "streaming"
return "offline"
@@ -963,6 +980,10 @@ class DuplexPipeline:
tts_api_url = self._runtime_tts.get("baseUrl") or settings.tts_api_url
tts_voice = self._runtime_tts.get("voice") or settings.tts_voice
tts_model = self._runtime_tts.get("model") or settings.tts_model
tts_app_id = self._runtime_tts.get("appId") or settings.tts_app_id
tts_resource_id = self._runtime_tts.get("resourceId") or settings.tts_resource_id
tts_cluster = self._runtime_tts.get("cluster") or settings.tts_cluster
tts_uid = self._runtime_tts.get("uid") or settings.tts_uid
tts_speed = float(self._runtime_tts.get("speed") or settings.tts_speed)
tts_mode = self._resolved_dashscope_tts_mode()
runtime_mode = str(self._runtime_tts.get("mode") or "").strip()
@@ -978,6 +999,10 @@ class DuplexPipeline:
api_url=str(tts_api_url).strip() if tts_api_url else None,
voice=str(tts_voice),
model=str(tts_model).strip() if tts_model else None,
app_id=str(tts_app_id).strip() if tts_app_id else None,
resource_id=str(tts_resource_id).strip() if tts_resource_id else None,
cluster=str(tts_cluster).strip() if tts_cluster else None,
uid=str(tts_uid).strip() if tts_uid else None,
sample_rate=settings.sample_rate,
speed=tts_speed,
mode=str(tts_mode),
@@ -1006,6 +1031,13 @@ class DuplexPipeline:
asr_api_key = self._runtime_asr.get("apiKey")
asr_api_url = self._runtime_asr.get("baseUrl") or settings.asr_api_url
asr_model = self._runtime_asr.get("model") or settings.asr_model
asr_app_id = self._runtime_asr.get("appId") or settings.asr_app_id
asr_resource_id = self._runtime_asr.get("resourceId") or settings.asr_resource_id
asr_cluster = self._runtime_asr.get("cluster") or settings.asr_cluster
asr_uid = self._runtime_asr.get("uid") or settings.asr_uid
asr_request_params = self._coerce_json_object(self._runtime_asr.get("requestParams"))
if asr_request_params is None:
asr_request_params = self._coerce_json_object(settings.asr_request_params_json)
asr_enable_interim = self._coerce_bool(self._runtime_asr.get("enableInterim"))
if asr_enable_interim is None:
asr_enable_interim = bool(settings.asr_enable_interim)
@@ -1022,6 +1054,11 @@ class DuplexPipeline:
api_key=str(asr_api_key).strip() if asr_api_key else None,
api_url=str(asr_api_url).strip() if asr_api_url else None,
model=str(asr_model).strip() if asr_model else None,
app_id=str(asr_app_id).strip() if asr_app_id else None,
resource_id=str(asr_resource_id).strip() if asr_resource_id else None,
cluster=str(asr_cluster).strip() if asr_cluster else None,
uid=str(asr_uid).strip() if asr_uid else None,
request_params=asr_request_params,
enable_interim=asr_enable_interim,
interim_interval_ms=asr_interim_interval,
min_audio_for_interim_ms=asr_min_audio_ms,

View File

@@ -3,7 +3,7 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import AsyncIterator, Awaitable, Callable, Literal, Optional, Protocol
from typing import Any, AsyncIterator, Awaitable, Callable, Dict, Literal, Optional, Protocol
from providers.common.base import ASRResult
@@ -22,6 +22,11 @@ class ASRServiceSpec:
api_key: Optional[str] = None
api_url: Optional[str] = None
model: Optional[str] = None
app_id: Optional[str] = None
resource_id: Optional[str] = None
cluster: Optional[str] = None
uid: Optional[str] = None
request_params: Optional[Dict[str, Any]] = None
enable_interim: bool = False
interim_interval_ms: int = 500
min_audio_for_interim_ms: int = 300

View File

@@ -19,6 +19,10 @@ class TTSServiceSpec:
api_key: Optional[str] = None
api_url: Optional[str] = None
model: Optional[str] = None
app_id: Optional[str] = None
resource_id: Optional[str] = None
cluster: Optional[str] = None
uid: Optional[str] = None
mode: str = "commit"