Add Volcengine support for TTS and ASR services

- Introduced Volcengine as a new provider for both TTS and ASR services.
- Updated configuration files to include Volcengine-specific parameters such as app_id, resource_id, and uid.
- Enhanced the ASR service to support streaming mode with Volcengine's API.
- Modified existing tests to validate the integration of Volcengine services.
- Updated documentation to reflect the addition of Volcengine as a supported provider for TTS and ASR.
- Refactored service factory to accommodate Volcengine alongside existing providers.
This commit is contained in:
Xin Wang
2026-03-08 23:09:50 +08:00
parent 3604db21eb
commit aeeeee20d1
18 changed files with 1256 additions and 12 deletions

View File

@@ -17,14 +17,17 @@ from runtime.ports import (
)
from providers.asr.buffered import BufferedASRService
from providers.asr.dashscope import DashScopeRealtimeASRService
from providers.asr.volcengine import VolcengineRealtimeASRService
from providers.tts.dashscope import DashScopeTTSService
from providers.llm.openai import MockLLMService, OpenAILLMService
from providers.asr.openai_compatible import OpenAICompatibleASRService
from providers.tts.openai_compatible import OpenAICompatibleTTSService
from providers.tts.mock import MockTTSService
from providers.tts.volcengine import VolcengineTTSService
# Provider names that select the OpenAI-compatible TTS/ASR services when an API key is set.
_OPENAI_COMPATIBLE_PROVIDERS = {"openai_compatible", "openai-compatible", "siliconflow"}
# Provider names for DashScope.
# NOTE(review): presumably selects the DashScope services; the check is not in this excerpt — confirm.
_DASHSCOPE_PROVIDERS = {"dashscope"}
# Provider names that select the Volcengine TTS/ASR services when an API key is set.
_VOLCENGINE_PROVIDERS = {"volcengine"}
# LLM provider names: "openai" plus every OpenAI-compatible alias listed above.
_SUPPORTED_LLM_PROVIDERS = {"openai", *_OPENAI_COMPATIBLE_PROVIDERS}
@@ -37,6 +40,10 @@ class DefaultRealtimeServiceFactory(RealtimeServiceFactory):
# Default model names for DashScope ASR and the OpenAI-compatible TTS/ASR services.
# NOTE(review): their use sites are outside this excerpt — presumably fallbacks when
# the spec gives no model; confirm.
_DEFAULT_DASHSCOPE_ASR_MODEL = "qwen3-asr-flash-realtime"
_DEFAULT_OPENAI_COMPATIBLE_TTS_MODEL = "FunAudioLLM/CosyVoice2-0.5B"
_DEFAULT_OPENAI_COMPATIBLE_ASR_MODEL = "FunAudioLLM/SenseVoiceSmall"
# Volcengine TTS endpoint, used when spec.api_url is falsy.
_DEFAULT_VOLCENGINE_TTS_URL = "https://openspeech.bytedance.com/api/v3/tts/unidirectional"
# Volcengine TTS resource id, used when spec.resource_id is falsy.
_DEFAULT_VOLCENGINE_TTS_RESOURCE_ID = "seed-tts-2.0"
# Volcengine realtime ASR WebSocket endpoint, used when spec.api_url is falsy.
_DEFAULT_VOLCENGINE_ASR_REALTIME_URL = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel"
# Volcengine ASR model name, used when spec.model is falsy.
_DEFAULT_VOLCENGINE_ASR_MODEL = "bigmodel"
@staticmethod
def _normalize_provider(provider: Any) -> str:
@@ -81,6 +88,19 @@ class DefaultRealtimeServiceFactory(RealtimeServiceFactory):
speed=spec.speed,
)
if provider in _VOLCENGINE_PROVIDERS and spec.api_key:
return VolcengineTTSService(
api_key=spec.api_key,
api_url=spec.api_url or self._DEFAULT_VOLCENGINE_TTS_URL,
voice=spec.voice,
model=spec.model,
app_id=spec.app_id,
resource_id=spec.resource_id or self._DEFAULT_VOLCENGINE_TTS_RESOURCE_ID,
uid=spec.uid,
sample_rate=spec.sample_rate,
speed=spec.speed,
)
if provider in _OPENAI_COMPATIBLE_PROVIDERS and spec.api_key:
return OpenAICompatibleTTSService(
api_key=spec.api_key,
@@ -110,6 +130,20 @@ class DefaultRealtimeServiceFactory(RealtimeServiceFactory):
on_transcript=spec.on_transcript,
)
if provider in _VOLCENGINE_PROVIDERS and spec.api_key:
return VolcengineRealtimeASRService(
api_key=spec.api_key,
api_url=spec.api_url or self._DEFAULT_VOLCENGINE_ASR_REALTIME_URL,
model=spec.model or self._DEFAULT_VOLCENGINE_ASR_MODEL,
sample_rate=spec.sample_rate,
language=spec.language,
app_id=spec.app_id,
resource_id=spec.resource_id,
uid=spec.uid,
request_params=spec.request_params,
on_transcript=spec.on_transcript,
)
if provider in _OPENAI_COMPATIBLE_PROVIDERS and spec.api_key:
return OpenAICompatibleASRService(
api_key=spec.api_key,