Add Volcengine support for TTS and ASR services

- Introduced Volcengine as a new provider for both TTS and ASR services.
- Updated configuration files to include Volcengine-specific parameters such as app_id, resource_id, and uid.
- Enhanced the ASR service to support streaming mode with Volcengine's API.
- Modified existing tests to validate the integration of Volcengine services.
- Updated documentation to reflect the addition of Volcengine as a supported provider for TTS and ASR.
- Refactored service factory to accommodate Volcengine alongside existing providers.
2026-03-08 23:09:50 +08:00
parent 3604db21eb
commit aeeeee20d1
18 changed files with 1256 additions and 12 deletions
--- a/engine/providers/factory/default.py
+++ b/engine/providers/factory/default.py
@@ -17,14 +17,17 @@ from runtime.ports import (
 )
 from providers.asr.buffered import BufferedASRService
 from providers.asr.dashscope import DashScopeRealtimeASRService
+from providers.asr.volcengine import VolcengineRealtimeASRService
 from providers.tts.dashscope import DashScopeTTSService
 from providers.llm.openai import MockLLMService, OpenAILLMService
 from providers.asr.openai_compatible import OpenAICompatibleASRService
 from providers.tts.openai_compatible import OpenAICompatibleTTSService
 from providers.tts.mock import MockTTSService
+from providers.tts.volcengine import VolcengineTTSService

 _OPENAI_COMPATIBLE_PROVIDERS = {"openai_compatible", "openai-compatible", "siliconflow"}
 _DASHSCOPE_PROVIDERS = {"dashscope"}
+_VOLCENGINE_PROVIDERS = {"volcengine"}
 _SUPPORTED_LLM_PROVIDERS = {"openai", *_OPENAI_COMPATIBLE_PROVIDERS}


@@ -37,6 +40,10 @@ class DefaultRealtimeServiceFactory(RealtimeServiceFactory):
    _DEFAULT_DASHSCOPE_ASR_MODEL = "qwen3-asr-flash-realtime"
    _DEFAULT_OPENAI_COMPATIBLE_TTS_MODEL = "FunAudioLLM/CosyVoice2-0.5B"
    _DEFAULT_OPENAI_COMPATIBLE_ASR_MODEL = "FunAudioLLM/SenseVoiceSmall"
+    _DEFAULT_VOLCENGINE_TTS_URL = "https://openspeech.bytedance.com/api/v3/tts/unidirectional"
+    _DEFAULT_VOLCENGINE_TTS_RESOURCE_ID = "seed-tts-2.0"
+    _DEFAULT_VOLCENGINE_ASR_REALTIME_URL = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel"
+    _DEFAULT_VOLCENGINE_ASR_MODEL = "bigmodel"

    @staticmethod
    def _normalize_provider(provider: Any) -> str:
@@ -81,6 +88,19 @@ class DefaultRealtimeServiceFactory(RealtimeServiceFactory):
                speed=spec.speed,
            )

+        if provider in _VOLCENGINE_PROVIDERS and spec.api_key:
+            return VolcengineTTSService(
+                api_key=spec.api_key,
+                api_url=spec.api_url or self._DEFAULT_VOLCENGINE_TTS_URL,
+                voice=spec.voice,
+                model=spec.model,
+                app_id=spec.app_id,
+                resource_id=spec.resource_id or self._DEFAULT_VOLCENGINE_TTS_RESOURCE_ID,
+                uid=spec.uid,
+                sample_rate=spec.sample_rate,
+                speed=spec.speed,
+            )
+
        if provider in _OPENAI_COMPATIBLE_PROVIDERS and spec.api_key:
            return OpenAICompatibleTTSService(
                api_key=spec.api_key,
@@ -110,6 +130,20 @@ class DefaultRealtimeServiceFactory(RealtimeServiceFactory):
                on_transcript=spec.on_transcript,
            )

+        if provider in _VOLCENGINE_PROVIDERS and spec.api_key:
+            return VolcengineRealtimeASRService(
+                api_key=spec.api_key,
+                api_url=spec.api_url or self._DEFAULT_VOLCENGINE_ASR_REALTIME_URL,
+                model=spec.model or self._DEFAULT_VOLCENGINE_ASR_MODEL,
+                sample_rate=spec.sample_rate,
+                language=spec.language,
+                app_id=spec.app_id,
+                resource_id=spec.resource_id,
+                uid=spec.uid,
+                request_params=spec.request_params,
+                on_transcript=spec.on_transcript,
+            )
+
        if provider in _OPENAI_COMPATIBLE_PROVIDERS and spec.api_key:
            return OpenAICompatibleASRService(
                api_key=spec.api_key,