add dashscope tts

Xin Wang
2026-02-26 03:02:48 +08:00
parent 6744646390
commit 562341a72c
9 changed files with 542 additions and 59 deletions


@@ -30,6 +30,7 @@ from processors.eou import EouDetector
from processors.vad import SileroVAD, VADProcessor
from services.asr import BufferedASRService
from services.base import BaseASRService, BaseLLMService, BaseTTSService, LLMMessage, LLMStreamEvent
from services.dashscope_tts import DashScopeTTSService
from services.llm import MockLLMService, OpenAILLMService
from services.openai_compatible_asr import OpenAICompatibleASRService
from services.openai_compatible_tts import OpenAICompatibleTTSService
@@ -349,6 +350,21 @@ class DuplexPipeline:
if not output_mode:
output_mode = "audio" if self._tts_output_enabled() else "text"
tts_model = str(
self._runtime_tts.get("model")
or settings.tts_model
or (self._default_dashscope_tts_model() if self._is_dashscope_tts_provider(tts_provider) else "")
)
tts_config = {
"enabled": self._tts_output_enabled(),
"provider": tts_provider,
"model": tts_model,
"voice": str(self._runtime_tts.get("voice") or settings.tts_voice),
"speed": float(self._runtime_tts.get("speed") or settings.tts_speed),
}
if self._is_dashscope_tts_provider(tts_provider):
tts_config["mode"] = self._resolved_dashscope_tts_mode()
return {
"output": {"mode": output_mode},
"services": {
@@ -363,13 +379,7 @@ class DuplexPipeline:
"interimIntervalMs": int(self._runtime_asr.get("interimIntervalMs") or settings.asr_interim_interval_ms),
"minAudioMs": int(self._runtime_asr.get("minAudioMs") or settings.asr_min_audio_ms),
},
"tts": {
"enabled": self._tts_output_enabled(),
"provider": tts_provider,
"model": str(self._runtime_tts.get("model") or settings.tts_model or ""),
"voice": str(self._runtime_tts.get("voice") or settings.tts_voice),
"speed": float(self._runtime_tts.get("speed") or settings.tts_speed),
},
"tts": tts_config,
},
"tools": {
"allowlist": self._resolved_tool_allowlist(),
@@ -484,6 +494,11 @@ class DuplexPipeline:
normalized = str(provider or "").strip().lower()
return normalized in {"openai_compatible", "openai-compatible", "siliconflow"}
@staticmethod
def _is_dashscope_tts_provider(provider: Any) -> bool:
normalized = str(provider or "").strip().lower()
return normalized == "dashscope"
@staticmethod
def _is_llm_provider_supported(provider: Any) -> bool:
normalized = str(provider or "").strip().lower()
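
The provider matching above is case- and whitespace-insensitive; as a rough behavior sketch (not a test from the repo):

# "dashscope", "DashScope", " dashscope "              -> DashScope TTS path
# "openai_compatible", "openai-compatible",
# "siliconflow"                                         -> OpenAI-compatible TTS path
# anything else                                         -> neither helper matches
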
@@ -496,6 +511,28 @@ class DuplexPipeline:
return "https://api.siliconflow.cn/v1"
return None
@staticmethod
def _default_dashscope_tts_realtime_url() -> str:
return "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
@staticmethod
def _default_dashscope_tts_model() -> str:
return "qwen3-tts-flash-realtime"
def _resolved_dashscope_tts_mode(self) -> str:
raw_mode = str(self._runtime_tts.get("mode") or settings.tts_mode or "commit").strip().lower()
if raw_mode in {"commit", "server_commit"}:
return raw_mode
return "commit"
def _use_engine_sentence_split_for_tts(self) -> bool:
tts_provider = str(self._runtime_tts.get("provider") or settings.tts_provider).strip().lower()
if not self._is_dashscope_tts_provider(tts_provider):
return True
# DashScope commit mode is client-driven and expects engine-side segmentation.
# server_commit mode lets DashScope handle segmentation on appended text.
return self._resolved_dashscope_tts_mode() != "server_commit"
def _tts_output_enabled(self) -> bool:
enabled = self._coerce_bool(self._runtime_tts.get("enabled"))
if enabled is not None:
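
Putting the helpers above together, the intended segmentation behavior (a sketch of intent, not a test from the repository) is:

# provider           tts.mode          engine-side sentence split?
# non-DashScope      (ignored)         yes
# dashscope          "commit"          yes  -- client drives each commit
# dashscope          "server_commit"   no   -- DashScope segments appended text
# dashscope          other/unset       yes  -- _resolved_dashscope_tts_mode() falls back to "commit"
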
@@ -610,8 +647,26 @@ class DuplexPipeline:
tts_voice = self._runtime_tts.get("voice") or settings.tts_voice
tts_model = self._runtime_tts.get("model") or settings.tts_model
tts_speed = float(self._runtime_tts.get("speed") or settings.tts_speed)
tts_mode = self._resolved_dashscope_tts_mode()
runtime_mode = str(self._runtime_tts.get("mode") or "").strip()
if runtime_mode and not self._is_dashscope_tts_provider(tts_provider):
logger.warning(
"services.tts.mode is DashScope-only and will be ignored "
f"for provider={tts_provider}"
)
if self._is_openai_compatible_provider(tts_provider) and tts_api_key:
if self._is_dashscope_tts_provider(tts_provider) and tts_api_key:
self.tts_service = DashScopeTTSService(
api_key=tts_api_key,
api_url=tts_api_url or self._default_dashscope_tts_realtime_url(),
voice=tts_voice,
model=tts_model or self._default_dashscope_tts_model(),
mode=str(tts_mode),
sample_rate=settings.sample_rate,
speed=tts_speed
)
logger.info("Using DashScope realtime TTS service")
elif self._is_openai_compatible_provider(tts_provider) and tts_api_key:
self.tts_service = OpenAICompatibleTTSService(
api_key=tts_api_key,
api_url=tts_api_url,
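
Outside the pipeline, the new service could be constructed on its own with the same keyword arguments shown in this hunk; the values below are placeholders, and the streaming interface itself comes from BaseTTSService, which this diff does not change.

import os

from services.dashscope_tts import DashScopeTTSService

tts = DashScopeTTSService(
    api_key=os.environ["DASHSCOPE_API_KEY"],
    api_url="wss://dashscope.aliyuncs.com/api-ws/v1/realtime",  # default realtime endpoint above
    voice="Cherry",                    # placeholder voice
    model="qwen3-tts-flash-realtime",
    mode="commit",                     # or "server_commit"
    sample_rate=24000,                 # placeholder; the pipeline passes settings.sample_rate
    speed=1.0,
)
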
@@ -1379,6 +1434,7 @@ class DuplexPipeline:
round_response = ""
tool_calls: List[Dict[str, Any]] = []
allow_text_output = True
use_engine_sentence_split = self._use_engine_sentence_split_for_tts()
async for raw_event in self.llm_service.generate_stream(messages):
if self._interrupt_event.is_set():
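
The flag read here gates the rewritten loop in the next hunk; roughly (paraphrasing, not verbatim from the file):

# if use_engine_sentence_split:        # non-DashScope, or DashScope "commit" mode
#     repeatedly extract_tts_sentence(...) from sentence_buffer and speak each piece;
#     remaining_text = pending punctuation + the unsplit tail
# else:                                # DashScope "server_commit" mode
#     skip engine-side splitting; remaining_text = sentence_buffer.strip(),
#     and DashScope segments the appended text on the server side
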
@@ -1446,52 +1502,56 @@ class DuplexPipeline:
):
await self._flush_pending_llm_delta()
while True:
split_result = extract_tts_sentence(
sentence_buffer,
end_chars=self._SENTENCE_END_CHARS,
trailing_chars=self._SENTENCE_TRAILING_CHARS,
closers=self._SENTENCE_CLOSERS,
min_split_spoken_chars=self._MIN_SPLIT_SPOKEN_CHARS,
hold_trailing_at_buffer_end=True,
force=False,
)
if not split_result:
break
sentence, sentence_buffer = split_result
if not sentence:
continue
sentence = f"{pending_punctuation}{sentence}".strip()
pending_punctuation = ""
if not sentence:
continue
if not has_spoken_content(sentence):
pending_punctuation = sentence
continue
if self._tts_output_enabled() and not self._interrupt_event.is_set():
if not first_audio_sent:
self._start_tts()
await self._send_event(
{
**ev(
"output.audio.start",
trackId=self.track_audio_out,
)
},
priority=10,
)
first_audio_sent = True
await self._speak_sentence(
sentence,
fade_in_ms=0,
fade_out_ms=8,
if use_engine_sentence_split:
while True:
split_result = extract_tts_sentence(
sentence_buffer,
end_chars=self._SENTENCE_END_CHARS,
trailing_chars=self._SENTENCE_TRAILING_CHARS,
closers=self._SENTENCE_CLOSERS,
min_split_spoken_chars=self._MIN_SPLIT_SPOKEN_CHARS,
hold_trailing_at_buffer_end=True,
force=False,
)
if not split_result:
break
sentence, sentence_buffer = split_result
if not sentence:
continue
remaining_text = f"{pending_punctuation}{sentence_buffer}".strip()
sentence = f"{pending_punctuation}{sentence}".strip()
pending_punctuation = ""
if not sentence:
continue
if not has_spoken_content(sentence):
pending_punctuation = sentence
continue
if self._tts_output_enabled() and not self._interrupt_event.is_set():
if not first_audio_sent:
self._start_tts()
await self._send_event(
{
**ev(
"output.audio.start",
trackId=self.track_audio_out,
)
},
priority=10,
)
first_audio_sent = True
await self._speak_sentence(
sentence,
fade_in_ms=0,
fade_out_ms=8,
)
if use_engine_sentence_split:
remaining_text = f"{pending_punctuation}{sentence_buffer}".strip()
else:
remaining_text = sentence_buffer.strip()
await self._flush_pending_llm_delta()
if (
self._tts_output_enabled()