Add DashScope TTS support
This commit is contained in:
@@ -30,6 +30,7 @@ from processors.eou import EouDetector
|
||||
from processors.vad import SileroVAD, VADProcessor
|
||||
from services.asr import BufferedASRService
|
||||
from services.base import BaseASRService, BaseLLMService, BaseTTSService, LLMMessage, LLMStreamEvent
|
||||
from services.dashscope_tts import DashScopeTTSService
|
||||
from services.llm import MockLLMService, OpenAILLMService
|
||||
from services.openai_compatible_asr import OpenAICompatibleASRService
|
||||
from services.openai_compatible_tts import OpenAICompatibleTTSService
|
||||
@@ -349,6 +350,21 @@ class DuplexPipeline:
|
||||
if not output_mode:
|
||||
output_mode = "audio" if self._tts_output_enabled() else "text"
|
||||
|
||||
tts_model = str(
|
||||
self._runtime_tts.get("model")
|
||||
or settings.tts_model
|
||||
or (self._default_dashscope_tts_model() if self._is_dashscope_tts_provider(tts_provider) else "")
|
||||
)
|
||||
tts_config = {
|
||||
"enabled": self._tts_output_enabled(),
|
||||
"provider": tts_provider,
|
||||
"model": tts_model,
|
||||
"voice": str(self._runtime_tts.get("voice") or settings.tts_voice),
|
||||
"speed": float(self._runtime_tts.get("speed") or settings.tts_speed),
|
||||
}
|
||||
if self._is_dashscope_tts_provider(tts_provider):
|
||||
tts_config["mode"] = self._resolved_dashscope_tts_mode()
|
||||
|
||||
return {
|
||||
"output": {"mode": output_mode},
|
||||
"services": {
|
||||
@@ -363,13 +379,7 @@ class DuplexPipeline:
|
||||
"interimIntervalMs": int(self._runtime_asr.get("interimIntervalMs") or settings.asr_interim_interval_ms),
|
||||
"minAudioMs": int(self._runtime_asr.get("minAudioMs") or settings.asr_min_audio_ms),
|
||||
},
|
||||
"tts": {
|
||||
"enabled": self._tts_output_enabled(),
|
||||
"provider": tts_provider,
|
||||
"model": str(self._runtime_tts.get("model") or settings.tts_model or ""),
|
||||
"voice": str(self._runtime_tts.get("voice") or settings.tts_voice),
|
||||
"speed": float(self._runtime_tts.get("speed") or settings.tts_speed),
|
||||
},
|
||||
"tts": tts_config,
|
||||
},
|
||||
"tools": {
|
||||
"allowlist": self._resolved_tool_allowlist(),
|
||||
@@ -484,6 +494,11 @@ class DuplexPipeline:
|
||||
normalized = str(provider or "").strip().lower()
|
||||
return normalized in {"openai_compatible", "openai-compatible", "siliconflow"}
|
||||
|
||||
@staticmethod
|
||||
def _is_dashscope_tts_provider(provider: Any) -> bool:
|
||||
normalized = str(provider or "").strip().lower()
|
||||
return normalized == "dashscope"
|
||||
|
||||
@staticmethod
|
||||
def _is_llm_provider_supported(provider: Any) -> bool:
|
||||
normalized = str(provider or "").strip().lower()
|
||||
@@ -496,6 +511,28 @@ class DuplexPipeline:
|
||||
return "https://api.siliconflow.cn/v1"
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _default_dashscope_tts_realtime_url() -> str:
|
||||
return "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
|
||||
|
||||
@staticmethod
|
||||
def _default_dashscope_tts_model() -> str:
|
||||
return "qwen3-tts-flash-realtime"
|
||||
|
||||
def _resolved_dashscope_tts_mode(self) -> str:
|
||||
raw_mode = str(self._runtime_tts.get("mode") or settings.tts_mode or "commit").strip().lower()
|
||||
if raw_mode in {"commit", "server_commit"}:
|
||||
return raw_mode
|
||||
return "commit"
|
||||
|
||||
def _use_engine_sentence_split_for_tts(self) -> bool:
|
||||
tts_provider = str(self._runtime_tts.get("provider") or settings.tts_provider).strip().lower()
|
||||
if not self._is_dashscope_tts_provider(tts_provider):
|
||||
return True
|
||||
# DashScope commit mode is client-driven and expects engine-side segmentation.
|
||||
# server_commit mode lets DashScope handle segmentation on appended text.
|
||||
return self._resolved_dashscope_tts_mode() != "server_commit"
|
||||
|
||||
def _tts_output_enabled(self) -> bool:
|
||||
enabled = self._coerce_bool(self._runtime_tts.get("enabled"))
|
||||
if enabled is not None:
|
||||
@@ -610,8 +647,26 @@ class DuplexPipeline:
|
||||
tts_voice = self._runtime_tts.get("voice") or settings.tts_voice
|
||||
tts_model = self._runtime_tts.get("model") or settings.tts_model
|
||||
tts_speed = float(self._runtime_tts.get("speed") or settings.tts_speed)
|
||||
tts_mode = self._resolved_dashscope_tts_mode()
|
||||
runtime_mode = str(self._runtime_tts.get("mode") or "").strip()
|
||||
if runtime_mode and not self._is_dashscope_tts_provider(tts_provider):
|
||||
logger.warning(
|
||||
"services.tts.mode is DashScope-only and will be ignored "
|
||||
f"for provider={tts_provider}"
|
||||
)
|
||||
|
||||
if self._is_openai_compatible_provider(tts_provider) and tts_api_key:
|
||||
if self._is_dashscope_tts_provider(tts_provider) and tts_api_key:
|
||||
self.tts_service = DashScopeTTSService(
|
||||
api_key=tts_api_key,
|
||||
api_url=tts_api_url or self._default_dashscope_tts_realtime_url(),
|
||||
voice=tts_voice,
|
||||
model=tts_model or self._default_dashscope_tts_model(),
|
||||
mode=str(tts_mode),
|
||||
sample_rate=settings.sample_rate,
|
||||
speed=tts_speed
|
||||
)
|
||||
logger.info("Using DashScope realtime TTS service")
|
||||
elif self._is_openai_compatible_provider(tts_provider) and tts_api_key:
|
||||
self.tts_service = OpenAICompatibleTTSService(
|
||||
api_key=tts_api_key,
|
||||
api_url=tts_api_url,
|
||||
@@ -1379,6 +1434,7 @@ class DuplexPipeline:
|
||||
round_response = ""
|
||||
tool_calls: List[Dict[str, Any]] = []
|
||||
allow_text_output = True
|
||||
use_engine_sentence_split = self._use_engine_sentence_split_for_tts()
|
||||
|
||||
async for raw_event in self.llm_service.generate_stream(messages):
|
||||
if self._interrupt_event.is_set():
|
||||
@@ -1446,52 +1502,56 @@ class DuplexPipeline:
|
||||
):
|
||||
await self._flush_pending_llm_delta()
|
||||
|
||||
while True:
|
||||
split_result = extract_tts_sentence(
|
||||
sentence_buffer,
|
||||
end_chars=self._SENTENCE_END_CHARS,
|
||||
trailing_chars=self._SENTENCE_TRAILING_CHARS,
|
||||
closers=self._SENTENCE_CLOSERS,
|
||||
min_split_spoken_chars=self._MIN_SPLIT_SPOKEN_CHARS,
|
||||
hold_trailing_at_buffer_end=True,
|
||||
force=False,
|
||||
)
|
||||
if not split_result:
|
||||
break
|
||||
sentence, sentence_buffer = split_result
|
||||
if not sentence:
|
||||
continue
|
||||
|
||||
sentence = f"{pending_punctuation}{sentence}".strip()
|
||||
pending_punctuation = ""
|
||||
if not sentence:
|
||||
continue
|
||||
|
||||
if not has_spoken_content(sentence):
|
||||
pending_punctuation = sentence
|
||||
continue
|
||||
|
||||
if self._tts_output_enabled() and not self._interrupt_event.is_set():
|
||||
if not first_audio_sent:
|
||||
self._start_tts()
|
||||
await self._send_event(
|
||||
{
|
||||
**ev(
|
||||
"output.audio.start",
|
||||
trackId=self.track_audio_out,
|
||||
)
|
||||
},
|
||||
priority=10,
|
||||
)
|
||||
first_audio_sent = True
|
||||
|
||||
await self._speak_sentence(
|
||||
sentence,
|
||||
fade_in_ms=0,
|
||||
fade_out_ms=8,
|
||||
if use_engine_sentence_split:
|
||||
while True:
|
||||
split_result = extract_tts_sentence(
|
||||
sentence_buffer,
|
||||
end_chars=self._SENTENCE_END_CHARS,
|
||||
trailing_chars=self._SENTENCE_TRAILING_CHARS,
|
||||
closers=self._SENTENCE_CLOSERS,
|
||||
min_split_spoken_chars=self._MIN_SPLIT_SPOKEN_CHARS,
|
||||
hold_trailing_at_buffer_end=True,
|
||||
force=False,
|
||||
)
|
||||
if not split_result:
|
||||
break
|
||||
sentence, sentence_buffer = split_result
|
||||
if not sentence:
|
||||
continue
|
||||
|
||||
remaining_text = f"{pending_punctuation}{sentence_buffer}".strip()
|
||||
sentence = f"{pending_punctuation}{sentence}".strip()
|
||||
pending_punctuation = ""
|
||||
if not sentence:
|
||||
continue
|
||||
|
||||
if not has_spoken_content(sentence):
|
||||
pending_punctuation = sentence
|
||||
continue
|
||||
|
||||
if self._tts_output_enabled() and not self._interrupt_event.is_set():
|
||||
if not first_audio_sent:
|
||||
self._start_tts()
|
||||
await self._send_event(
|
||||
{
|
||||
**ev(
|
||||
"output.audio.start",
|
||||
trackId=self.track_audio_out,
|
||||
)
|
||||
},
|
||||
priority=10,
|
||||
)
|
||||
first_audio_sent = True
|
||||
|
||||
await self._speak_sentence(
|
||||
sentence,
|
||||
fade_in_ms=0,
|
||||
fade_out_ms=8,
|
||||
)
|
||||
|
||||
if use_engine_sentence_split:
|
||||
remaining_text = f"{pending_punctuation}{sentence_buffer}".strip()
|
||||
else:
|
||||
remaining_text = sentence_buffer.strip()
|
||||
await self._flush_pending_llm_delta()
|
||||
if (
|
||||
self._tts_output_enabled()
|
||||
|
||||
Reference in New Issue
Block a user