"""Volcengine TTS service. Uses Volcengine's unidirectional HTTP streaming TTS API and adapts streamed base64 audio chunks into engine-native ``TTSChunk`` events. """ from __future__ import annotations import asyncio import base64 import codecs import json import os import uuid from typing import Any, AsyncIterator, Optional import aiohttp from loguru import logger from providers.common.base import BaseTTSService, ServiceState, TTSChunk class VolcengineTTSService(BaseTTSService): """Streaming TTS adapter for Volcengine's HTTP v3 API.""" DEFAULT_API_URL = "https://openspeech.bytedance.com/api/v3/tts/unidirectional" DEFAULT_RESOURCE_ID = "seed-tts-2.0" def __init__( self, api_key: Optional[str] = None, api_url: Optional[str] = None, voice: str = "zh_female_shuangkuaisisi_moon_bigtts", model: Optional[str] = None, app_id: Optional[str] = None, resource_id: Optional[str] = None, uid: Optional[str] = None, sample_rate: int = 16000, speed: float = 1.0, ) -> None: super().__init__(voice=voice, sample_rate=sample_rate, speed=speed) self.api_key = api_key or os.getenv("VOLCENGINE_TTS_API_KEY") or os.getenv("TTS_API_KEY") self.api_url = api_url or os.getenv("VOLCENGINE_TTS_API_URL") or self.DEFAULT_API_URL self.model = str(model or os.getenv("VOLCENGINE_TTS_MODEL") or "").strip() or None self.app_id = app_id or os.getenv("VOLCENGINE_TTS_APP_ID") or os.getenv("TTS_APP_ID") self.resource_id = resource_id or os.getenv("VOLCENGINE_TTS_RESOURCE_ID") or self.DEFAULT_RESOURCE_ID self.uid = uid or os.getenv("VOLCENGINE_TTS_UID") self._session: Optional[aiohttp.ClientSession] = None self._cancel_event = asyncio.Event() self._synthesis_lock = asyncio.Lock() self._pending_audio: list[bytes] = [] async def connect(self) -> None: if not self.api_key: raise ValueError("Volcengine TTS API key not provided. Configure agent.tts.api_key in YAML.") if not self.app_id: raise ValueError("Volcengine TTS app_id not provided. Configure agent.tts.app_id in YAML.") timeout = aiohttp.ClientTimeout(total=None, sock_read=None, sock_connect=15) self._session = aiohttp.ClientSession(timeout=timeout) self.state = ServiceState.CONNECTED logger.info( "Volcengine TTS service ready: speaker={}, sample_rate={}, resource_id={}", self.voice, self.sample_rate, self.resource_id, ) async def disconnect(self) -> None: self._cancel_event.set() if self._session is not None: await self._session.close() self._session = None self.state = ServiceState.DISCONNECTED logger.info("Volcengine TTS service disconnected") async def synthesize(self, text: str) -> bytes: audio = b"" async for chunk in self.synthesize_stream(text): audio += chunk.audio return audio async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]: if not self._session: raise RuntimeError("Volcengine TTS service not connected") if not text.strip(): return async with self._synthesis_lock: self._cancel_event.clear() headers = { "Content-Type": "application/json", "X-Api-App-Key": str(self.app_id), "X-Api-Access-Key": str(self.api_key), "X-Api-Resource-Id": str(self.resource_id), "X-Api-Request-Id": str(uuid.uuid4()), } payload = { "user": { "uid": str(self.uid or self.app_id), }, "req_params": { "text": text, "speaker": self.voice, "audio_params": { "format": "pcm", "sample_rate": self.sample_rate, "speech_rate": self._speech_rate_percent(self.speed), }, }, } if self.model: payload["req_params"]["model"] = self.model chunk_size = max(1, self.sample_rate * 2 // 10) audio_buffer = b"" pending_chunk: Optional[bytes] = None try: async with self._session.post(self.api_url, headers=headers, json=payload) as response: if response.status != 200: error_text = await response.text() raise RuntimeError(f"Volcengine TTS error {response.status}: {error_text}") async for audio_bytes in self._iter_audio_bytes(response): if self._cancel_event.is_set(): logger.info("Volcengine TTS synthesis cancelled") return audio_buffer += audio_bytes while len(audio_buffer) >= chunk_size: emitted = audio_buffer[:chunk_size] audio_buffer = audio_buffer[chunk_size:] if pending_chunk is not None: yield TTSChunk(audio=pending_chunk, sample_rate=self.sample_rate, is_final=False) pending_chunk = emitted if self._cancel_event.is_set(): return if pending_chunk is not None: if audio_buffer: yield TTSChunk(audio=pending_chunk, sample_rate=self.sample_rate, is_final=False) pending_chunk = None else: yield TTSChunk(audio=pending_chunk, sample_rate=self.sample_rate, is_final=True) pending_chunk = None if audio_buffer: yield TTSChunk(audio=audio_buffer, sample_rate=self.sample_rate, is_final=True) except asyncio.CancelledError: logger.info("Volcengine TTS synthesis cancelled via asyncio") raise except Exception as exc: logger.error("Volcengine TTS synthesis error: {}", exc) raise async def cancel(self) -> None: self._cancel_event.set() async def _iter_audio_bytes(self, response: aiohttp.ClientResponse) -> AsyncIterator[bytes]: decoder = json.JSONDecoder() utf8_decoder = codecs.getincrementaldecoder("utf-8")() text_buffer = "" self._pending_audio.clear() async for raw_chunk in response.content.iter_any(): text_buffer += utf8_decoder.decode(raw_chunk) text_buffer = self._yield_audio_payloads(decoder, text_buffer) while self._pending_audio: yield self._pending_audio.pop(0) text_buffer += utf8_decoder.decode(b"", final=True) text_buffer = self._yield_audio_payloads(decoder, text_buffer) while self._pending_audio: yield self._pending_audio.pop(0) def _yield_audio_payloads(self, decoder: json.JSONDecoder, text_buffer: str) -> str: while True: stripped = text_buffer.lstrip() if not stripped: return "" if len(stripped) != len(text_buffer): text_buffer = stripped try: payload, idx = decoder.raw_decode(text_buffer) except json.JSONDecodeError: return text_buffer text_buffer = text_buffer[idx:] audio = self._extract_audio_bytes(payload) if audio: self._pending_audio.append(audio) def _extract_audio_bytes(self, payload: Any) -> bytes: if not isinstance(payload, dict): return b"" code = payload.get("code") if code not in (None, 0, 20000000): message = str(payload.get("message") or "unknown error") raise RuntimeError(f"Volcengine TTS stream error {code}: {message}") encoded = payload.get("data") if isinstance(encoded, str) and encoded.strip(): try: return base64.b64decode(encoded) except Exception as exc: logger.warning("Failed to decode Volcengine TTS audio chunk: {}", exc) return b"" @staticmethod def _speech_rate_percent(speed: float) -> int: clamped = max(0.5, min(2.0, float(speed or 1.0))) return int(round((clamped - 1.0) * 100))