# ai-video-fullstack/backend/services/pipecat/xfyun_tts.py

from __future__ import annotations

import base64
import hashlib
import hmac
import json
import os
import re
import unicodedata
from collections.abc import AsyncGenerator, AsyncIterator
from datetime import datetime, timezone
from email.utils import format_datetime
from typing import Any
from urllib.parse import urlencode, urlparse

from loguru import logger

from pipecat.frames.frames import ErrorFrame, Frame
from pipecat.services.settings import TTSSettings
from pipecat.services.tts_service import TTSService
from websockets.asyncio.client import connect


# Endpoint used when the service is constructed without an explicit ``url``.
DEFAULT_XFYUN_TTS_URL = "wss://tts-api.xfyun.cn/v2/tts"

# Character class of code points that are stripped before text is sent to
# Xfyun.  Text containing emoji and similar symbols has been observed to come
# back rejected or with empty audio, which surfaces as "request finished
# without audio data".  Each entry below is one range (or single code point)
# inside the class.
_EMOJI_AND_SYMBOL_RE = re.compile(
    "[{}]".format(
        "".join(
            (
                "\U0001F300-\U0001FAFF",  # misc pictographs, emoji, symbols, transport, etc.
                "\U00002600-\U000027BF",  # misc symbols and dingbats
                "\U0001F1E6-\U0001F1FF",  # regional indicators (flags)
                "\uFE00-\uFE0F",  # variation selectors
                "\u200D",  # zero-width joiner
            )
        )
    ),
    flags=re.UNICODE,
)


class XfyunTTSService(TTSService):
    """iFlytek/Xfyun online TTS service for Pipecat.

    Xfyun's API is not OpenAI-compatible. It uses a signed WebSocket URL,
    receives one JSON request per synthesis, and streams text WebSocket
    messages containing base64-encoded audio chunks. This service requests
    raw PCM so the chunks can become Pipecat audio frames without MP3 decode.

    Args:
        app_id: Xfyun application id; ``XFYUN_APP_ID`` is used when empty.
        api_key: Xfyun API key; ``XFYUN_API_KEY`` is used when empty.
        api_secret: Xfyun API secret; ``XFYUN_API_SECRET`` is used when empty.
        voice: Speaker name, sent as the ``vcn`` business parameter.
        url: WebSocket endpoint; ``DEFAULT_XFYUN_TTS_URL`` when omitted.
        sample_rate: Sample rate passed to the ``TTSService`` base class.
        source_sample_rate: Sample rate requested from Xfyun (``auf``) and
            declared as the input rate when chunks are turned into frames.
        encoding: Audio encoding sent as ``aue``. Only ``"raw"`` is accepted
            by :meth:`run_tts`.
        text_encoding: Text encoding announced as ``tte``. The text payload
            is encoded with the matching Python codec; unrecognised values
            fall back to UTF-8.
        speed: Sent unchanged as the ``speed`` business parameter.
        volume: Sent unchanged as the ``volume`` business parameter.
        pitch: Sent unchanged as the ``pitch`` business parameter.
        timeout: Seconds allowed for opening the WebSocket connection. It is
            not applied to the subsequent message stream.
        **kwargs: Forwarded to ``TTSService.__init__``.
    """

    # Python codec matching each ``tte`` value.  Xfyun's "UNICODE" means
    # UTF-16 little-endian.  Lookups are done on the upper-cased value.
    _XFYUN_TEXT_CODECS = {
        "UTF8": "utf-8",
        "GB2312": "gb2312",
        "GBK": "gbk",
        "BIG5": "big5",
        "GB18030": "gb18030",
        "UNICODE": "utf-16-le",
    }

    # The encoded (pre-base64) text of one request must be shorter than this.
    _XFYUN_MAX_TEXT_BYTES = 8000

    def __init__(
        self,
        *,
        app_id: str,
        api_key: str,
        api_secret: str,
        voice: str,
        url: str | None = None,
        sample_rate: int = 16000,
        source_sample_rate: int = 16000,
        encoding: str = "raw",
        text_encoding: str = "UTF8",
        speed: int = 50,
        volume: int = 50,
        pitch: int = 50,
        timeout: float = 30.0,
        **kwargs,
    ) -> None:
        super().__init__(
            sample_rate=sample_rate,
            settings=TTSSettings(model=None, voice=voice, language=None),
            **kwargs,
        )
        # Explicit arguments win; empty values fall back to the environment.
        self._app_id = app_id or os.environ.get("XFYUN_APP_ID", "")
        self._api_key = api_key or os.environ.get("XFYUN_API_KEY", "")
        self._api_secret = api_secret or os.environ.get("XFYUN_API_SECRET", "")
        self._voice = voice
        self._url = url or DEFAULT_XFYUN_TTS_URL
        self._source_sample_rate = source_sample_rate
        self._encoding = encoding
        self._text_encoding = text_encoding
        self._speed = speed
        self._volume = volume
        self._pitch = pitch
        self._timeout = timeout
        # Diagnostic summary of the most recent request that returned no audio.
        self._last_failure_detail: str | None = None

    async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
        """Synthesize ``text`` and yield the resulting frames.

        Problems are reported as ``ErrorFrame`` instances instead of being
        raised: missing credentials, text that cannot be encoded in the
        configured text encoding, text at or above the size limit, a
        non-``raw`` audio encoding, a request that ends without audio, and
        any exception raised while streaming.  Empty input, or input that is
        empty after sanitization, yields nothing.

        Args:
            text: Text to speak.
            context_id: Passed through to the frame-building helper.

        Yields:
            Frames produced from the streamed audio chunks, or an
            ``ErrorFrame`` describing why no audio was produced.
        """
        if not text:
            return

        if not self._app_id or not self._api_key or not self._api_secret:
            yield ErrorFrame(error="Xfyun TTS requires app_id, api_key, and api_secret")
            return

        sanitized = _sanitize_text_for_tts(text)
        if not sanitized:
            logger.debug(
                f"{self}: skipping Xfyun TTS, text became empty after sanitization "
                f"(original={text!r})"
            )
            return

        if sanitized != text:
            logger.debug(
                f"{self}: sanitized Xfyun TTS text "
                f"(original={text!r}, sanitized={sanitized!r})"
            )

        # Measure the limit on the bytes that are actually sent, i.e. in the
        # codec selected by ``text_encoding`` rather than always in UTF-8.
        try:
            encoded_size = len(self._encode_text(sanitized))
        except UnicodeEncodeError as exc:
            yield ErrorFrame(
                error=f"Xfyun TTS text cannot be encoded as {self._text_encoding}: {exc}"
            )
            return

        if encoded_size >= self._XFYUN_MAX_TEXT_BYTES:
            limit_label = "UTF-8" if self._text_codec() == "utf-8" else self._text_encoding
            yield ErrorFrame(
                error=(
                    f"Xfyun TTS text must be less than "
                    f"{self._XFYUN_MAX_TEXT_BYTES} {limit_label} bytes"
                )
            )
            return

        if self._encoding != "raw":
            yield ErrorFrame(error="Xfyun TTS is configured for PCM output; set aue/encoding to raw")
            return

        try:
            await self.start_tts_usage_metrics(sanitized)

            first_frame = True
            async for frame in self._stream_audio_frames_from_iterator(
                self._iter_audio_chunks(sanitized),
                in_sample_rate=self._source_sample_rate,
                context_id=context_id,
            ):
                if first_frame:
                    # Time-to-first-byte ends when the first frame arrives.
                    await self.stop_ttfb_metrics()
                    first_frame = False
                yield frame

            if first_frame:
                # The stream ended cleanly but never produced a frame.
                detail = self._last_failure_detail or "no audio frames received"
                yield ErrorFrame(
                    error=(
                        f"Xfyun TTS request finished without audio data ({detail}); "
                        f"text={sanitized!r}"
                    )
                )
        except Exception as exc:
            yield ErrorFrame(error=f"Xfyun TTS request failed: {exc}")

    async def _iter_audio_chunks(self, text: str) -> AsyncIterator[bytes]:
        """Send one synthesis request and yield the decoded audio chunks.

        Opens a WebSocket to the signed URL, sends a single JSON request and
        reads JSON messages until one carries ``data.status == 2`` or the
        connection ends.  If no audio bytes were received, a diagnostic
        summary is stored in ``self._last_failure_detail`` and logged.

        Args:
            text: Already sanitized text to synthesize.

        Yields:
            Base64-decoded audio payloads, in arrival order.

        Raises:
            RuntimeError: If a message carries a non-zero ``code``.
        """
        request = self._build_request_frame(text)
        auth_url = _build_auth_url(self._url, self._api_key, self._api_secret)

        self._last_failure_detail = None
        frames_received = 0
        audio_bytes_received = 0
        last_status: int | None = None
        last_sid: str | None = None
        saw_status_2 = False

        # ``open_timeout`` bounds only the connection handshake; reading the
        # response stream below is not time-limited.
        async with connect(auth_url, max_size=None, open_timeout=self._timeout) as websocket:
            await websocket.send(json.dumps(request, ensure_ascii=False))

            async for raw_message in websocket:
                frames_received += 1
                payload = json.loads(raw_message)
                # A message without ``code`` is treated as an error (-1).
                code = payload.get("code", -1)
                sid = payload.get("sid")
                if sid:
                    last_sid = sid
                if code != 0:
                    err_msg = payload.get("message", "unknown error")
                    raise RuntimeError(f"code={code}, sid={sid}, message={err_msg}")

                data = payload.get("data")
                if not isinstance(data, dict):
                    continue

                last_status = data.get("status", last_status)

                audio_b64 = data.get("audio")
                if audio_b64:
                    audio_bytes = base64.b64decode(audio_b64)
                    audio_bytes_received += len(audio_bytes)
                    yield audio_bytes

                # Status 2 ends the stream; stop reading after this message.
                if data.get("status") == 2:
                    saw_status_2 = True
                    break

        if audio_bytes_received == 0:
            self._last_failure_detail = (
                f"frames={frames_received}, audio_bytes=0, "
                f"last_status={last_status}, saw_status_2={saw_status_2}, sid={last_sid}"
            )
            logger.warning(
                f"{self}: Xfyun TTS produced no audio ({self._last_failure_detail})"
            )

    def _text_codec(self) -> str:
        """Return the Python codec matching ``text_encoding`` (UTF-8 if unknown)."""
        return self._XFYUN_TEXT_CODECS.get(str(self._text_encoding).upper(), "utf-8")

    def _encode_text(self, text: str) -> bytes:
        """Encode ``text`` in the codec announced to Xfyun through ``tte``.

        Raises:
            UnicodeEncodeError: If ``text`` has characters the codec lacks.
        """
        return text.encode(self._text_codec())

    def _build_request_frame(self, text: str) -> dict[str, Any]:
        """Build the JSON-serialisable request for one synthesis.

        Args:
            text: Text to synthesize.

        Returns:
            A dict with ``common`` (app id), ``business`` (audio, voice and
            text-encoding parameters) and ``data`` (status 2 plus the
            base64-encoded text).

        Raises:
            UnicodeEncodeError: If ``text`` cannot be encoded in the codec
                selected by ``text_encoding``.
        """
        business: dict[str, Any] = {
            "aue": self._encoding,
            "auf": f"audio/L16;rate={self._source_sample_rate}",
            "vcn": self._voice,
            "speed": self._speed,
            "volume": self._volume,
            "pitch": self._pitch,
            "tte": self._text_encoding,
        }

        return {
            "common": {"app_id": self._app_id},
            "business": business,
            "data": {
                # The whole text goes in one frame, hence status 2.
                "status": 2,
                # Payload bytes must match the encoding declared in ``tte``.
                "text": base64.b64encode(self._encode_text(text)).decode("utf-8"),
            },
        }


def _sanitize_text_for_tts(text: str) -> str:
    """Return ``text`` reduced to characters that are sent to Xfyun.

    Processing happens in three passes:

    1. Everything matched by ``_EMOJI_AND_SYMBOL_RE`` is removed.
    2. Remaining characters in Unicode category "So" (Symbol, Other) are
       dropped, as is every character in a "C*" category except newline,
       carriage return and tab.
    3. Each run of whitespace becomes a single space and the result is
       stripped at both ends.

    A falsy ``text`` is returned unchanged.
    """
    if not text:
        return text

    def _is_kept(char: str) -> bool:
        category = unicodedata.category(char)
        if category == "So":
            return False
        # Control-like categories survive only for the whitespace trio,
        # which the final pass folds into plain spaces.
        return not category.startswith("C") or char in ("\n", "\r", "\t")

    without_emoji = _EMOJI_AND_SYMBOL_RE.sub("", text)
    kept = "".join(filter(_is_kept, without_emoji))
    return re.sub(r"\s+", " ", kept).strip()


def _build_auth_url(url: str, api_key: str, api_secret: str) -> str:
    """Return ``url`` with Xfyun's HMAC-SHA256 authentication query attached.

    The signature covers the host, the current UTC date in RFC 1123 form and
    the request line ``GET <path> HTTP/1.1``.  The returned URL always uses
    the same path that was signed: when ``url`` has no path, ``/v2/tts`` is
    both signed and inserted.  An existing query string in ``url`` is kept
    and the authentication parameters are appended to it.

    Args:
        url: WebSocket endpoint, e.g. ``wss://tts-api.xfyun.cn/v2/tts``.
        api_key: API key embedded in the authorization value.
        api_secret: Secret used as the HMAC key.

    Returns:
        The endpoint URL carrying ``authorization``, ``date`` and ``host``
        query parameters.
    """
    parsed = urlparse(url)
    host = parsed.netloc
    path = parsed.path or "/v2/tts"
    date = format_datetime(datetime.now(timezone.utc), usegmt=True)
    request_line = f"GET {path} HTTP/1.1"
    signature_origin = f"host: {host}\ndate: {date}\n{request_line}"
    signature_sha = hmac.new(
        api_secret.encode("utf-8"),
        signature_origin.encode("utf-8"),
        digestmod=hashlib.sha256,
    ).digest()
    signature = base64.b64encode(signature_sha).decode("utf-8")
    authorization_origin = (
        f'api_key="{api_key}", algorithm="hmac-sha256", '
        f'headers="host date request-line", signature="{signature}"'
    )
    authorization = base64.b64encode(authorization_origin.encode("utf-8")).decode("utf-8")
    auth_query = urlencode({"authorization": authorization, "date": date, "host": host})
    # Rebuild from the parsed parts so the request path equals the signed
    # path and the query sits in the right place even when ``url`` already
    # has a query string or a fragment.
    query = f"{parsed.query}&{auth_query}" if parsed.query else auth_query
    return parsed._replace(path=path, query=query).geturl()