Files
ai-video-fullstack/backend/services/interface_catalog.py
Xin Wang 0309c154b5 Implement StepFun Realtime service and enhance AssistantConfig
- Add new fields to AssistantConfig for realtime interface configuration, including types, values, and secrets.
- Introduce StepFunRealtimeService to handle speech-to-speech processing via WebSocket, integrating STT, LLM, and TTS functionalities.
- Refactor pipeline execution to support a new realtime mode, allowing direct text input processing and immediate responses.
- Update model resource testing to include validation for StepFun Realtime connections.
- Enhance service factory to create realtime services based on configuration settings.
- Modify README documentation to reflect new realtime capabilities and usage instructions.
2026-06-14 23:41:40 +08:00

175 lines
5.9 KiB
Python

"""Built-in concrete interface definitions used by backend and dynamic forms."""
from __future__ import annotations
from typing import Any
def field(
    key: str,
    label: str,
    *,
    group: str = "values",
    type_: str = "text",
    required: bool = False,
    default: Any = None,
    options: list[str] | None = None,
) -> dict:
    """Build the descriptor dict for a single interface field.

    Args:
        key: Stored under ``"key"``.
        label: Stored under ``"label"``.
        group: Stored under ``"group"``; defaults to ``"values"``.
        type_: Stored under ``"type"``; defaults to ``"text"``.
        required: Stored under ``"required"``.
        default: Stored under ``"default"`` unless it is ``None``.
        options: Stored under ``"options"`` unless it is ``None`` or empty.

    Returns:
        A dict that always has ``key``, ``label``, ``group``, ``type`` and
        ``required``, plus ``default`` / ``options`` when they were supplied.
    """
    descriptor = dict(
        key=key,
        label=label,
        group=group,
        type=type_,
        required=required,
    )
    # Compare against None rather than truthiness so that falsy defaults
    # such as False or 0 are still recorded.
    if default is not None:
        descriptor.update(default=default)
    # An empty options list is treated the same as no options at all.
    if options:
        descriptor.update(options=options)
    return descriptor
# Field descriptors shared by every definition below that concatenates
# OPENAI_COMMON into its "fields": a model id, an endpoint URL, and an API key
# kept in the "secrets" group.
OPENAI_COMMON = [
    field(key="modelId", label="Model ID", required=True),
    field(key="apiUrl", label="API URL", required=True, type_="url"),
    field(
        key="apiKey",
        label="API Key",
        required=True,
        type_="password",
        group="secrets",
    ),
]
# Field descriptors shared by the Xfyun definitions: the WebSocket endpoint
# followed by three required credentials, all kept in the "secrets" group.
XFYUN_AUTH = [
    field("apiUrl", "WebSocket URL", type_="url", required=True),
    *(
        field(
            secret_key,
            secret_label,
            group="secrets",
            type_="password",
            required=True,
        )
        for secret_key, secret_label in (
            ("appId", "App ID"),
            ("apiKey", "API Key"),
            ("apiSecret", "API Secret"),
        )
    ),
]
# Catalog of the built-in interface definitions. Every entry carries:
#   interface_type - identifier string of the concrete interface
#   name           - display name
#   capability     - one of "LLM", "ASR", "TTS", "Embedding", "Realtime"
#   fields         - list of descriptors produced by field()
#
# Each entry owns its own "fields" list, so changing one entry's list in place
# cannot leak into OPENAI_COMMON / XFYUN_AUTH or into another entry. The
# descriptor dicts inside those lists are still shared between entries and
# should be treated as read-only.
INTERFACE_DEFINITIONS: list[dict] = [
    # --- OpenAI-compatible interfaces ---
    {
        "interface_type": "openai-llm",
        "name": "OpenAI Compatible LLM",
        "capability": "LLM",
        "fields": OPENAI_COMMON
        + [field("temperature", "Temperature", type_="number", default=0.7)],
    },
    {
        "interface_type": "openai-asr",
        "name": "OpenAI Compatible ASR",
        "capability": "ASR",
        "fields": OPENAI_COMMON + [field("language", "Language", default="zh")],
    },
    {
        "interface_type": "openai-tts",
        "name": "OpenAI Compatible TTS",
        "capability": "TTS",
        "fields": OPENAI_COMMON
        + [
            field("voice", "Voice"),
            field("speed", "Speed", type_="number", default=1.0),
            field("sourceSampleRate", "Source Sample Rate", type_="number", default=24000),
        ],
    },
    {
        "interface_type": "openai-embedding",
        "name": "OpenAI Compatible Embedding",
        "capability": "Embedding",
        "fields": OPENAI_COMMON + [field("dimensions", "Dimensions", type_="number")],
    },
    {
        "interface_type": "openai-realtime",
        "name": "OpenAI Realtime",
        "capability": "Realtime",
        "fields": OPENAI_COMMON + [field("voice", "Voice")],
    },
    # --- StepFun ---
    {
        "interface_type": "stepfun-realtime",
        "name": "StepFun StepAudio Realtime",
        "capability": "Realtime",
        "fields": OPENAI_COMMON
        + [
            field("voice", "Voice", required=True, default="linjiajiejie"),
            field("inputSampleRate", "Input Sample Rate", type_="number", default=24000),
            field("outputSampleRate", "Output Sample Rate", type_="number", default=24000),
            field("prefixPaddingMs", "VAD Prefix Padding (ms)", type_="number", default=500),
            field("silenceDurationMs", "VAD Silence Duration (ms)", type_="number", default=300),
            field(
                "energyAwakenessThreshold",
                "VAD Energy Threshold",
                type_="number",
                default=2500,
            ),
        ],
    },
    # --- Xfyun ---
    {
        "interface_type": "xfyun-asr",
        "name": "Xfyun Streaming ASR",
        "capability": "ASR",
        "fields": XFYUN_AUTH
        + [
            field("language", "Language", default="zh_cn"),
            field("domain", "Domain", default="iat"),
            field("accent", "Accent", default="mandarin"),
            field("dynamicCorrection", "Dynamic Correction", type_="boolean", default=False),
            field("frameSize", "Frame Size", type_="number", default=1280),
        ],
    },
    {
        "interface_type": "xfyun-tts",
        "name": "Xfyun TTS",
        "capability": "TTS",
        "fields": XFYUN_AUTH
        + [
            field("voice", "Voice"),
            field("speed", "Speed", type_="number", default=50),
            field("volume", "Volume", type_="number", default=50),
            field("pitch", "Pitch", type_="number", default=50),
            field("sourceSampleRate", "Source Sample Rate", type_="number", default=16000),
        ],
    },
    {
        "interface_type": "xfyun-super-tts",
        "name": "Xfyun Super TTS",
        "capability": "TTS",
        "fields": XFYUN_AUTH
        + [
            field("voice", "Voice"),
            field("speed", "Speed", type_="number", default=50),
            field("volume", "Volume", type_="number", default=50),
            field("pitch", "Pitch", type_="number", default=50),
            field("oralLevel", "Oral Level", default="mid"),
            field("sourceSampleRate", "Source Sample Rate", type_="number", default=24000),
            field("textAggregationMode", "Text Aggregation Mode", default="token"),
        ],
    },
    # --- DashScope ---
    {
        "interface_type": "dashscope-llm",
        "name": "DashScope LLM",
        "capability": "LLM",
        "fields": OPENAI_COMMON
        + [field("temperature", "Temperature", type_="number", default=0.7)],
    },
    {
        "interface_type": "dashscope-asr",
        "name": "DashScope ASR",
        "capability": "ASR",
        "fields": OPENAI_COMMON + [field("language", "Language", default="zh")],
    },
    {
        "interface_type": "dashscope-tts",
        "name": "DashScope TTS",
        "capability": "TTS",
        "fields": OPENAI_COMMON + [field("voice", "Voice")],
    },
    # --- Gemini ---
    {
        "interface_type": "gemini-realtime",
        "name": "Gemini Realtime",
        "capability": "Realtime",
        # Copy the list instead of aliasing OPENAI_COMMON: every other entry
        # gets a fresh list from `+`, and an alias here would let an in-place
        # edit of this entry's fields mutate the shared constant.
        "fields": list(OPENAI_COMMON),
    },
]
def validate_fields(definition: dict, values: dict, secrets: dict) -> None:
    """Check that every required field of *definition* has been supplied.

    Fields whose ``group`` is ``"secrets"`` are looked up in *secrets*; all
    other fields are looked up in *values*. A field's ``default`` is not
    consulted: a required field must be present in the submitted data.

    Args:
        definition: Interface definition containing a ``"fields"`` list of
            field descriptors.
        values: Submitted non-secret values keyed by field key. ``None`` is
            treated as an empty mapping.
        secrets: Submitted secret values keyed by field key. ``None`` is
            treated as an empty mapping.

    Raises:
        ValueError: For the first required field (in definition order) that is
            missing, ``None``, an empty string, or a whitespace-only string.
            The message is ``"<label> is required"``.
    """
    # Tolerate callers that pass None for a group with nothing submitted.
    plain_values = values or {}
    secret_values = secrets or {}

    for item in definition["fields"]:
        if not item.get("required"):
            continue
        source = secret_values if item["group"] == "secrets" else plain_values
        value = source.get(item["key"])
        # A string made only of whitespace carries no usable value, so it is
        # rejected just like an empty string. Non-string values such as 0 or
        # False count as supplied.
        is_blank = isinstance(value, str) and not value.strip()
        if value is None or is_blank:
            raise ValueError(f"{item['label']} is required")