diff --git a/README.md b/README.md index e69de29..05018fe 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,3 @@ +# AI VideoAssistant + +一个多媒体全双工交互服务网关 \ No newline at end of file diff --git a/api/docs/assistant.md b/api/docs/assistant.md new file mode 100644 index 0000000..cad5ba0 --- /dev/null +++ b/api/docs/assistant.md @@ -0,0 +1,254 @@ +# 助手 (Assistant) API + +助手 API 用于管理 AI 小助手的创建、配置和操作。 + +## 基础信息 + +| 项目 | 值 | +|------|-----| +| Base URL | `/api/v1/assistants` | +| 认证方式 | Bearer Token (预留) | + +--- + +## 数据模型 + +### Assistant + +```typescript +interface Assistant { + id: string; // 助手唯一标识 (8位UUID) + user_id: number; // 所属用户ID + name: string; // 助手名称 + call_count: number; // 调用次数 + opener: string; // 开场白 + prompt: string; // 系统提示词/人格设定 + knowledge_base_id?: string; // 关联知识库ID + language: string; // 语言: "zh" | "en" + voice?: string; // 声音ID + speed: number; // 语速 (0.5-2.0) + hotwords: string[]; // 热词列表 + tools: string[]; // 启用的工具ID列表 + interruption_sensitivity: number; // 打断灵敏度 (ms) + config_mode: string; // 配置模式: "platform" | "dify" | "fastgpt" | "none" + api_url?: string; // 外部API URL + api_key?: string; // 外部API Key + // 模型关联 (新增) + llm_model_id?: string; // LLM模型ID + asr_model_id?: string; // ASR模型ID + embedding_model_id?: string; // Embedding模型ID + rerank_model_id?: string; // Rerank模型ID + created_at: string; + updated_at: string; +} +``` + +--- + +## API 端点 + +### 1. 
获取助手列表 + +```http +GET /api/v1/assistants +``` + +**Query Parameters:** + +| 参数 | 类型 | 必填 | 默认值 | 说明 | +|------|------|------|--------|------| +| page | int | 否 | 1 | 页码 | +| limit | int | 否 | 50 | 每页数量 | + +**Response:** + +```json +{ + "total": 100, + "page": 1, + "limit": 50, + "list": [ + { + "id": "abc12345", + "user_id": 1, + "name": "客服助手", + "call_count": 128, + "opener": "您好,请问有什么可以帮助您?", + "prompt": "你是一个专业的客服人员...", + "language": "zh", + "voice": "voice_001", + "speed": 1.0, + "hotwords": ["帮助", "退款"], + "tools": ["query_order", "refund"], + "interruption_sensitivity": 500, + "config_mode": "platform", + "llm_model_id": "llm_001", + "asr_model_id": "asr_001", + "created_at": "2024-01-15T10:30:00Z", + "updated_at": "2024-01-15T10:30:00Z" + } + ] +} +``` + +--- + +### 2. 获取单个助手详情 + +```http +GET /api/v1/assistants/{id} +``` + +**Path Parameters:** + +| 参数 | 类型 | 说明 | +|------|------|------| +| id | string | 助手ID | + +**Response:** + +```json +{ + "id": "abc12345", + "user_id": 1, + "name": "客服助手", + "call_count": 128, + "opener": "您好,请问有什么可以帮助您?", + "prompt": "你是一个专业的客服人员...", + "knowledge_base_id": "kb_001", + "language": "zh", + "voice": "voice_001", + "speed": 1.0, + "hotwords": ["帮助", "退款"], + "tools": ["query_order", "refund"], + "interruption_sensitivity": 500, + "config_mode": "platform", + "api_url": "https://api.example.com", + "llm_model_id": "llm_001", + "asr_model_id": "asr_001", + "embedding_model_id": "emb_001", + "rerank_model_id": "rerank_001", + "created_at": "2024-01-15T10:30:00Z", + "updated_at": "2024-01-15T10:30:00Z" +} +``` + +--- + +### 3. 
创建助手 + +```http +POST /api/v1/assistants +``` + +**Request Body:** + +```json +{ + "name": "客服助手", + "opener": "您好,请问有什么可以帮助您?", + "prompt": "你是一个专业的客服人员,擅长解答产品问题和处理投诉。", + "knowledgeBaseId": "kb_001", + "language": "zh", + "voice": "voice_001", + "speed": 1.0, + "hotwords": ["帮助", "退款", "物流"], + "tools": ["query_order", "refund", "track_order"], + "interruptionSensitivity": 500, + "configMode": "platform", + "llmModelId": "llm_001", + "asrModelId": "asr_001", + "embeddingModelId": "emb_001", + "rerankModelId": "rerank_001" +} +``` + +**Fields 说明:** + +| 字段 | 类型 | 必填 | 说明 | +|------|------|------|------| +| name | string | 是 | 助手名称 | +| opener | string | 否 | 开场白,默认空字符串 | +| prompt | string | 否 | 系统提示词,默认空字符串 | +| knowledgeBaseId | string | 否 | 关联知识库ID | +| language | string | 否 | 语言,默认 "zh" | +| voice | string | 否 | 声音资源ID | +| speed | number | 否 | 语速,默认 1.0 | +| hotwords | string[] | 否 | 热词列表 | +| tools | string[] | 否 | 启用的工具ID列表 | +| interruptionSensitivity | number | 否 | 打断灵敏度(ms),默认 500 | +| configMode | string | 否 | 配置模式,默认 "platform" | +| llmModelId | string | 否 | LLM模型ID | +| asrModelId | string | 否 | ASR模型ID | +| embeddingModelId | string | 否 | Embedding模型ID | +| rerankModelId | string | 否 | Rerank模型ID | + +--- + +### 4. 更新助手 + +```http +PUT /api/v1/assistants/{id} +``` + +**Request Body:** (部分更新) + +```json +{ + "name": "高级客服助手", + "prompt": "你是一个高级客服人员...", + "speed": 1.2 +} +``` + +--- + +### 5```http +DELETE. 删除助手 + + /api/v1/assistants/{id} +``` + +**Response:** + +```json +{ + "message": "Deleted successfully" +} +``` + +--- + +### 6. 获取助手调用统计 + +```http +GET /api/v1/assistants/{id}/stats +``` + +**Response:** + +```json +{ + "assistant_id": "abc12345", + "total_calls": 128, + "connected_calls": 120, + "missed_calls": 8, + "avg_duration_seconds": 180, + "today_calls": 15 +} +``` + +--- + +## 建议的 Schema 改进 + +基于 web/types.ts 的分析,建议在 `schemas.py` 中补充以下字段: + +```python +class AssistantBase(BaseModel): + # ... 现有字段 ... 
+ llm_model_id: Optional[str] = None + asr_model_id: Optional[str] = None + embedding_model_id: Optional[str] = None + rerank_model_id: Optional[str] = None +``` diff --git a/api/docs/history-records.md b/api/docs/history-records.md new file mode 100644 index 0000000..0a55843 --- /dev/null +++ b/api/docs/history-records.md @@ -0,0 +1,464 @@ +# 历史记录 (History Records) API + +历史记录 API 用于管理通话记录和对话历史。 + +## 基础信息 + +| 项目 | 值 | +|------|-----| +| Base URL | `/api/v1/history` | +| 认证方式 | Bearer Token (预留) | + +--- + +## 数据模型 + +### CallRecord + +```typescript +interface CallRecord { + id: string; // 通话记录ID + user_id: number; // 所属用户ID + assistant_id?: string; // 关联助手ID + source: string; // 来源: "debug" | "external" + status: string; // 状态: "connected" | "missed" | "failed" + started_at: string; // 开始时间 ISO8601 + ended_at?: string; // 结束时间 ISO8601 + duration_seconds?: number; // 通话时长(秒,整数) + summary?: string; // 通话摘要 + cost?: number; // 费用 + metadata?: object; // 元数据 + created_at: string; // 创建时间 +} +``` + +### TranscriptSegment + +```typescript +interface TranscriptSegment { + turn_index: number; // 对话轮次 (整数) + speaker: string; // 说话者: "human" | "ai" + content: string; // 转写内容 + confidence?: number; // 置信度 0-1 (浮点数) + start_ms: number; // 开始时间(毫秒,整数) + end_ms: number; // 结束时间(毫秒,整数) + duration_ms?: number; // 持续时间(毫秒,整数) + audio_url?: string; // 音频URL + emotion?: string; // 情绪标签 +} +``` + +### InteractionDetail + +```typescript +interface InteractionDetail { + role: "user" | "assistant"; // 角色 + content: string; // 文本内容或转写文本 + audio_url?: string; // 音频URL + image_urls?: string[]; // 图片URL列表(视频场景) + timestamp: string; // 时间戳 +} +``` + +--- + +## API 端点 + +### 1. 
获取通话记录列表 + +```http +GET /api/v1/history +``` + +**Query Parameters:** + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| assistant_id | string | 否 | 过滤助手ID | +| status | string | 否 | 过滤状态: "connected" \| "missed" \| "failed" | +| source | string | 否 | 过滤来源: "debug" \| "external" | +| start_date | string | 否 | 开始日期 ISO8601 | +| end_date | string | 否 | 结束日期 ISO8601 | +| page | int | 否 | 页码,默认 1 | +| limit | int | 否 | 每页数量,默认 20 | + +**Response:** + +```json +{ + "total": 150, + "page": 1, + "limit": 20, + "list": [ + { + "id": "call_001", + "user_id": 1, + "assistant_id": "abc12345", + "source": "debug", + "status": "connected", + "started_at": "2024-01-15T14:30:00Z", + "ended_at": "2024-01-15T14:33:00Z", + "duration_seconds": 180, + "summary": "用户咨询产品A的售后服务", + "cost": 0.05, + "created_at": "2024-01-15T14:30:00Z" + }, + { + "id": "call_002", + "user_id": 1, + "assistant_id": "abc12345", + "source": "external", + "status": "missed", + "started_at": "2024-01-15T14:00:00Z", + "duration_seconds": 0, + "created_at": "2024-01-15T14:00:00Z" + } + ] +} +``` + +--- + +### 2. 
获取通话详情 + +```http +GET /api/v1/history/{call_id} +``` + +**Response:** + +```json +{ + "id": "call_001", + "user_id": 1, + "assistant_id": "abc12345", + "source": "debug", + "status": "connected", + "started_at": "2024-01-15T14:30:00Z", + "ended_at": "2024-01-15T14:33:00Z", + "duration_seconds": 180, + "summary": "用户咨询产品A的售后服务", + "cost": 0.05, + "metadata": { + "browser": "Chrome", + "os": "Windows" + }, + "transcripts": [ + { + "turn_index": 0, + "speaker": "human", + "content": "您好,我想咨询一下产品A的售后服务", + "confidence": 0.98, + "start_ms": 0, + "end_ms": 3500, + "duration_ms": 3500, + "audio_url": "https://storage.example.com/audio/call_001/turn_0.mp3" + }, + { + "turn_index": 1, + "speaker": "ai", + "content": "您好!产品A享有7天无理由退货和一年质保服务。请问您遇到了什么问题?", + "confidence": 0.95, + "start_ms": 4000, + "end_ms": 12000, + "duration_ms": 8000, + "audio_url": "https://storage.example.com/audio/call_001/turn_1.mp3" + }, + { + "turn_index": 2, + "speaker": "human", + "content": "我发现产品A有一个功能坏了", + "confidence": 0.92, + "start_ms": 13000, + "end_ms": 18000, + "duration_ms": 5000, + "audio_url": "https://storage.example.com/audio/call_001/turn_2.mp3" + } + ] +} +``` + +--- + +### 3. 创建通话记录 + +```http +POST /api/v1/history +``` + +**Request Body:** + +```json +{ + "user_id": 1, + "assistant_id": "abc12345", + "source": "debug" +} +``` + +**Response:** + +```json +{ + "id": "call_new001", + "user_id": 1, + "assistant_id": "abc12345", + "source": "debug", + "status": "connected", + "started_at": "2024-01-15T15:00:00Z", + "created_at": "2024-01-15T15:00:00Z" +} +``` + +--- + +### 4. 更新通话记录 + +```http +PUT /api/v1/history/{call_id} +``` + +**Request Body:** + +```json +{ + "status": "connected", + "summary": "用户咨询产品A的售后服务,已引导用户提交工单", + "duration_seconds": 180, + "ended_at": "2024-01-15T14:33:00Z" +} +``` + +--- + +### 5. 删除通话记录 + +```http +DELETE /api/v1/history/{call_id} +``` + +--- + +### 6. 
添加转写片段 + +```http +POST /api/v1/history/{call_id}/transcripts +``` + +**Request Body:** + +```json +{ + "turn_index": 3, + "speaker": "ai", + "content": "好的,我已经为您创建了工单,编号 #12345,请保持电话畅通,我们的客服人员会在24小时内联系您。", + "confidence": 0.96, + "start_ms": 20000, + "end_ms": 28000, + "duration_ms": 8000, + "emotion": "neutral" +} +``` + +**Response:** + +```json +{ + "id": 100, + "turn_index": 3, + "speaker": "ai", + "content": "好的,我已经为您创建了工单...", + "confidence": 0.96, + "start_ms": 20000, + "end_ms": 28000, + "duration_ms": 8000, + "audio_url": "https://storage.example.com/audio/call_001/turn_3.mp3", + "emotion": "neutral" +} +``` + +--- + +### 7. 获取音频 + +```http +GET /api/v1/history/{call_id}/audio/{turn_index} +``` + +**Response:** 重定向到音频文件 URL + +--- + +### 8. 搜索通话记录 + +```http +GET /api/v1/history/search +``` + +**Query Parameters:** + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| q | string | 是 | 搜索关键词 | +| page | int | 否 | 页码 | +| limit | int | 否 | 每页数量 | + +**Response:** + +```json +{ + "total": 5, + "page": 1, + "limit": 20, + "list": [ + { + "id": "call_001", + "started_at": "2024-01-15T14:30:00Z", + "matched_content": "用户咨询产品A的售后服务" + } + ] +} +``` + +--- + +### 9. 
获取统计信息 + +```http +GET /api/v1/history/stats +``` + +**Query Parameters:** + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| start_date | string | 否 | 开始日期 | +| end_date | string | 否 | 结束日期 | +| assistant_id | string | 否 | 助手ID | + +**Response:** + +```json +{ + "total_calls": 150, + "connected_calls": 135, + "missed_calls": 15, + "failed_calls": 0, + "avg_duration_seconds": 180, + "total_cost": 7.50, + "by_status": { + "connected": 135, + "missed": 15, + "failed": 0 + }, + "by_source": { + "debug": 100, + "external": 50 + }, + "daily_trend": [ + { + "date": "2024-01-15", + "calls": 20, + "connected": 18, + "avg_duration": 175 + } + ] +} +``` + +--- + +## 推荐的 Schema 定义 + +```python +# ============ Call Record ============ +class CallRecordSource(str, Enum): + DEBUG = "debug" + EXTERNAL = "external" + +class CallRecordStatus(str, Enum): + CONNECTED = "connected" + MISSED = "missed" + FAILED = "failed" + +class CallRecordBase(BaseModel): + assistant_id: Optional[str] = None + source: str = "debug" + +class CallRecordCreate(CallRecordBase): + pass + +class CallRecordUpdate(BaseModel): + status: Optional[str] = None + summary: Optional[str] = None + duration_seconds: Optional[int] = None + ended_at: Optional[str] = None + cost: Optional[float] = None + metadata: Optional[dict] = None + +class TranscriptSegment(BaseModel): + turn_index: int + speaker: str # "human" | "ai" + content: str + confidence: Optional[float] = None + start_ms: int + end_ms: int + duration_ms: Optional[int] = None + emotion: Optional[str] = None + +class CallRecordOut(BaseModel): + id: str + user_id: int + assistant_id: Optional[str] = None + source: str + status: str + started_at: str + ended_at: Optional[str] = None + duration_seconds: Optional[int] = None + summary: Optional[str] = None + cost: float = 0.0 + metadata: dict = {} + created_at: datetime + transcripts: List[TranscriptSegment] = [] + + class Config: + from_attributes = True + +class TranscriptCreate(BaseModel): + 
turn_index: int + speaker: str + content: str + confidence: Optional[float] = None + start_ms: int + end_ms: int + duration_ms: Optional[int] = None + emotion: Optional[str] = None + +class TranscriptOut(TranscriptCreate): + id: int + audio_url: Optional[str] = None + + class Config: + from_attributes = True + +class HistoryStats(BaseModel): + total_calls: int + connected_calls: int + missed_calls: int + failed_calls: int + avg_duration_seconds: float + total_cost: float + by_status: dict + by_source: dict + daily_trend: List[dict] +``` + +--- + +## Web 端对应接口映射 + +| Web Type | API Endpoint | +|----------|--------------| +| CallLog (list) | `GET /api/v1/history` | +| CallLog (detail) | `GET /api/v1/history/{id}` | +| InteractionDetail | `GET /api/v1/history/{id}` (transcripts 字段) | diff --git a/api/docs/index.md b/api/docs/index.md new file mode 100644 index 0000000..be4ed8c --- /dev/null +++ b/api/docs/index.md @@ -0,0 +1,66 @@ +# API 文档索引 + +本文档描述 AI-VideoAssistant 项目的后端 API 接口规范。 + +## 目录 + +| 模块 | 文件 | 说明 | +|------|------|------| +| 小助手 | [assistant.md](./assistant.md) | AI 助手管理 | +| 模型接入 | [model-access.md](./model-access.md) | LLM/ASR/TTS 模型配置 | +| 语音识别 | [speech-recognition.md](./speech-recognition.md) | ASR 模型配置 | +| 声音资源 | [voice-resources.md](./voice-resources.md) | TTS 声音库管理 | +| 历史记录 | [history-records.md](./history-records.md) | 通话记录和转写 | + +--- + +## 基础信息 + +- **Base URL**: `http://localhost:8000/api/v1` +- **认证方式**: Bearer Token (预留) +- **Content-Type**: application/json + +--- + +## 状态码说明 + +| 状态码 | 说明 | +|--------|------| +| 200 | 成功 | +| 201 | 创建成功 | +| 400 | 请求参数错误 | +| 401 | 未认证 | +| 403 | 无权限 | +| 404 | 资源不存在 | +| 500 | 服务器错误 | + +--- + +## 通用响应格式 + +### 成功响应 + +```json +{ + "data": { ... } +} +``` + +### 错误响应 + +```json +{ + "detail": "错误描述" +} +``` + +### 列表响应 + +```json +{ + "total": 100, + "page": 1, + "limit": 20, + "list": [ ... 
] +} +``` diff --git a/api/docs/model-access.md b/api/docs/model-access.md new file mode 100644 index 0000000..d68f163 --- /dev/null +++ b/api/docs/model-access.md @@ -0,0 +1,412 @@ +# 模型接入 (Model Access) API + +模型接入 API 用于管理 LLM、Embedding、Rerank、ASR、TTS 等 AI 模型的配置。 + +## 基础信息 + +| 项目 | 值 | +|------|-----| +| Base URL | `/api/v1/models` | +| 认证方式 | Bearer Token (预留) | + +--- + +## 数据模型 + +### LLMModel + +```typescript +interface LLMModel { + id: string; // 模型唯一标识 + user_id: number; // 所属用户ID + name: string; // 模型显示名称 + vendor: string; // 供应商: "OpenAI Compatible" | "SiliconFlow" | "Dify" | "FastGPT" + type: string; // 类型: "text" | "embedding" | "rerank" + base_url: string; // API Base URL + api_key: string; // API Key + model_name?: string; // 实际模型名称 + temperature?: number; // 温度参数 (仅text类型) + context_length?: number; // 上下文长度 (整数) + enabled: boolean; // 是否启用 + created_at: string; + updated_at: string; +} +``` + +### ASRModel (语音识别模型) + +```typescript +interface ASRModel { + id: string; + user_id: number; + name: string; + vendor: string; // "OpenAI Compatible" | "Azure" | "阿里云" | "讯飞" + language: string; // "zh" | "en" | "Multi-lingual" + base_url: string; + api_key: string; + model_name?: string; // 如 "whisper-1", "SenseVoiceSmall" + enabled: boolean; + created_at: string; +} +``` + +### TTSModel (语音合成模型 - 可选) + +```typescript +interface TTSModel { + id: string; + user_id: number; + name: string; + vendor: string; // "Ali" | "Volcano" | "Minimax" | "硅基流动" + language: string; // "zh" | "en" + voice_list?: string[]; // 支持的声音列表 + enabled: boolean; + created_at: string; +} +``` + +--- + +## API 端点 + +### LLM 模型 + +#### 1. 
获取 LLM 模型列表 + +```http +GET /api/v1/models/llm +``` + +**Query Parameters:** + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| type | string | 否 | 过滤类型: "text" \| "embedding" \| "rerank" | +| vendor | string | 否 | 过滤供应商 | +| enabled | boolean | 否 | 过滤启用状态 | + +**Response:** + +```json +{ + "total": 5, + "list": [ + { + "id": "llm_001", + "user_id": 1, + "name": "GPT-4o", + "vendor": "OpenAI Compatible", + "type": "text", + "base_url": "https://api.openai.com/v1", + "api_key": "sk-***", + "model_name": "gpt-4o", + "temperature": 0.7, + "context_length": 128000, + "enabled": true, + "created_at": "2024-01-15T10:30:00Z" + }, + { + "id": "emb_001", + "user_id": 1, + "name": "Embedding-3-Small", + "vendor": "OpenAI Compatible", + "type": "embedding", + "base_url": "https://api.openai.com/v1", + "api_key": "sk-***", + "model_name": "text-embedding-3-small", + "enabled": true + } + ] +} +``` + +#### 2. 获取单个 LLM 模型详情 + +```http +GET /api/v1/models/llm/{id} +``` + +#### 3. 创建 LLM 模型 + +```http +POST /api/v1/models/llm +``` + +**Request Body:** + +```json +{ + "name": "GPT-4o", + "vendor": "OpenAI Compatible", + "type": "text", + "base_url": "https://api.openai.com/v1", + "api_key": "sk-your-api-key", + "model_name": "gpt-4o", + "temperature": 0.7, + "context_length": 128000, + "enabled": true +} +``` + +#### 4. 更新 LLM 模型 + +```http +PUT /api/v1/models/llm/{id} +``` + +**Request Body:** (部分更新) + +```json +{ + "name": "GPT-4o-Updated", + "temperature": 0.8, + "enabled": false +} +``` + +#### 5. 删除 LLM 模型 + +```http +DELETE /api/v1/models/llm/{id} +``` + +#### 6. 测试 LLM 模型连接 + +```http +POST /api/v1/models/llm/{id}/test +``` + +**Response:** + +```json +{ + "success": true, + "latency_ms": 150, + "message": "Connection successful" +} +``` + +--- + +### ASR 模型 + +#### 1. 
获取 ASR 模型列表 + +```http +GET /api/v1/models/asr +``` + +**Response:** + +```json +{ + "total": 3, + "list": [ + { + "id": "asr_001", + "user_id": 1, + "name": "Whisper-1", + "vendor": "OpenAI Compatible", + "language": "Multi-lingual", + "base_url": "https://api.openai.com/v1", + "api_key": "sk-***", + "model_name": "whisper-1", + "enabled": true, + "created_at": "2024-01-15T10:30:00Z" + }, + { + "id": "asr_002", + "user_id": 1, + "name": "SenseVoice-Small", + "vendor": "OpenAI Compatible", + "language": "zh", + "base_url": "https://api.speech.ai/v1", + "api_key": "sk-***", + "model_name": "sensevoice-small", + "enabled": true + } + ] +} +``` + +#### 2. 创建 ASR 模型 + +```http +POST /api/v1/models/asr +``` + +**Request Body:** + +```json +{ + "name": "SenseVoice-Small", + "vendor": "OpenAI Compatible", + "language": "zh", + "base_url": "https://api.speech.ai/v1", + "api_key": "sk-your-api-key", + "model_name": "sensevoice-small", + "enabled": true +} +``` + +#### 3. 测试 ASR 模型 + +```http +POST /api/v1/models/asr/{id}/test +``` + +**Request Body:** + +```json +{ + "audio_url": "https://example.com/test.wav" +} +``` + +**Response:** + +```json +{ + "success": true, + "transcript": "测试音频内容", + "language": "zh", + "latency_ms": 500 +} +``` + +--- + +### TTS 模型 (可选) + +#### 1. 获取 TTS 模型列表 + +```http +GET /api/v1/models/tts +``` + +#### 2. 
创建 TTS 模型 + +```http +POST /api/v1/models/tts +``` + +**Request Body:** + +```json +{ + "name": "阿里云语音合成", + "vendor": "Ali", + "language": "zh", + "base_url": "https://nlp.cn-shanghai.aliyuncs.com", + "api_key": "sk-***", + "enabled": true +} +``` + +--- + +## 推荐的 Schema 定义 + +```python +# ============ LLM Model ============ +class LLMModelType(str, Enum): + TEXT = "text" + EMBEDDING = "embedding" + RERANK = "rerank" + +class LLMModelVendor(str, Enum): + OPENAI_COMPATIBLE = "OpenAI Compatible" + SILICONFLOW = "SiliconFlow" + DIFY = "Dify" + FASTGPT = "FastGPT" + +class LLMModelBase(BaseModel): + name: str + vendor: str + type: LLMModelType + base_url: str + api_key: str + model_name: Optional[str] = None + temperature: Optional[float] = None + context_length: Optional[int] = None + enabled: bool = True + +class LLMModelCreate(LLMModelBase): + pass + +class LLMModelUpdate(BaseModel): + name: Optional[str] = None + base_url: Optional[str] = None + api_key: Optional[str] = None + model_name: Optional[str] = None + temperature: Optional[float] = None + context_length: Optional[int] = None + enabled: Optional[bool] = None + +class LLMModelOut(LLMModelBase): + id: str + user_id: int + created_at: datetime + updated_at: datetime + + class Config: + from_attributes = True + +# ============ ASR Model ============ +class ASRModelBase(BaseModel): + name: str + vendor: str + language: str # "zh" | "en" | "Multi-lingual" + base_url: str + api_key: str + model_name: Optional[str] = None + enabled: bool = True + +class ASRModelCreate(ASRModelBase): + pass + +class ASRModelOut(ASRModelBase): + id: str + user_id: int + created_at: datetime + + class Config: + from_attributes = True +``` + +--- + +## 供应商配置示例 + +### OpenAI Compatible + +```json +{ + "vendor": "OpenAI Compatible", + "base_url": "https://api.openai.com/v1", + "api_key": "sk-xxx", + "model_name": "gpt-4o" +} +``` + +### SiliconFlow + +```json +{ + "vendor": "SiliconFlow", + "base_url": "https://api.siliconflow.com/v1", 
+ "api_key": "sf-xxx", + "model_name": "deepseek-v3" +} +``` + +### Dify + +```json +{ + "vendor": "Dify", + "base_url": "https://your-dify.domain.com/v1", + "api_key": "app-xxx", + "model_name": "gpt-4" +} +``` diff --git a/api/docs/speech-recognition.md b/api/docs/speech-recognition.md new file mode 100644 index 0000000..fe97f0e --- /dev/null +++ b/api/docs/speech-recognition.md @@ -0,0 +1,364 @@ +# 语音识别 (Speech Recognition / ASR) API + +语音识别 API 用于管理语音识别模型的配置和调用。 + +## 基础信息 + +| 项目 | 值 | +|------|-----| +| Base URL | `/api/v1/asr` | +| 认证方式 | Bearer Token (预留) | + +--- + +## 数据模型 + +### ASRConfig + +```typescript +interface ASRConfig { + id: string; // 配置ID + user_id: number; // 所属用户ID + name: string; // 配置名称 + vendor: string; // 供应商 + language: string; // 识别语言 + base_url: string; // API地址 + api_key: string; // API密钥 + model_name?: string; // 模型名称 + hotwords?: string[]; // 热词增强 + enable_punctuation: boolean; // 是否启用标点 + enable_normalization: boolean; // 是否启用文本规范化 + enabled: boolean; + created_at: string; +} +``` + +--- + +## API 端点 + +### 1. 获取 ASR 配置列表 + +```http +GET /api/v1/asr +``` + +**Query Parameters:** + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| language | string | 否 | 过滤语言 | +| vendor | string | 否 | 过滤供应商 | +| enabled | boolean | 否 | 过滤启用状态 | + +**Response:** + +```json +{ + "total": 3, + "list": [ + { + "id": "asr_001", + "user_id": 1, + "name": "Whisper 多语种识别", + "vendor": "OpenAI Compatible", + "language": "Multi-lingual", + "base_url": "https://api.openai.com/v1", + "api_key": "sk-***", + "model_name": "whisper-1", + "enable_punctuation": true, + "enable_normalization": true, + "enabled": true, + "created_at": "2024-01-15T10:30:00Z" + }, + { + "id": "asr_002", + "user_id": 1, + "name": "SenseVoice 中文识别", + "vendor": "OpenAI Compatible", + "language": "zh", + "base_url": "https://api.speech.ai/v1", + "api_key": "sk-***", + "model_name": "sensevoice-small", + "hotwords": ["小助手", "帮我"], + "enabled": true + } + ] +} +``` + +--- + +### 2. 
获取单个 ASR 配置详情 + +```http +GET /api/v1/asr/{id} +``` + +**Response:** + +```json +{ + "id": "asr_001", + "user_id": 1, + "name": "Whisper 多语种识别", + "vendor": "OpenAI Compatible", + "language": "Multi-lingual", + "base_url": "https://api.openai.com/v1", + "api_key": "sk-xxx", + "model_name": "whisper-1", + "hotwords": [], + "enable_punctuation": true, + "enable_normalization": true, + "enabled": true, + "created_at": "2024-01-15T10:30:00Z" +} +``` + +--- + +### 3. 创建 ASR 配置 + +```http +POST /api/v1/asr +``` + +**Request Body:** + +```json +{ + "name": "SenseVoice 中文识别", + "vendor": "OpenAI Compatible", + "language": "zh", + "base_url": "https://api.speech.ai/v1", + "api_key": "sk-your-api-key", + "model_name": "sensevoice-small", + "hotwords": ["小助手", "帮我"], + "enable_punctuation": true, + "enable_normalization": true, + "enabled": true +} +``` + +**Fields 说明:** + +| 字段 | 类型 | 必填 | 说明 | +|------|------|------|------| +| name | string | 是 | 配置名称 | +| vendor | string | 是 | 供应商: "OpenAI Compatible" / "Azure" / "阿里云" / "讯飞" | +| language | string | 是 | 语言: "zh" / "en" / "Multi-lingual" | +| base_url | string | 是 | API Base URL | +| api_key | string | 是 | API Key | +| model_name | string | 否 | 模型名称 | +| hotwords | string[] | 否 | 热词列表,提升识别准确率 | +| enable_punctuation | boolean | 否 | 是否输出标点,默认 true | +| enable_normalization | boolean | 否 | 是否文本规范化,默认 true | +| enabled | boolean | 否 | 是否启用,默认 true | + +--- + +### 4. 更新 ASR 配置 + +```http +PUT /api/v1/asr/{id} +``` + +**Request Body:** (部分更新) + +```json +{ + "name": "Whisper-1 优化版", + "language": "zh", + "enable_punctuation": true +} +``` + +--- + +### 5. 删除 ASR 配置 + +```http +DELETE /api/v1/asr/{id} +``` + +--- + +### 6. 
测试 ASR 识别 + +```http +POST /api/v1/asr/{id}/test +``` + +**Request Body:** (`audio_url` 与 `audio_data` 二选一) + +```json +{ + "audio_url": "https://example.com/test-audio.wav", + "audio_data": "base64_encoded_audio" +} +``` + +**Response:** + +```json +{ + "success": true, + "transcript": "您好,请问有什么可以帮助您?", + "language": "zh", + "confidence": 0.95, + "duration_ms": 3000, + "latency_ms": 450 +} +``` + +--- + +### 7. 实时语音识别 (流式) + +```http +WS /api/v1/asr/{id}/stream +``` + +**连接参数:** + +| 参数 | 类型 | 说明 | +|------|------|------| +| audio_format | string | 音频格式: "pcm" / "mp3" / "wav" | +| sample_rate | int | 采样率: 16000 / 44100 | +| channels | int | 声道数: 1 (单声道) / 2 (立体声) | + +**消息格式:** + +客户端发送 (音频数据): +```json +{ + "type": "audio", + "data": "base64_encoded_audio_chunk" +} +``` + +服务端返回 (识别结果): +```json +{ + "type": "transcript", + "text": "您好", + "is_final": false +} +``` + +```json +{ + "type": "transcript", + "text": "您好,请问有什么可以帮助您?", + "is_final": true +} +``` + +--- + +## 推荐的 Schema 定义 + +```python +# ============ ASR Model ============ +class ASRLanguage(str, Enum): + ZH = "zh" + EN = "en" + MULTILINGUAL = "Multi-lingual" + +class ASRVendor(str, Enum): + OPENAI_COMPATIBLE = "OpenAI Compatible" + AZURE = "Azure" + ALI = "阿里云" + IFLYTEK = "讯飞" + +class ASRModelBase(BaseModel): + name: str + vendor: str + language: str # "zh" | "en" | "Multi-lingual" + base_url: str + api_key: str + model_name: Optional[str] = None + hotwords: List[str] = [] + enable_punctuation: bool = True + enable_normalization: bool = True + enabled: bool = True + +class ASRModelCreate(ASRModelBase): + pass + +class ASRModelUpdate(BaseModel): + name: Optional[str] = None + language: Optional[str] = None + base_url: Optional[str] = None + api_key: Optional[str] = None + model_name: Optional[str] = None + hotwords: Optional[List[str]] = None + enable_punctuation: Optional[bool] = None + enable_normalization: Optional[bool] = None + enabled: Optional[bool] = None + +class ASRModelOut(ASRModelBase): + id: str + user_id: int + 
created_at: datetime + + class Config: + from_attributes = True + +class ASRTestRequest(BaseModel): + audio_url: Optional[str] = None + audio_data: Optional[str] = None # base64 encoded + +class ASRTestResponse(BaseModel): + success: bool + transcript: Optional[str] = None + language: Optional[str] = None + confidence: Optional[float] = None + duration_ms: Optional[int] = None + latency_ms: Optional[int] = None + error: Optional[str] = None +``` + +--- + +## 供应商配置示例 + +### OpenAI Whisper + +```json +{ + "vendor": "OpenAI Compatible", + "base_url": "https://api.openai.com/v1", + "api_key": "sk-xxx", + "model_name": "whisper-1", + "language": "Multi-lingual", + "enable_punctuation": true, + "enable_normalization": true +} +``` + +### 阿里云智能语音 + +```json +{ + "vendor": "阿里云", + "base_url": "https://filetrans.cn-shanghai.aliyuncs.com/v1", + "api_key": "your-access-key-id:your-access-key-secret", + "model_name": "nls.cn-shanghai", + "language": "zh", + "hotwords": ["产品名称", "公司名"] +} +``` + +### 讯飞语音 + +```json +{ + "vendor": "讯飞", + "base_url": "https://iat-api.xfyun.cn/v2/iat", + "api_key": "your-appid:your-api-key", + "model_name": "iat", + "language": "zh", + "enable_punctuation": true +} +``` diff --git a/api/docs/voice-resources.md b/api/docs/voice-resources.md new file mode 100644 index 0000000..23c68ac --- /dev/null +++ b/api/docs/voice-resources.md @@ -0,0 +1,379 @@ +# 声音资源 (Voice Resources) API + +声音资源 API 用于管理 TTS 语音合成的声音配置。 + +## 基础信息 + +| 项目 | 值 | +|------|-----| +| Base URL | `/api/v1/voices` | +| 认证方式 | Bearer Token (预留) | + +--- + +## 数据模型 + +### Voice + +```typescript +interface Voice { + id: string; // 声音ID + user_id?: number; // 所属用户ID (系统声音可为null) + name: string; // 声音名称 + vendor: string; // 供应商: "Ali" | "Volcano" | "Minimax" | "硅基流动" + gender: string; // 性别: "Male" | "Female" + language: string; // 语言: "zh" | "en" + description: string; // 描述 + // 扩展参数 (voice_params) + model?: string; // 语音模型标识 + voice_key?: string; // 厂商voice_key + speed?: number; // 
默认语速 (0.5-2.0) + gain?: number; // 音量增益 (-10~10 dB) + pitch?: number; // 音调调整 + enabled: boolean; + is_system: boolean; // 是否系统预设 + created_at: string; +} +``` + +### VoiceParams + +```typescript +interface VoiceParams { + speed: number; // 语速 0.5-2.0 + gain: number; // 音量增益 -10~10 dB + pitch: number; // 音调调整 +} +``` + +--- + +## API 端点 + +### 1. 获取声音列表 + +```http +GET /api/v1/voices +``` + +**Query Parameters:** + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| vendor | string | 否 | 过滤供应商 | +| gender | string | 否 | 过滤性别: "Male" \| "Female" | +| language | string | 否 | 过滤语言: "zh" \| "en" | +| is_system | boolean | 否 | 是否系统预设 | + +**Response:** + +```json +{ + "total": 15, + "list": [ + { + "id": "voice_001", + "user_id": null, + "name": "晓云", + "vendor": "Ali", + "gender": "Female", + "language": "zh", + "description": "温柔女声,适合客服场景", + "model": "paimeng", + "voice_key": "xiaoyun", + "speed": 1.0, + "gain": 0, + "pitch": 0, + "enabled": true, + "is_system": true, + "created_at": "2024-01-15T10:30:00Z" + }, + { + "id": "voice_002", + "user_id": null, + "name": "Kevin", + "vendor": "Volcano", + "gender": "Male", + "language": "en", + "description": "专业男声,适合商务场景", + "model": "知心", + "voice_key": "kevin_male", + "speed": 1.0, + "gain": 0, + "enabled": true, + "is_system": true + }, + { + "id": "voice_003", + "user_id": 1, + "name": "定制客服女声", + "vendor": "Minimax", + "gender": "Female", + "language": "zh", + "description": "定制的客服女声", + "model": "abcs", + "voice_key": "custom_voice_001", + "speed": 1.1, + "gain": 2, + "enabled": true, + "is_system": false + } + ] +} +``` + +--- + +### 2. 
获取单个声音详情 + +```http +GET /api/v1/voices/{id} +``` + +**Response:** + +```json +{ + "id": "voice_001", + "user_id": null, + "name": "晓云", + "vendor": "Ali", + "gender": "Female", + "language": "zh", + "description": "温柔女声,适合客服场景", + "model": "paimeng", + "voice_key": "xiaoyun", + "speed": 1.0, + "gain": 0, + "pitch": 0, + "enabled": true, + "is_system": true, + "created_at": "2024-01-15T10:30:00Z" +} +``` + +--- + +### 3. 创建声音配置 + +```http +POST /api/v1/voices +``` + +**Request Body:** + +```json +{ + "name": "定制客服女声", + "vendor": "Minimax", + "gender": "Female", + "language": "zh", + "description": "定制的客服女声", + "model": "abcs", + "voice_key": "custom_voice_001", + "speed": 1.1, + "gain": 2, + "pitch": 0, + "enabled": true +} +``` + +**Fields 说明:** + +| 字段 | 类型 | 必填 | 说明 | +|------|------|------|------| +| name | string | 是 | 声音名称 | +| vendor | string | 是 | 供应商 | +| gender | string | 是 | 性别: "Male" \| "Female" | +| language | string | 是 | 语言: "zh" \| "en" | +| description | string | 否 | 描述信息 | +| model | string | 是 | 厂商语音模型标识 | +| voice_key | string | 是 | 厂商voice_key | +| speed | number | 否 | 默认语速 (0.5-2.0),默认 1.0 | +| gain | number | 否 | 音量增益 (-10~10 dB),默认 0 | +| pitch | number | 否 | 音调调整,默认 0 | +| enabled | boolean | 否 | 是否启用,默认 true | + +--- + +### 4. 更新声音配置 + +```http +PUT /api/v1/voices/{id} +``` + +**Request Body:** (部分更新) + +```json +{ + "name": "优化版客服女声", + "speed": 1.15, + "gain": 1 +} +``` + +--- + +### 5. 删除声音配置 + +```http +DELETE /api/v1/voices/{id} +``` + +**注意:** 系统预设声音不可删除 + +--- + +### 6. 预览声音 + +```http +POST /api/v1/voices/{id}/preview +``` + +**Request Body:** + +```json +{ + "text": "您好,请问有什么可以帮助您?", + "speed": 1.0, + "gain": 0 +} +``` + +**Response:** + +```json +{ + "success": true, + "audio_url": "https://storage.example.com/preview/voice_001_preview.mp3", + "duration_ms": 2500 +} +``` + +--- + +### 7. 
获取供应商声音列表 + +```http +GET /api/v1/voices/vendors/{vendor}/available +``` + +**Path Parameters:** + +| 参数 | 类型 | 说明 | +|------|------|------| +| vendor | string | 供应商名称 | + +**Response:** + +```json +{ + "vendor": "Ali", + "voices": [ + { + "model": "paimeng", + "voice_key": "xiaoyun", + "name": "晓云", + "gender": "Female", + "language": "zh" + }, + { + "model": "paimeng", + "voice_key": "guang", + "name": "广志", + "gender": "Male", + "language": "zh" + }, + { + "model": "maimeng", + "voice_key": "sijia", + "name": "思佳", + "gender": "Female", + "language": "zh" + } + ] +} +``` + +--- + +## 推荐的 Schema 定义 + +```python +# ============ Voice ============ +class VoiceGender(str, Enum): + MALE = "Male" + FEMALE = "Female" + +class VoiceBase(BaseModel): + name: str + vendor: str + gender: str + language: str # "zh" | "en" + description: str = "" + model: str + voice_key: str + speed: float = 1.0 + gain: int = 0 + pitch: int = 0 + enabled: bool = True + +class VoiceCreate(VoiceBase): + pass + +class VoiceUpdate(BaseModel): + name: Optional[str] = None + description: Optional[str] = None + speed: Optional[float] = None + gain: Optional[int] = None + pitch: Optional[int] = None + enabled: Optional[bool] = None + +class VoiceOut(VoiceBase): + id: str + user_id: Optional[int] = None + is_system: bool = False + created_at: datetime + + class Config: + from_attributes = True + +class VoicePreviewRequest(BaseModel): + text: str + speed: Optional[float] = None + gain: Optional[int] = None + pitch: Optional[int] = None + +class VoicePreviewResponse(BaseModel): + success: bool + audio_url: Optional[str] = None + duration_ms: Optional[int] = None + error: Optional[str] = None +``` + +--- + +## 供应商声音示例 + +### 阿里云 + +| voice_key | name | gender | language | description | +|-----------|------|--------|----------|-------------| +| xiaoyun | 晓云 | Female | zh | 温柔女声 | +| guang | 广志 | Male | zh | 磁性男声 | +| sijia | 思佳 | Female | zh | 知性女声 | +| yunxiang | 云翔 | Male | zh | 活力男声 | + +### 火山引擎 + +| 
voice_key | name | gender | language | description | +|-----------|------|--------|----------|-------------| +| doubao | 豆包 | Female | zh | 活泼女声 | +| kevin | Kevin | Male | en | 专业男声 | +| lucy | Lucy | Female | en | 甜美女声 | + +### MiniMax + +| voice_key | name | gender | language | description | +|-----------|------|--------|----------|-------------| +| abby | Abby | Female | en | 自然女声 | +| john | John | Male | en | 成熟男声 | diff --git a/web/package-lock.json b/web/package-lock.json index 512869b..f0f5943 100644 --- a/web/package-lock.json +++ b/web/package-lock.json @@ -1,11 +1,11 @@ { - "name": "ai-videoassistant", + "name": "ai视频助手", "version": "0.0.0", "lockfileVersion": 3, "requires": true, "packages": { "": { - "name": "ai-videoassistant", + "name": "ai视频助手", "version": "0.0.0", "dependencies": { "@google/genai": "^1.39.0",