Improve KB upload

This commit is contained in:
Xin Wang
2026-02-10 10:50:47 +08:00
parent 375181a524
commit 30f757529f
3 changed files with 177 additions and 22 deletions

View File

@@ -1,8 +1,10 @@
from fastapi import APIRouter, Depends, HTTPException, Query from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form, Request
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from typing import Optional from typing import Optional
import uuid import uuid
import os import os
import json
from io import BytesIO
from datetime import datetime from datetime import datetime
from ..db import get_db from ..db import get_db
@@ -20,6 +22,57 @@ from ..vector_store import (
router = APIRouter(prefix="/knowledge", tags=["knowledge"]) router = APIRouter(prefix="/knowledge", tags=["knowledge"])
def _refresh_kb_stats(db: Session, kb_id: str) -> None:
    """Recompute the stored document/chunk totals of one knowledge base.

    Only documents whose ``status`` is ``"completed"`` contribute to the
    totals. The function assigns the new values to the knowledge-base row
    but does not commit; the caller is expected to commit the session.

    Args:
        db: Session used to look up the knowledge base and its documents.
        kb_id: Identifier of the knowledge base to refresh.
    """
    knowledge_base = (
        db.query(KnowledgeBase).filter(KnowledgeBase.id == kb_id).first()
    )
    if not knowledge_base:
        # Nothing to update when the knowledge base does not exist.
        return

    documents = (
        db.query(KnowledgeDocument).filter(KnowledgeDocument.kb_id == kb_id).all()
    )

    doc_total = 0
    chunk_total = 0
    for document in documents:
        if document.status != "completed":
            continue
        doc_total += 1
        # A missing (None) or negative chunk count contributes zero.
        chunk_total += max(0, document.chunk_count or 0)

    knowledge_base.doc_count = doc_total
    knowledge_base.chunk_count = chunk_total
def _decode_text_bytes(raw: bytes) -> str:
for encoding in ("utf-8", "utf-8-sig", "gb18030", "gbk", "latin-1"):
try:
return raw.decode(encoding)
except UnicodeDecodeError:
continue
return raw.decode("utf-8", errors="ignore")
def _extract_text_from_upload(filename: str, content_type: Optional[str], raw: bytes) -> str:
ext = os.path.splitext((filename or "").lower())[1]
if ext in {".txt", ".md", ".csv"}:
return _decode_text_bytes(raw)
if ext == ".json":
try:
parsed = json.loads(_decode_text_bytes(raw))
return json.dumps(parsed, ensure_ascii=False, indent=2)
except Exception:
return _decode_text_bytes(raw)
if ext == ".pdf":
try:
from pypdf import PdfReader # type: ignore
except Exception as exc:
raise ValueError("PDF parsing requires installing pypdf") from exc
reader = PdfReader(BytesIO(raw))
return "\n".join((page.extract_text() or "") for page in reader.pages).strip()
if ext == ".docx":
try:
from docx import Document # type: ignore
except Exception as exc:
raise ValueError("DOCX parsing requires installing python-docx") from exc
doc = Document(BytesIO(raw))
return "\n".join(p.text for p in doc.paragraphs).strip()
if ext == ".doc":
raise ValueError("DOC format is not supported for auto indexing. Please convert to DOCX/TXT.")
# fallback: attempt plain text decode
if (content_type or "").startswith("text/"):
return _decode_text_bytes(raw)
raise ValueError(f"Unsupported file type for auto indexing: {ext or content_type or 'unknown'}")
def kb_to_dict(kb: KnowledgeBase) -> dict: def kb_to_dict(kb: KnowledgeBase) -> dict:
return { return {
"id": kb.id, "id": kb.id,
@@ -191,20 +244,93 @@ def delete_knowledge_base(kb_id: str, db: Session = Depends(get_db)):
# ============ Documents ============ # ============ Documents ============
@router.post("/bases/{kb_id}/documents") @router.post("/bases/{kb_id}/documents")
def upload_document( async def upload_document(
kb_id: str, kb_id: str,
file: Optional[UploadFile] = File(default=None),
name: Optional[str] = Form(default=None),
size: Optional[str] = Form(default=None),
file_type: Optional[str] = Form(default=None),
storage_url: Optional[str] = Form(default=None),
data: Optional[KnowledgeDocumentCreate] = None, data: Optional[KnowledgeDocumentCreate] = None,
name: Optional[str] = Query(default=None), request: Request = None,
size: Optional[str] = Query(default=None),
file_type: Optional[str] = Query(default=None),
storage_url: Optional[str] = Query(default=None),
db: Session = Depends(get_db) db: Session = Depends(get_db)
): ):
kb = db.query(KnowledgeBase).filter(KnowledgeBase.id == kb_id).first() kb = db.query(KnowledgeBase).filter(KnowledgeBase.id == kb_id).first()
if not kb: if not kb:
raise HTTPException(status_code=404, detail="Knowledge base not found") raise HTTPException(status_code=404, detail="Knowledge base not found")
# New mode: multipart file upload with automatic indexing
if file is not None:
filename = file.filename or "uploaded.txt"
file_type_value = file.content_type or file_type or "application/octet-stream"
raw = file.file.read()
if not raw:
raise HTTPException(status_code=400, detail="Uploaded file is empty")
doc = KnowledgeDocument(
id=str(uuid.uuid4())[:8],
kb_id=kb_id,
name=filename,
size=f"{len(raw)} bytes",
file_type=file_type_value,
storage_url=storage_url,
status="processing",
upload_date=datetime.utcnow().isoformat()
)
db.add(doc)
db.commit()
db.refresh(doc)
try:
if vector_store.get_collection(kb_id) is None:
vector_store.create_collection(kb_id, kb.embedding_model)
text = _extract_text_from_upload(filename, file.content_type, raw)
if not text.strip():
raise ValueError("No textual content extracted from file")
chunk_count = index_document(kb_id, doc.id, text)
doc.status = "completed"
doc.chunk_count = chunk_count
doc.processed_at = datetime.utcnow()
doc.error_message = None
_refresh_kb_stats(db, kb_id)
db.commit()
return {
"id": doc.id,
"name": doc.name,
"size": doc.size,
"fileType": doc.file_type,
"storageUrl": doc.storage_url,
"status": doc.status,
"chunkCount": doc.chunk_count,
"message": "Document uploaded and indexed",
}
except ValueError as exc:
doc.status = "failed"
doc.error_message = str(exc)
_refresh_kb_stats(db, kb_id)
db.commit()
raise HTTPException(status_code=400, detail=str(exc)) from exc
except Exception as exc:
doc.status = "failed"
doc.error_message = str(exc)
_refresh_kb_stats(db, kb_id)
db.commit()
raise HTTPException(status_code=500, detail=f"Failed to index uploaded file: {exc}") from exc
# Backward-compatible mode: metadata-only document creation
if data is None: if data is None:
if not name and not size and request is not None:
try:
raw_payload = await request.json()
if isinstance(raw_payload, dict):
name = raw_payload.get("name")
size = raw_payload.get("size")
file_type = raw_payload.get("fileType") or raw_payload.get("file_type") or file_type
storage_url = raw_payload.get("storageUrl") or raw_payload.get("storage_url") or storage_url
except Exception:
pass
if not name or not size: if not name or not size:
raise HTTPException(status_code=422, detail="name and size are required") raise HTTPException(status_code=422, detail="name and size are required")
data = KnowledgeDocumentCreate( data = KnowledgeDocumentCreate(
@@ -266,21 +392,21 @@ def index_document_content(kb_id: str, doc_id: str, request: DocumentIndexReques
db.commit() db.commit()
try: try:
if vector_store.get_collection(kb_id) is None:
kb = db.query(KnowledgeBase).filter(KnowledgeBase.id == kb_id).first()
vector_store.create_collection(kb_id, kb.embedding_model if kb else "text-embedding-3-small")
chunk_count = index_document(kb_id, doc_id, request.content) chunk_count = index_document(kb_id, doc_id, request.content)
doc.status = "completed" doc.status = "completed"
doc.chunk_count = chunk_count doc.chunk_count = chunk_count
doc.processed_at = datetime.utcnow() doc.processed_at = datetime.utcnow()
kb = db.query(KnowledgeBase).filter(KnowledgeBase.id == kb_id).first() doc.error_message = None
kb.doc_count = db.query(KnowledgeDocument).filter( _refresh_kb_stats(db, kb_id)
KnowledgeDocument.kb_id == kb_id,
KnowledgeDocument.status == "completed"
).count()
kb.chunk_count += chunk_count
db.commit() db.commit()
return {"message": "Document indexed", "chunkCount": chunk_count} return {"message": "Document indexed", "chunkCount": chunk_count}
except Exception as e: except Exception as e:
doc.status = "failed" doc.status = "failed"
doc.error_message = str(e) doc.error_message = str(e)
_refresh_kb_stats(db, kb_id)
db.commit() db.commit()
raise HTTPException(status_code=500, detail=str(e)) raise HTTPException(status_code=500, detail=str(e))
@@ -297,11 +423,8 @@ def delete_document(kb_id: str, doc_id: str, db: Session = Depends(get_db)):
delete_document_from_vector(kb_id, doc_id) delete_document_from_vector(kb_id, doc_id)
except Exception: except Exception:
pass pass
kb = db.query(KnowledgeBase).filter(KnowledgeBase.id == kb_id).first()
if kb:
kb.chunk_count = max(0, kb.chunk_count - (doc.chunk_count or 0))
kb.doc_count = max(0, kb.doc_count - 1)
db.delete(doc) db.delete(doc)
_refresh_kb_stats(db, kb_id)
db.commit() db.commit()
return {"message": "Deleted successfully"} return {"message": "Deleted successfully"}

View File

@@ -120,6 +120,19 @@ class TestKnowledgeAPI:
assert "id" in data assert "id" in data
assert data["status"] == "pending" assert data["status"] == "pending"
def test_upload_file_auto_index(self, client):
    """Uploading an actual file should index it and report completion."""
    # Create a knowledge base to upload into.
    kb_id = client.post("/api/knowledge/bases", json={"name": "Auto Index KB"}).json()["id"]

    # Send a small plain-text file as a multipart upload.
    body = "Line one about product.\nLine two about warranty."
    upload = {"file": ("auto-index.txt", body.encode("utf-8"), "text/plain")}
    response = client.post(f"/api/knowledge/bases/{kb_id}/documents", files=upload)

    assert response.status_code == 200
    payload = response.json()
    # The document must be indexed immediately, with at least one chunk.
    assert payload["status"] == "completed"
    assert payload["chunkCount"] >= 1
def test_delete_document(self, client): def test_delete_document(self, client):
"""Test deleting a document from knowledge base""" """Test deleting a document from knowledge base"""
# Create KB first # Create KB first

View File

@@ -611,12 +611,31 @@ export const deleteKnowledgeBase = async (kbId: string): Promise<void> => {
}; };
export const uploadKnowledgeDocument = async (kbId: string, file: File): Promise<void> => { export const uploadKnowledgeDocument = async (kbId: string, file: File): Promise<void> => {
const payload = { const formData = new FormData();
name: file.name, formData.append('file', file);
size: `${(file.size / 1024).toFixed(1)} KB`, formData.append('name', file.name);
fileType: file.type || 'txt', formData.append('size', `${file.size} bytes`);
}; formData.append('file_type', file.type || 'application/octet-stream');
await apiRequest(`/knowledge/bases/${kbId}/documents`, { method: 'POST', body: payload });
const base = (import.meta.env.VITE_API_BASE_URL || 'http://127.0.0.1:8100/api').replace(/\/+$/, '');
const url = `${base}/knowledge/bases/${kbId}/documents`;
const response = await fetch(url, {
method: 'POST',
body: formData,
});
if (!response.ok) {
let message = `Upload failed: ${response.status}`;
try {
const data = await response.json();
if (data?.detail) {
message = typeof data.detail === 'string' ? data.detail : message;
}
} catch {
// ignore parse error
}
throw new Error(message);
}
}; };
export const deleteKnowledgeDocument = async (kbId: string, docId: string): Promise<void> => { export const deleteKnowledgeDocument = async (kbId: string, docId: string): Promise<void> => {