refactor(rag): 梳理文档入库与检索服务结构

This commit is contained in:
2026-05-30 00:44:52 +08:00
parent f68b44f325
commit ccfe5eb667
6 changed files with 284 additions and 103 deletions

View File

@@ -1,6 +1,6 @@
import importlib.util
import json
import re
import importlib.util
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
@@ -12,11 +12,61 @@ from .chroma_store import upsert_chunks
@dataclass
class IngestResult:
"""RAG 入库统一返回结构,供 Documents 模块稳定消费。"""
success: bool
chunks_count: int = 0
error: str = ""
def ingest_document(
scenario_id: str,
source_file: str,
text: str,
collection: str,
document_id: int | None = None,
store_path: str | Path | None = None,
) -> IngestResult:
"""
将单个文档文本切分后写入知识库。
运行策略:
- 如果显式传入 `store_path`,说明当前是测试或降级模式,走本地 JSON 存储。
- 如果未传入且环境可用 chromadb则走真实 Chroma 持久化。
"""
if not text.strip():
return IngestResult(success=False, error="文档内容为空")
if _should_use_chroma(store_path):
return _ingest_chroma_document(
document_id=document_id,
scenario_id=scenario_id,
source_file=source_file,
text=text,
collection=collection,
)
resolved_store_path = Path(store_path) if store_path else _default_store_path()
chunks = _build_chunks(
scenario_id=scenario_id,
source_file=source_file,
text=text,
collection=collection,
document_id=document_id,
chunk_id_prefix=source_file,
)
persisted_chunks = _filter_out_same_document_chunks(
_load_store(resolved_store_path),
scenario_id=scenario_id,
collection=collection,
document_id=document_id,
)
_save_store(resolved_store_path, [*persisted_chunks, *chunks])
return IngestResult(success=True, chunks_count=len(chunks))
def _should_use_chroma(store_path: str | Path | None) -> bool:
"""只在未指定测试存储路径且安装 chromadb 时启用真实向量库。"""
return store_path is None and importlib.util.find_spec("chromadb") is not None
def _default_store_path() -> Path:
return Path(settings.CHROMA_PATH) / "rag_store.json"
@@ -35,6 +85,13 @@ def _save_store(store_path: Path, chunks: list[dict]) -> None:
def _split_text(text: str, chunk_size: int = 800, overlap: int = 120) -> list[str]:
"""
使用固定窗口 + overlap 切分文本。
该策略简单但稳定,便于解释:
- chunk_size 控制每个片段最大长度
- overlap 保证相邻片段共享上下文,降低边界信息丢失
"""
normalized = re.sub(r"\s+", " ", text).strip()
if not normalized:
return []
@@ -49,44 +106,46 @@ def _split_text(text: str, chunk_size: int = 800, overlap: int = 120) -> list[st
return chunks
def ingest_document(
def _build_chunks(
scenario_id: str,
source_file: str,
text: str,
collection: str,
document_id: int | None = None,
store_path: str | Path | None = None,
) -> IngestResult:
if not text.strip():
return IngestResult(success=False, error="文档内容为空")
if store_path is None and importlib.util.find_spec("chromadb") is not None:
return _ingest_chroma_document(document_id, scenario_id, source_file, text, collection)
resolved_store_path = Path(store_path) if store_path else _default_store_path()
existing_chunks = [
document_id: int | None,
chunk_id_prefix: str,
) -> list[dict]:
"""把原始文本切分并封装为统一 chunk 结构。"""
created_at = datetime.now(timezone.utc).isoformat()
return [
{
"scenario_id": scenario_id,
"document_id": document_id,
"collection": collection,
"source": source_file,
"chunk_id": f"{scenario_id}:{chunk_id_prefix}:{index}",
"content": chunk_text,
"created_at": created_at,
}
for index, chunk_text in enumerate(_split_text(text), start=1)
]
def _filter_out_same_document_chunks(
chunks: list[dict],
scenario_id: str,
collection: str,
document_id: int | None,
) -> list[dict]:
"""重新入库同一 document_id 时,先删除旧 chunk避免重复检索。"""
return [
chunk
for chunk in _load_store(resolved_store_path)
for chunk in chunks
if not (
chunk.get("document_id") == document_id
and chunk.get("scenario_id") == scenario_id
and chunk.get("collection") == collection
)
]
created_at = datetime.now(timezone.utc).isoformat()
new_chunks = []
for index, chunk_text in enumerate(_split_text(text), start=1):
new_chunks.append(
{
"scenario_id": scenario_id,
"document_id": document_id,
"collection": collection,
"source": source_file,
"chunk_id": f"{scenario_id}:{source_file}:{index}",
"content": chunk_text,
"created_at": created_at,
}
)
_save_store(resolved_store_path, [*existing_chunks, *new_chunks])
return IngestResult(success=True, chunks_count=len(new_chunks))
def _ingest_chroma_document(
@@ -96,19 +155,15 @@ def _ingest_chroma_document(
text: str,
collection: str,
) -> IngestResult:
created_at = datetime.now(timezone.utc).isoformat()
chunks = [
{
"scenario_id": scenario_id,
"document_id": document_id,
"collection": collection,
"source": source_file,
"chunk_id": f"{scenario_id}:{document_id or source_file}:{index}",
"content": chunk_text,
"created_at": created_at,
}
for index, chunk_text in enumerate(_split_text(text), start=1)
]
"""真实 Chroma 模式的入库分支。"""
chunks = _build_chunks(
scenario_id=scenario_id,
source_file=source_file,
text=text,
collection=collection,
document_id=document_id,
chunk_id_prefix=str(document_id or source_file),
)
try:
upsert_chunks(collection=collection, chunks=chunks)
except Exception as exc: