refactor(rag): 梳理文档入库与检索服务结构

2026-05-30 00:44:52 +08:00
parent f68b44f325
commit ccfe5eb667
6 changed files with 284 additions and 103 deletions
--- a/agent_core/rag/ingest.py
+++ b/agent_core/rag/ingest.py
@@ -1,6 +1,6 @@
+import importlib.util
 import json
 import re
-import importlib.util
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from pathlib import Path
@@ -12,11 +12,61 @@ from .chroma_store import upsert_chunks

@dataclass
 class IngestResult:
+    """RAG 入库统一返回结构，供 Documents 模块稳定消费。"""
    success: bool
    chunks_count: int = 0
    error: str = ""


+def ingest_document(
+    scenario_id: str,
+    source_file: str,
+    text: str,
+    collection: str,
+    document_id: int | None = None,
+    store_path: str | Path | None = None,
+) -> IngestResult:
+    """
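+    Split a single document's text into chunks and write them to the knowledge base.
+
+    Storage strategy:
+    - If `store_path` is passed explicitly (test or fallback mode), use the local JSON store at that path.
+    - If omitted, persist to the real Chroma store when chromadb is importable; otherwise use the default local JSON store.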
+    """
+    if not text.strip():
+        return IngestResult(success=False, error="文档内容为空")
+    if _should_use_chroma(store_path):
+        return _ingest_chroma_document(
+            document_id=document_id,
+            scenario_id=scenario_id,
+            source_file=source_file,
+            text=text,
+            collection=collection,
+        )
+    resolved_store_path = Path(store_path) if store_path else _default_store_path()
+    chunks = _build_chunks(
+        scenario_id=scenario_id,
+        source_file=source_file,
+        text=text,
+        collection=collection,
+        document_id=document_id,
+        chunk_id_prefix=source_file,
+    )
+    persisted_chunks = _filter_out_same_document_chunks(
+        _load_store(resolved_store_path),
+        scenario_id=scenario_id,
+        collection=collection,
+        document_id=document_id,
+    )
+    _save_store(resolved_store_path, [*persisted_chunks, *chunks])
+    return IngestResult(success=True, chunks_count=len(chunks))
+
+
+def _should_use_chroma(store_path: str | Path | None) -> bool:
+    """只在未指定测试存储路径且安装 chromadb 时启用真实向量库。"""
+    return store_path is None and importlib.util.find_spec("chromadb") is not None
+
+
 def _default_store_path() -> Path:
    return Path(settings.CHROMA_PATH) / "rag_store.json"

@@ -35,6 +85,13 @@ def _save_store(store_path: Path, chunks: list[dict]) -> None:


 def _split_text(text: str, chunk_size: int = 800, overlap: int = 120) -> list[str]:
+    """
+    使用固定窗口 + overlap 切分文本。
+
+    该策略简单但稳定，便于解释：
+    - chunk_size 控制每个片段最大长度
+    - overlap 保证相邻片段共享上下文，降低边界信息丢失
+    """
    normalized = re.sub(r"\s+", " ", text).strip()
    if not normalized:
        return []
@@ -49,44 +106,46 @@ def _split_text(text: str, chunk_size: int = 800, overlap: int = 120) -> list[st
    return chunks


-def ingest_document(
+def _build_chunks(
    scenario_id: str,
    source_file: str,
    text: str,
    collection: str,
-    document_id: int | None = None,
-    store_path: str | Path | None = None,
-) -> IngestResult:
-    if not text.strip():
-        return IngestResult(success=False, error="文档内容为空")
-    if store_path is None and importlib.util.find_spec("chromadb") is not None:
-        return _ingest_chroma_document(document_id, scenario_id, source_file, text, collection)
-    resolved_store_path = Path(store_path) if store_path else _default_store_path()
-    existing_chunks = [
+    document_id: int | None,
+    chunk_id_prefix: str,
+) -> list[dict]:
+    """把原始文本切分并封装为统一 chunk 结构。"""
+    created_at = datetime.now(timezone.utc).isoformat()
+    return [
+        {
+            "scenario_id": scenario_id,
+            "document_id": document_id,
+            "collection": collection,
+            "source": source_file,
+            "chunk_id": f"{scenario_id}:{chunk_id_prefix}:{index}",
+            "content": chunk_text,
+            "created_at": created_at,
+        }
+        for index, chunk_text in enumerate(_split_text(text), start=1)
+    ]
+
+
+def _filter_out_same_document_chunks(
+    chunks: list[dict],
+    scenario_id: str,
+    collection: str,
+    document_id: int | None,
+) -> list[dict]:
+    """重新入库同一 document_id 时，先删除旧 chunk，避免重复检索。"""
+    return [
        chunk
-        for chunk in _load_store(resolved_store_path)
+        for chunk in chunks
        if not (
            chunk.get("document_id") == document_id
            and chunk.get("scenario_id") == scenario_id
            and chunk.get("collection") == collection
        )
    ]
-    created_at = datetime.now(timezone.utc).isoformat()
-    new_chunks = []
-    for index, chunk_text in enumerate(_split_text(text), start=1):
-        new_chunks.append(
-            {
-                "scenario_id": scenario_id,
-                "document_id": document_id,
-                "collection": collection,
-                "source": source_file,
-                "chunk_id": f"{scenario_id}:{source_file}:{index}",
-                "content": chunk_text,
-                "created_at": created_at,
-            }
-        )
-    _save_store(resolved_store_path, [*existing_chunks, *new_chunks])
-    return IngestResult(success=True, chunks_count=len(new_chunks))


 def _ingest_chroma_document(
@@ -96,19 +155,15 @@ def _ingest_chroma_document(
    text: str,
    collection: str,
 ) -> IngestResult:
-    created_at = datetime.now(timezone.utc).isoformat()
-    chunks = [
-        {
-            "scenario_id": scenario_id,
-            "document_id": document_id,
-            "collection": collection,
-            "source": source_file,
-            "chunk_id": f"{scenario_id}:{document_id or source_file}:{index}",
-            "content": chunk_text,
-            "created_at": created_at,
-        }
-        for index, chunk_text in enumerate(_split_text(text), start=1)
-    ]
+    """真实 Chroma 模式的入库分支。"""
+    chunks = _build_chunks(
+        scenario_id=scenario_id,
+        source_file=source_file,
+        text=text,
+        collection=collection,
+        document_id=document_id,
+        chunk_id_prefix=str(document_id or source_file),
+    )
    try:
        upsert_chunks(collection=collection, chunks=chunks)
    except Exception as exc: