refactor(rag): 梳理文档入库与检索服务结构

2026-05-30 00:44:52 +08:00
parent f68b44f325
commit ccfe5eb667
6 changed files with 284 additions and 103 deletions
--- a/agent_core/rag/chroma_store.py
+++ b/agent_core/rag/chroma_store.py
@@ -6,6 +6,7 @@ from agent_core.llm_provider import create_embedding_provider


 def _client(path: str | Path | None = None):
+    """按给定路径初始化 Chroma 持久化客户端。"""
    import chromadb

    resolved_path = str(path or settings.CHROMA_PATH)
@@ -13,6 +14,7 @@ def _client(path: str | Path | None = None):


 def _embedding_provider():
+    """从 Django settings 构造 Embedding Provider，避免在业务层散落配置读取。"""
    return create_embedding_provider(
        {
            "EMBEDDING_API_KEY": settings.EMBEDDING_API_KEY,
@@ -27,6 +29,11 @@ def upsert_chunks(
    chunks: list[dict],
    store_path: str | Path | None = None,
 ) -> None:
+    """
+    将 chunk 写入 Chroma。
+
+    同一 document_id 重新入库前会先删除旧记录，保证每个文档只有一份有效向量数据。
+    """
    client = _client(store_path)
    chroma_collection = client.get_or_create_collection(collection)
    document_ids = {chunk["document_id"] for chunk in chunks if chunk.get("document_id") is not None}
@@ -59,6 +66,7 @@ def query_chunks(
    document_ids: list[int] | None = None,
    store_path: str | Path | None = None,
 ) -> list[dict]:
+    """执行向量检索，并把 Chroma 原始结果转换为统一引用结构。"""
    client = _client(store_path)
    chroma_collection = client.get_or_create_collection(collection)
    where: dict = {"scenario_id": scenario_id}