refactor(rag): 梳理文档入库与检索服务结构

This commit is contained in:
2026-05-30 00:44:52 +08:00
parent f68b44f325
commit ccfe5eb667
6 changed files with 284 additions and 103 deletions

View File

@@ -6,6 +6,7 @@ from agent_core.llm_provider import create_embedding_provider
def _client(path: str | Path | None = None):
"""按给定路径初始化 Chroma 持久化客户端。"""
import chromadb
resolved_path = str(path or settings.CHROMA_PATH)
@@ -13,6 +14,7 @@ def _client(path: str | Path | None = None):
def _embedding_provider():
"""从 Django settings 构造 Embedding Provider,避免在业务层散落配置读取。"""
return create_embedding_provider(
{
"EMBEDDING_API_KEY": settings.EMBEDDING_API_KEY,
@@ -27,6 +29,11 @@ def upsert_chunks(
chunks: list[dict],
store_path: str | Path | None = None,
) -> None:
"""
将 chunk 写入 Chroma。
同一 document_id 重新入库前会先删除旧记录,保证一次文档只有一份有效向量数据。
"""
client = _client(store_path)
chroma_collection = client.get_or_create_collection(collection)
document_ids = {chunk["document_id"] for chunk in chunks if chunk.get("document_id") is not None}
@@ -59,6 +66,7 @@ def query_chunks(
document_ids: list[int] | None = None,
store_path: str | Path | None = None,
) -> list[dict]:
"""执行向量检索,并把 Chroma 原始结果转换为统一引用结构。"""
client = _client(store_path)
chroma_collection = client.get_or_create_collection(collection)
where: dict = {"scenario_id": scenario_id}