refactor(rag): 梳理文档入库与检索服务结构
This commit is contained in:
@@ -6,6 +6,7 @@ from agent_core.llm_provider import create_embedding_provider
|
||||
|
||||
|
||||
def _client(path: str | Path | None = None):
|
||||
"""按给定路径初始化 Chroma 持久化客户端。"""
|
||||
import chromadb
|
||||
|
||||
resolved_path = str(path or settings.CHROMA_PATH)
|
||||
@@ -13,6 +14,7 @@ def _client(path: str | Path | None = None):
|
||||
|
||||
|
||||
def _embedding_provider():
|
||||
"""从 Django settings 构造 Embedding Provider,避免在业务层散落配置读取。"""
|
||||
return create_embedding_provider(
|
||||
{
|
||||
"EMBEDDING_API_KEY": settings.EMBEDDING_API_KEY,
|
||||
@@ -27,6 +29,11 @@ def upsert_chunks(
|
||||
chunks: list[dict],
|
||||
store_path: str | Path | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
将 chunk 写入 Chroma。
|
||||
|
||||
同一 document_id 重新入库前会先删除旧记录,保证每个文档只有一份有效向量数据。
|
||||
"""
|
||||
client = _client(store_path)
|
||||
chroma_collection = client.get_or_create_collection(collection)
|
||||
document_ids = {chunk["document_id"] for chunk in chunks if chunk.get("document_id") is not None}
|
||||
@@ -59,6 +66,7 @@ def query_chunks(
|
||||
document_ids: list[int] | None = None,
|
||||
store_path: str | Path | None = None,
|
||||
) -> list[dict]:
|
||||
"""执行向量检索,并把 Chroma 原始结果转换为统一引用结构。"""
|
||||
client = _client(store_path)
|
||||
chroma_collection = client.get_or_create_collection(collection)
|
||||
where: dict = {"scenario_id": scenario_id}
|
||||
|
||||
Reference in New Issue
Block a user