refactor(rag): 梳理文档入库与检索服务结构

This commit is contained in:
2026-05-30 00:44:52 +08:00
parent f68b44f325
commit ccfe5eb667
6 changed files with 284 additions and 103 deletions

View File

@@ -1,5 +1,5 @@
from agent_core.orchestrator import build_messages, run_agent
from agent_core.rag.ingest import ingest_document
from agent_core.rag.ingest import _split_text, ingest_document
from agent_core.rag.retriever import retrieve
@@ -221,3 +221,30 @@ def test_run_agent_uses_retrieved_document_chunks(tmp_path):
assert result.references[0]["source"] == "sop.md"
assert "隔离现场" in result.references[0]["content"]
def test_rag_split_text_keeps_overlap_and_non_empty_chunks():
    """Check the chunk list ``_split_text`` produces for a 20-character input.

    With ``chunk_size=8`` and ``overlap=3`` the expected result is three
    full 8-character chunks followed by a 5-character tail, with no empty
    chunk at the end.
    """
    source_text = "A" * 20
    expected_chunks = ["AAAAAAAA", "AAAAAAAA", "AAAAAAAA", "AAAAA"]

    actual_chunks = _split_text(source_text, chunk_size=8, overlap=3)

    assert actual_chunks == expected_chunks
def test_retrieve_returns_empty_when_query_has_no_overlap(tmp_path):
    """Check that ``retrieve`` yields an empty list for an unrelated query.

    A single document is ingested into a store file under ``tmp_path`` and
    then queried with text on a different topic; the result must equal
    ``[]``.
    """
    # The same identifier is used for both the scenario and the collection.
    scenario = "knowledge_qa"
    rag_store = tmp_path / "rag_store.json"

    # The ingested text is about an expense/approval process ...
    ingest_document(
        scenario_id=scenario,
        source_file="rules.md",
        text="这里描述的是报销流程和审批链。",
        collection=scenario,
        store_path=rag_store,
    )

    # ... while the query asks about equipment inspection.
    matches = retrieve(
        scenario_id=scenario,
        query="设备点检",
        collection=scenario,
        top_k=3,
        store_path=rag_store,
    )

    assert matches == []

View File

@@ -3,7 +3,7 @@ from django.urls import reverse
from apps.documents.forms import DocumentUploadForm
from apps.documents.models import UploadedDocument
from apps.documents.services import extract_text
from apps.documents.services import extract_text, index_document
def test_upload_txt_document_creates_uploaded_record(client, db):
@@ -128,3 +128,20 @@ def test_index_failure_message_is_visible_on_document_list(client, db, monkeypat
assert response.status_code == 200
assert "文档入库失败,请检查错误原因后重试" in content
assert "模拟入库失败" in content
def test_index_document_marks_failed_when_extracted_text_is_empty(db, monkeypatch):
    """Check that indexing a document with blank extracted text fails.

    ``extract_text`` in ``apps.documents.services`` is replaced with a stub
    that returns only whitespace. ``index_document`` must then return a
    document whose status is ``STATUS_FAILED`` and whose error message
    mentions that the content is empty.
    """

    def _blank_extract(target):
        # Stand-in for the real extractor: always yields whitespace only.
        return " "

    record = UploadedDocument.objects.create(
        scenario_id="knowledge_qa",
        original_name="empty.md",
        file_type="md",
        size=0,
        status="uploaded",
    )
    monkeypatch.setattr("apps.documents.services.extract_text", _blank_extract)

    updated_document = index_document(record)

    assert updated_document.status == UploadedDocument.STATUS_FAILED
    assert "文档内容为空" in updated_document.error_message