fix(kb): 完善知识库入库和重建索引
This commit is contained in:
@@ -3,6 +3,7 @@ from django.core.files.uploadedfile import SimpleUploadedFile
|
||||
from django.urls import reverse
|
||||
|
||||
from review_agent.knowledge_base import build_knowledge_base_context, delete_document, search_knowledge_base
|
||||
from review_agent.views import rebuild_knowledge_base_index
|
||||
from review_agent.models import KnowledgeBaseDocument
|
||||
|
||||
|
||||
@@ -16,6 +17,7 @@ def test_knowledge_base_context_reports_rule_and_sources():
|
||||
assert context["rule"]["requirement_count"] > 0
|
||||
assert context["source_count"] > 0
|
||||
assert context["collection_name"] == "nmpa_ivd_registration_v1"
|
||||
assert not any("模拟题二" in source["relative_path"] for source in context["sources"])
|
||||
|
||||
|
||||
def test_knowledge_base_page_requires_login(client):
|
||||
@@ -36,6 +38,11 @@ def test_knowledge_base_page_renders_for_user(client, django_user_model):
|
||||
content = response.content.decode("utf-8")
|
||||
tabbar = content[content.index('<div class="tabbar"') : content.index("</div>", content.index('<div class="tabbar"'))]
|
||||
assert tabbar.index("审核智能体") < tabbar.index("知识库管理") < tabbar.index("附件管理")
|
||||
assert "data-rebuild-url=" in content
|
||||
assert 'id="knowledgeRebuildIndexButton"' in content
|
||||
assert "重建索引" in content
|
||||
assert 'data-source-action="index"' in content
|
||||
assert "手动入库" in content
|
||||
|
||||
|
||||
def test_knowledge_base_status_api(client, django_user_model):
|
||||
@@ -48,6 +55,53 @@ def test_knowledge_base_status_api(client, django_user_model):
|
||||
assert response.json()["rule"]["code"] == "nmpa_ivd_registration_v1"
|
||||
|
||||
|
||||
def test_knowledge_base_rebuild_index_api(client, django_user_model, monkeypatch):
    """POST to the rebuild endpoint calls the rebuild helper once and returns its payload."""
    owner = django_user_model.objects.create_user(username="owner", password="pass")
    client.force_login(owner)
    calls = []

    def fake_rebuild():
        # Record the invocation and hand back a canned rebuild result.
        calls.append("rebuild")
        return {"chunk_count": 12}

    monkeypatch.setattr("review_agent.views.rebuild_knowledge_base_index", fake_rebuild)

    response = client.post(reverse("knowledge_base_rebuild_index"))

    assert response.status_code == 200
    body = response.json()
    assert body["chunk_count"] == 12
    assert body["knowledge_base"]["collection"]["count"] >= 0
    assert calls == ["rebuild"]
|
||||
|
||||
|
||||
def test_rebuild_knowledge_base_index_requests_reset(settings, tmp_path, monkeypatch):
    """rebuild_knowledge_base_index() forwards the embedding provider and reset=True to build_chroma_index."""
    chroma_dir = tmp_path / "chroma"
    settings.MEDIA_ROOT = tmp_path
    settings.REGULATORY_RAG_CHROMA_PATH = chroma_dir
    chroma_dir.mkdir()
    # Seed the index directory with a leftover file before the rebuild runs.
    stale_file = chroma_dir / "chroma.sqlite3"
    stale_file.write_text("stale", encoding="utf-8")
    calls = []

    def fake_load_rule_file():
        return {"source_material_dir": "docs/0.原始材料"}

    def fake_get_embedding_provider():
        return "provider"

    def fake_build_chroma_index(source_dir, embedding_provider, reset=False):
        # Capture the arguments the view passes, then report a fixed chunk count.
        calls.append(
            {
                "source_dir": source_dir,
                "embedding_provider": embedding_provider,
                "reset": reset,
            }
        )
        return 8

    monkeypatch.setattr("review_agent.views.load_rule_file", fake_load_rule_file)
    monkeypatch.setattr("review_agent.views.get_embedding_provider", fake_get_embedding_provider)
    monkeypatch.setattr("review_agent.views.build_chroma_index", fake_build_chroma_index)

    payload = rebuild_knowledge_base_index()

    assert payload["chunk_count"] == 8
    first_call = calls[0]
    assert first_call["embedding_provider"] == "provider"
    assert first_call["reset"] is True
|
||||
|
||||
|
||||
def test_knowledge_base_search_rejects_blank_query():
|
||||
payload = search_knowledge_base("")
|
||||
|
||||
@@ -103,6 +157,8 @@ def test_knowledge_base_search_api_returns_payload(client, django_user_model):
|
||||
|
||||
def test_knowledge_base_document_crud_api(client, settings, tmp_path, django_user_model):
|
||||
settings.MEDIA_ROOT = tmp_path
|
||||
settings.REGULATORY_RAG_CHROMA_PATH = tmp_path / "chroma"
|
||||
settings.REGULATORY_RAG_PROVIDER = "deterministic"
|
||||
user = django_user_model.objects.create_user(username="owner", password="pass")
|
||||
client.force_login(user)
|
||||
|
||||
@@ -199,6 +255,8 @@ def test_knowledge_base_document_api_is_scoped_to_owner(client, django_user_mode
|
||||
|
||||
def test_knowledge_base_document_manual_index_api(client, settings, tmp_path, django_user_model):
|
||||
settings.MEDIA_ROOT = tmp_path
|
||||
settings.REGULATORY_RAG_CHROMA_PATH = tmp_path / "chroma"
|
||||
settings.REGULATORY_RAG_PROVIDER = "deterministic"
|
||||
user = django_user_model.objects.create_user(username="owner", password="pass")
|
||||
client.force_login(user)
|
||||
source_path = tmp_path / "manual.md"
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
|
||||
from review_agent.regulatory_review.services.rag_citation import (
|
||||
@@ -7,6 +9,7 @@ from review_agent.regulatory_review.services.rag_citation import (
|
||||
from review_agent.regulatory_review.services.rag_embedding import SiliconFlowEmbeddingProvider
|
||||
from review_agent.regulatory_review.services.rag_index import chunk_text
|
||||
from review_agent.regulatory_review.services.rag_index import collect_source_chunks
|
||||
from review_agent.regulatory_review.services.rag_index import build_chroma_index
|
||||
|
||||
|
||||
def test_siliconflow_embedding_provider_posts_expected_payload(monkeypatch):
|
||||
@@ -86,3 +89,141 @@ def test_collect_source_chunks_requires_attachment4_extraction(monkeypatch, tmp_
|
||||
|
||||
with pytest.raises(RuntimeError, match="附件 4"):
|
||||
collect_source_chunks(source_dir)
|
||||
|
||||
|
||||
def test_collect_source_chunks_excludes_demo_agent_materials(monkeypatch, tmp_path):
    """Chunks are produced, and none has a source containing "模拟题二"."""
    source_dir = tmp_path / "sources"
    source_dir.mkdir()

    # Demo materials: one nested directory with a markdown file, plus a sibling .docx.
    demo_dir = source_dir / "【模拟题二】试剂盒临床注册文件准备与审核Agent"
    demo_dir.mkdir()
    demo_markdown = demo_dir / "【模拟题二】试剂盒临床注册文件准备与审核Agent.md"
    demo_markdown.write_text("题目材料", encoding="utf-8")
    demo_docx = source_dir / "【模拟题二】试剂盒临床注册文件准备与审核Agent.docx"
    demo_docx.write_bytes(b"demo")

    # The one file the test expects to be indexed.
    real_source = source_dir / "附件 4 体外诊断试剂注册申报资料要求及说明.doc"
    real_source.write_bytes(b"rule")

    def fake_extract(path):
        # Any path other than the real source returns a marker text.
        if path == real_source:
            return "附件4 正文"
        return "不应被抽取"

    monkeypatch.setattr(
        "review_agent.regulatory_review.services.rag_index.extract_text_from_path",
        fake_extract,
    )

    chunks = collect_source_chunks(source_dir)

    assert chunks
    assert not any("模拟题二" in chunk.metadata["source"] for chunk in chunks)
|
||||
|
||||
|
||||
def test_build_chroma_index_reset_recreates_collection_without_deleting_index_dir(
    settings, monkeypatch, tmp_path
):
    """With reset=True and a working client, the collection is deleted and the index directory's file survives."""
    settings.MEDIA_ROOT = tmp_path
    persist_path = tmp_path / "chroma"
    persist_path.mkdir()
    stale_file = persist_path / "chroma.sqlite3"
    stale_file.write_text("stale", encoding="utf-8")
    source_dir = tmp_path / "sources"
    source_dir.mkdir()
    (source_dir / "rule.md").write_text("注册检验报告要求", encoding="utf-8")
    client_states = []
    deleted_collections = []

    def record_state(label):
        # Snapshot whether the pre-existing index file is still on disk at this step.
        client_states.append({"path": label, "stale_exists": stale_file.exists()})

    class FakeCollection:
        def upsert(self, **kwargs):
            return None

    class FakeClient:
        def __init__(self, path):
            record_state(path)

        def delete_collection(self, name):
            deleted_collections.append(name)

        def get_or_create_collection(self, name):
            return FakeCollection()

    class FakeSharedSystemClient:
        @staticmethod
        def clear_system_cache():
            record_state("cache-cleared")

    # Stand-ins for the chromadb modules, installed via sys.modules.
    fake_chromadb = type("FakeChromaModule", (), {"PersistentClient": FakeClient})
    fake_shared_module = type(
        "FakeSharedSystemClientModule",
        (),
        {"SharedSystemClient": FakeSharedSystemClient},
    )
    monkeypatch.setitem(sys.modules, "chromadb", fake_chromadb)
    monkeypatch.setitem(sys.modules, "chromadb.api.shared_system_client", fake_shared_module)

    count = build_chroma_index(
        source_dir=source_dir,
        embedding_provider=lambda texts: [[0.1, 0.2] for _ in texts],
        persist_path=persist_path,
        collection_name="test",
        reset=True,
    )

    assert count == 1
    # Expected order: client created, cache cleared, client created again; file present throughout.
    index_path = str(persist_path)
    expected_states = [
        {"path": index_path, "stale_exists": True},
        {"path": "cache-cleared", "stale_exists": True},
        {"path": index_path, "stale_exists": True},
    ]
    assert client_states == expected_states
    assert stale_file.exists()
    assert deleted_collections == ["test"]
|
||||
|
||||
|
||||
def test_build_chroma_index_reset_clears_bad_index_dir_after_chroma_cache_reset(
    settings, monkeypatch, tmp_path
):
    """If the first client construction raises ValueError, the cache is cleared and the stale file is gone by the retry."""
    settings.MEDIA_ROOT = tmp_path
    persist_path = tmp_path / "chroma"
    persist_path.mkdir()
    stale_file = persist_path / "chroma.sqlite3"
    stale_file.write_text("stale", encoding="utf-8")
    source_dir = tmp_path / "sources"
    source_dir.mkdir()
    (source_dir / "rule.md").write_text("注册检验报告要求", encoding="utf-8")
    events = []

    class FakeCollection:
        def upsert(self, **kwargs):
            return None

    class BrokenThenFreshClient:
        # Counts constructions across instances so only the first one fails.
        attempts = 0

        def __init__(self, path):
            BrokenThenFreshClient.attempts += 1
            attempt = BrokenThenFreshClient.attempts
            events.append(("client", attempt, stale_file.exists()))
            if attempt == 1:
                raise ValueError("Could not connect to tenant default_tenant")

        def get_or_create_collection(self, name):
            return FakeCollection()

    class FakeSharedSystemClient:
        @staticmethod
        def clear_system_cache():
            events.append(("clear_cache", stale_file.exists()))

    # Stand-ins for the chromadb modules, installed via sys.modules.
    fake_chromadb = type(
        "FakeChromaModule",
        (),
        {"PersistentClient": BrokenThenFreshClient},
    )
    fake_shared_module = type(
        "FakeSharedSystemClientModule",
        (),
        {"SharedSystemClient": FakeSharedSystemClient},
    )
    monkeypatch.setitem(sys.modules, "chromadb", fake_chromadb)
    monkeypatch.setitem(sys.modules, "chromadb.api.shared_system_client", fake_shared_module)

    count = build_chroma_index(
        source_dir=source_dir,
        embedding_provider=lambda texts: [[0.1, 0.2] for _ in texts],
        persist_path=persist_path,
        collection_name="test",
        reset=True,
    )

    assert count == 1
    # Failed first attempt, cache clear while the file still exists, then a retry with the file gone.
    expected_events = [
        ("client", 1, True),
        ("clear_cache", True),
        ("client", 2, False),
    ]
    assert events == expected_events
    assert not stale_file.exists()
|
||||
|
||||
Reference in New Issue
Block a user