fix(kb): 完善知识库入库和重建索引

This commit is contained in:
2026-06-08 23:45:34 +08:00
parent d8cd95e590
commit 2b5093040d
9 changed files with 355 additions and 9 deletions

View File

@@ -10,8 +10,8 @@ from django.core.files.uploadedfile import UploadedFile
from review_agent.models import KnowledgeBaseDocument
from review_agent.regulatory_review.services.rag_citation import RagIndexUnavailable, retrieve_citations
-from review_agent.regulatory_review.services.rag_embedding import DeterministicEmbeddingProvider
-from review_agent.regulatory_review.services.rag_index import chunk_text, extract_text_from_path
+from review_agent.regulatory_review.services.rag_embedding import get_embedding_provider
+from review_agent.regulatory_review.services.rag_index import chunk_text, extract_text_from_path, is_excluded_source_path
from review_agent.regulatory_review.services.rule_loader import DEFAULT_RULE_PATH, compute_file_sha256, load_rule_file
@@ -78,6 +78,8 @@ def list_source_documents(source_dir: Path) -> list[dict[str, Any]]:
continue
suffix = path.suffix.lower()
relative_path = str(path.relative_to(source_dir))
+if is_excluded_source_path(relative_path):
+continue
indexed_chunk_count = source_chunk_counts.get(relative_path, 0)
documents.append(
{
@@ -101,7 +103,7 @@ def search_knowledge_base(query: str, *, n_results: int = 3) -> dict[str, Any]:
try:
results = retrieve_citations(
normalized,
-embedding_provider=DeterministicEmbeddingProvider(),
+embedding_provider=get_embedding_provider(),
n_results=n_results,
)
except RagIndexUnavailable as exc:
@@ -210,7 +212,7 @@ def index_managed_document(document: KnowledgeBaseDocument) -> int:
return 0
collection = _load_chroma_collection()
texts = [chunk.text for chunk in chunks]
-embeddings = DeterministicEmbeddingProvider()(texts)
+embeddings = get_embedding_provider()(texts)
ids = [
hashlib.sha256(f"managed:{document.pk}:{chunk.metadata['chunk_index']}".encode("utf-8")).hexdigest()
for chunk in chunks