feat(knowledge-base): 增加全局知识库管理

This commit is contained in:
2026-06-08 21:37:32 +08:00
parent e6fa738fd5
commit 5ecf78c5d6
12 changed files with 1425 additions and 2 deletions

View File

@@ -37,6 +37,7 @@ def retrieve_citations(
"source": metadata.get("source", "法规材料"),
"text": document,
"score": distance,
"metadata": metadata,
}
)
return citations

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
import hashlib
import logging
import shutil
import subprocess
import tempfile
from dataclasses import dataclass
@@ -102,6 +103,33 @@ def _iter_docx_blocks(document):
def _extract_legacy_doc_with_libreoffice(path: Path) -> str:
    """Extract text from a legacy ``.doc`` file, trying several converters.

    A previously converted copy at ``_cached_docx_path(path)`` is used when it
    exists. Otherwise the converters are tried in order: LibreOffice, in-process
    Word COM, then Word COM driven through PowerShell. The first one that
    succeeds wins.

    Args:
        path: Location of the ``.doc`` file.

    Returns:
        The text extracted from the converted ``.docx``.

    Raises:
        RuntimeError: If every converter fails. The message lists each
            backend's error and the exception is chained to the last failure.
    """
    cached = _cached_docx_path(path)
    if cached.exists():
        return extract_text_from_path(cached)

    # Ordered fallback chain; the labels are reused verbatim in the error text.
    converters = (
        ("LibreOffice", _extract_legacy_doc_with_libreoffice_convert),
        ("Word COM", _extract_legacy_doc_with_word_com),
        ("PowerShell Word COM", _extract_legacy_doc_with_powershell_word_com),
    )
    failures: list[str] = []
    last_error: RuntimeError | None = None
    for label, convert in converters:
        try:
            return convert(path)
        except RuntimeError as exc:
            failures.append(f"{label} 错误:{exc}")
            last_error = exc

    # Separate the parts explicitly: the original implicit f-string
    # concatenation glued the file name and all three errors together.
    details = "; ".join(failures)
    raise RuntimeError(
        f"无法转换法规 .doc 材料:{path.name}; {details}"
    ) from last_error
def _cached_docx_path(path: Path) -> Path:
    """Return where the cached ``.docx`` conversion of ``path`` is expected.

    The file name is the source stem followed by the first 12 hex characters
    of the SHA-256 of the resolved source path. The file lives under
    ``<MEDIA_ROOT>/regulatory_review/docx_cache``. This function only computes
    the path; it does not create the directory or the file.
    """
    resolved_source = str(path.resolve())
    fingerprint = hashlib.sha256(resolved_source.encode("utf-8")).hexdigest()[:12]
    return Path(
        settings.MEDIA_ROOT,
        "regulatory_review",
        "docx_cache",
        f"{path.stem}-{fingerprint}.docx",
    )
def _extract_legacy_doc_with_libreoffice_convert(path: Path) -> str:
with tempfile.TemporaryDirectory() as tmp_dir:
target_dir = Path(tmp_dir)
try:
@@ -128,6 +156,72 @@ def _extract_legacy_doc_with_libreoffice(path: Path) -> str:
return extract_text_from_path(converted)
def _extract_legacy_doc_with_word_com(path: Path) -> str:
with tempfile.TemporaryDirectory() as tmp_dir:
target_dir = Path(tmp_dir)
converted = target_dir / f"{path.stem}.docx"
word = None
try:
import pythoncom
import win32com.client
pythoncom.CoInitialize()
word = win32com.client.DispatchEx("Word.Application")
word.Visible = False
document = word.Documents.Open(str(path.resolve()), ReadOnly=True)
document.SaveAs(str(converted.resolve()), FileFormat=16)
document.Close(False)
except Exception as exc:
raise RuntimeError(f"无法通过 Word COM 转换法规 .doc 材料:{path.name}") from exc
finally:
if word is not None:
try:
word.Quit()
except Exception:
pass
try:
pythoncom.CoUninitialize()
except Exception:
pass
if not converted.exists():
raise RuntimeError(f"Word COM 未生成 docx{path.name}")
return extract_text_from_path(converted)
def _extract_legacy_doc_with_powershell_word_com(path: Path) -> str:
with tempfile.TemporaryDirectory() as tmp_dir:
target_dir = Path(tmp_dir)
converted = target_dir / f"{path.stem}.docx"
source_path = str(path.resolve()).replace("'", "''")
target_path = str(converted.resolve()).replace("'", "''")
script = (
"$ErrorActionPreference = 'Stop';"
"$word = New-Object -ComObject Word.Application;"
"$word.Visible = $false;"
"try {"
f"$doc = $word.Documents.Open('{source_path}', $false, $true);"
f"$doc.SaveAs([ref]'{target_path}', [ref]16);"
"$doc.Close([ref]$false);"
"} finally { $word.Quit() }"
)
powershell = shutil.which("powershell") or shutil.which("pwsh")
if not powershell:
raise RuntimeError("PowerShell 不可用,无法调用 Word COM。")
try:
subprocess.run(
[powershell, "-NoProfile", "-ExecutionPolicy", "Bypass", "-Command", script],
check=True,
capture_output=True,
text=True,
timeout=90,
)
except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as exc:
raise RuntimeError(f"无法通过 PowerShell Word COM 转换法规 .doc 材料:{path.name}") from exc
if not converted.exists():
raise RuntimeError(f"PowerShell Word COM 未生成 docx{path.name}")
return extract_text_from_path(converted)
def collect_source_chunks(source_dir: Path) -> list[TextChunk]:
chunks: list[TextChunk] = []
for path in sorted(source_dir.rglob("*")):