feat(knowledge-base): 增加全局知识库管理
This commit is contained in:
@@ -37,6 +37,7 @@ def retrieve_citations(
|
||||
"source": metadata.get("source", "法规材料"),
|
||||
"text": document,
|
||||
"score": distance,
|
||||
"metadata": metadata,
|
||||
}
|
||||
)
|
||||
return citations
|
||||
|
||||
@@ -2,6 +2,7 @@ from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from dataclasses import dataclass
|
||||
@@ -102,6 +103,33 @@ def _iter_docx_blocks(document):
|
||||
|
||||
|
||||
def _extract_legacy_doc_with_libreoffice(path: Path) -> str:
    """Extract the text of a legacy ``.doc`` file, trying each converter in turn.

    A previously converted copy at ``_cached_docx_path(path)`` is used when it
    exists. Otherwise the converters are attempted in this order, each one
    only after the previous one raised ``RuntimeError``:

    1. LibreOffice conversion
    2. Word COM automation
    3. Word COM automation driven through PowerShell

    Args:
        path: Location of the ``.doc`` file to read.

    Returns:
        The text extracted from the cached or freshly converted ``.docx``.

    Raises:
        RuntimeError: If all three converters fail. The message contains the
            message of every individual failure and the exception is chained
            to the PowerShell failure.
    """
    cached_docx = _cached_docx_path(path)
    if cached_docx.exists():
        return extract_text_from_path(cached_docx)

    # The fallbacks stay nested inside the handlers so that every earlier
    # failure is still in scope when the combined message is built.
    try:
        return _extract_legacy_doc_with_libreoffice_convert(path)
    except RuntimeError as office_failure:
        try:
            return _extract_legacy_doc_with_word_com(path)
        except RuntimeError as com_failure:
            try:
                return _extract_legacy_doc_with_powershell_word_com(path)
            except RuntimeError as shell_failure:
                combined_message = (
                    f"无法转换法规 .doc 材料:{path.name};"
                    f"LibreOffice 错误:{office_failure};"
                    f"Word COM 错误:{com_failure};"
                    f"PowerShell Word COM 错误:{shell_failure}"
                )
                raise RuntimeError(combined_message) from shell_failure
|
||||
|
||||
|
||||
def _cached_docx_path(path: Path) -> Path:
    """Return where the converted ``.docx`` for *path* is kept in the cache.

    The file name is the source file's stem followed by the first 12 hex
    characters of the SHA-256 digest of its resolved path, so equally named
    files from different directories map to different cache entries. This
    function only computes the location; it does not create anything on disk.

    Args:
        path: Location of the source document.

    Returns:
        A path below ``MEDIA_ROOT/regulatory_review/docx_cache``.
    """
    resolved_source = str(path.resolve())
    fingerprint = hashlib.sha256(resolved_source.encode("utf-8")).hexdigest()
    file_name = f"{path.stem}-{fingerprint[:12]}.docx"
    return Path(settings.MEDIA_ROOT) / "regulatory_review" / "docx_cache" / file_name
|
||||
|
||||
|
||||
def _extract_legacy_doc_with_libreoffice_convert(path: Path) -> str:
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
target_dir = Path(tmp_dir)
|
||||
try:
|
||||
@@ -128,6 +156,72 @@ def _extract_legacy_doc_with_libreoffice(path: Path) -> str:
|
||||
return extract_text_from_path(converted)
|
||||
|
||||
|
||||
def _extract_legacy_doc_with_word_com(path: Path) -> str:
    """Convert a legacy ``.doc`` to ``.docx`` through Word COM and extract its text.

    The document is opened read-only in a hidden Word instance and saved as
    ``.docx`` into a temporary directory that is removed when this function
    returns. ``pythoncom`` and ``win32com`` are imported lazily so the module
    still imports where they are not installed; in that case the import error
    is reported as a ``RuntimeError`` like any other conversion failure.

    Args:
        path: Location of the ``.doc`` file to convert.

    Returns:
        The text extracted from the converted ``.docx``.

    Raises:
        RuntimeError: If the COM libraries are unavailable, Word fails to
            open or save the document, or no ``.docx`` was produced.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        converted = Path(tmp_dir) / f"{path.stem}.docx"
        word = None
        # Set only after CoInitialize() succeeded, so the cleanup below never
        # issues an unbalanced CoUninitialize() on this thread.
        com_initialized = False
        try:
            import pythoncom
            import win32com.client

            pythoncom.CoInitialize()
            com_initialized = True
            word = win32com.client.DispatchEx("Word.Application")
            word.Visible = False
            document = word.Documents.Open(str(path.resolve()), ReadOnly=True)
            # 16 is Word's wdFormatDocumentDefault, i.e. .docx.
            document.SaveAs(str(converted.resolve()), FileFormat=16)
            document.Close(False)
        except Exception as exc:
            raise RuntimeError(f"无法通过 Word COM 转换法规 .doc 材料:{path.name}") from exc
        finally:
            if word is not None:
                try:
                    word.Quit()
                except Exception:
                    # Best effort: a failing shutdown must not hide the
                    # conversion result or the original error.
                    pass
            if com_initialized:
                try:
                    pythoncom.CoUninitialize()
                except Exception:
                    # Best effort, for the same reason as Quit() above.
                    pass
        if not converted.exists():
            raise RuntimeError(f"Word COM 未生成 docx:{path.name}")
        return extract_text_from_path(converted)
|
||||
|
||||
|
||||
def _extract_legacy_doc_with_powershell_word_com(path: Path) -> str:
    """Convert a legacy ``.doc`` to ``.docx`` by driving Word COM from PowerShell.

    A PowerShell process (``powershell``, else ``pwsh``) opens the document
    read-only in a hidden Word instance and saves it as ``.docx`` into a
    temporary directory that is removed when this function returns.

    The source and target paths are handed to the script through environment
    variables instead of being pasted into the script text. PowerShell accepts
    typographic quote characters as string delimiters too, so escaping only
    the ASCII ``'`` does not make an interpolated file name safe.

    Args:
        path: Location of the ``.doc`` file to convert.

    Returns:
        The text extracted from the converted ``.docx``.

    Raises:
        RuntimeError: If no PowerShell executable is found, the process
            cannot be started, exits with an error or exceeds the 90 second
            timeout, or no ``.docx`` was produced.
    """
    import os

    powershell = shutil.which("powershell") or shutil.which("pwsh")
    if not powershell:
        raise RuntimeError("PowerShell 不可用,无法调用 Word COM。")

    script = (
        "$ErrorActionPreference = 'Stop';"
        "$source = $env:REGULATORY_DOC_SOURCE;"
        "$target = $env:REGULATORY_DOC_TARGET;"
        "$word = New-Object -ComObject Word.Application;"
        "$word.Visible = $false;"
        "try {"
        "$doc = $word.Documents.Open($source, $false, $true);"
        # 16 is Word's wdFormatDocumentDefault, i.e. .docx.
        "$doc.SaveAs([ref]$target, [ref]16);"
        "$doc.Close([ref]$false);"
        "} finally { $word.Quit() }"
    )

    with tempfile.TemporaryDirectory() as tmp_dir:
        converted = Path(tmp_dir) / f"{path.stem}.docx"
        child_env = {
            **os.environ,
            "REGULATORY_DOC_SOURCE": str(path.resolve()),
            "REGULATORY_DOC_TARGET": str(converted.resolve()),
        }
        try:
            subprocess.run(
                [powershell, "-NoProfile", "-ExecutionPolicy", "Bypass", "-Command", script],
                check=True,
                capture_output=True,
                text=True,
                timeout=90,
                env=child_env,
            )
        except (OSError, subprocess.CalledProcessError, subprocess.TimeoutExpired) as exc:
            # OSError covers a PowerShell binary that exists but cannot be
            # launched; callers only expect RuntimeError from this function.
            raise RuntimeError(f"无法通过 PowerShell Word COM 转换法规 .doc 材料:{path.name}") from exc
        if not converted.exists():
            raise RuntimeError(f"PowerShell Word COM 未生成 docx:{path.name}")
        return extract_text_from_path(converted)
|
||||
|
||||
|
||||
def collect_source_chunks(source_dir: Path) -> list[TextChunk]:
|
||||
chunks: list[TextChunk] = []
|
||||
for path in sorted(source_dir.rglob("*")):
|
||||
|
||||
Reference in New Issue
Block a user