feat(knowledge-base): 增加全局知识库管理

2026-06-08 21:37:32 +08:00
parent e6fa738fd5
commit 5ecf78c5d6
12 changed files with 1425 additions and 2 deletions
--- a/review_agent/regulatory_review/services/rag_citation.py
+++ b/review_agent/regulatory_review/services/rag_citation.py
@@ -37,6 +37,7 @@ def retrieve_citations(
                "source": metadata.get("source", "法规材料"),
                "text": document,
                "score": distance,
+                "metadata": metadata,
            }
        )
    return citations
--- a/review_agent/regulatory_review/services/rag_index.py
+++ b/review_agent/regulatory_review/services/rag_index.py
@@ -2,6 +2,7 @@ from __future__ import annotations

 import hashlib
 import logging
+import shutil
 import subprocess
 import tempfile
 from dataclasses import dataclass
@@ -102,6 +103,33 @@ def _iter_docx_blocks(document):


 def _extract_legacy_doc_with_libreoffice(path: Path) -> str:
+    cached = _cached_docx_path(path)  # where a previously converted .docx for this path would live
+    if cached.exists():  # NOTE(review): nothing visible in this diff writes the cache file — confirm where it is populated
+        return extract_text_from_path(cached)
+    try:  # converters are tried in order: LibreOffice, Word COM, PowerShell Word COM
+        return _extract_legacy_doc_with_libreoffice_convert(path)
+    except RuntimeError as libreoffice_error:  # only RuntimeError triggers the next fallback
+        try:
+            return _extract_legacy_doc_with_word_com(path)
+        except RuntimeError as word_error:
+            try:
+                return _extract_legacy_doc_with_powershell_word_com(path)
+            except RuntimeError as powershell_error:
+                raise RuntimeError(  # every converter failed: report all three causes in one message
+                    f"无法转换法规 .doc 材料：{path.name}；"
+                    f"LibreOffice 错误：{libreoffice_error}；"
+                    f"Word COM 错误：{word_error}；"
+                    f"PowerShell Word COM 错误：{powershell_error}"
+                ) from powershell_error
+
+
+def _cached_docx_path(path: Path) -> Path:  # build the .docx cache location for a source document under MEDIA_ROOT
+    digest = hashlib.sha256(str(path.resolve()).encode("utf-8")).hexdigest()[:12]  # first 12 hex chars of SHA-256 of the resolved path string (keyed on path, not file content)
+    cache_dir = Path(settings.MEDIA_ROOT) / "regulatory_review" / "docx_cache"
+    return cache_dir / f"{path.stem}-{digest}.docx"  # the directory is not created here
+
+
+def _extract_legacy_doc_with_libreoffice_convert(path: Path) -> str:
    with tempfile.TemporaryDirectory() as tmp_dir:
        target_dir = Path(tmp_dir)
        try:
@@ -128,6 +156,72 @@ def _extract_legacy_doc_with_libreoffice(path: Path) -> str:
        return extract_text_from_path(converted)


+def _extract_legacy_doc_with_word_com(path: Path) -> str:  # convert via an in-process Word COM instance, then extract text from the .docx
+    with tempfile.TemporaryDirectory() as tmp_dir:  # the converted file is discarded when this block exits
+        target_dir = Path(tmp_dir)
+        converted = target_dir / f"{path.stem}.docx"
+        word = None
+        try:
+            import pythoncom  # imported inside the try, so an ImportError is converted to the RuntimeError below
+            import win32com.client
+
+            pythoncom.CoInitialize()
+            word = win32com.client.DispatchEx("Word.Application")
+            word.Visible = False
+            document = word.Documents.Open(str(path.resolve()), ReadOnly=True)
+            document.SaveAs(str(converted.resolve()), FileFormat=16)  # 16 is presumably Word's .docx save format — TODO confirm
+            document.Close(False)
+        except Exception as exc:  # any failure in the block above is re-raised as RuntimeError
+            raise RuntimeError(f"无法通过 Word COM 转换法规 .doc 材料：{path.name}") from exc
+        finally:
+            if word is not None:
+                try:
+                    word.Quit()
+                except Exception:  # best-effort shutdown; errors from Quit are ignored
+                    pass
+            try:
+                pythoncom.CoUninitialize()  # NOTE(review): also reached when the import or CoInitialize failed; any error here is swallowed
+            except Exception:
+                pass
+        if not converted.exists():
+            raise RuntimeError(f"Word COM 未生成 docx：{path.name}")
+        return extract_text_from_path(converted)
+
+
+def _extract_legacy_doc_with_powershell_word_com(path: Path) -> str:  # convert by driving Word COM from a PowerShell subprocess
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        target_dir = Path(tmp_dir)
+        converted = target_dir / f"{path.stem}.docx"
+        source_path = str(path.resolve()).replace("'", "''")  # double single quotes: the path is embedded in a single-quoted PowerShell string
+        target_path = str(converted.resolve()).replace("'", "''")
+        script = (
+            "$ErrorActionPreference = 'Stop';"
+            "$word = New-Object -ComObject Word.Application;"
+            "$word.Visible = $false;"
+            "try {"
+            f"$doc = $word.Documents.Open('{source_path}', $false, $true);"  # positional flags presumably ConfirmConversions, ReadOnly — TODO confirm
+            f"$doc.SaveAs([ref]'{target_path}', [ref]16);"  # 16 is presumably Word's .docx save format — TODO confirm
+            "$doc.Close([ref]$false);"
+            "} finally { $word.Quit() }"
+        )
+        powershell = shutil.which("powershell") or shutil.which("pwsh")  # prefer `powershell`, fall back to `pwsh`
+        if not powershell:
+            raise RuntimeError("PowerShell 不可用，无法调用 Word COM。")
+        try:
+            subprocess.run(
+                [powershell, "-NoProfile", "-ExecutionPolicy", "Bypass", "-Command", script],
+                check=True,  # non-zero exit raises CalledProcessError
+                capture_output=True,
+                text=True,
+                timeout=90,  # seconds; exceeding it raises TimeoutExpired
+            )
+        except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as exc:  # other errors (e.g. OSError) propagate unchanged
+            raise RuntimeError(f"无法通过 PowerShell Word COM 转换法规 .doc 材料：{path.name}") from exc
+        if not converted.exists():
+            raise RuntimeError(f"PowerShell Word COM 未生成 docx：{path.name}")
+        return extract_text_from_path(converted)
+
+
 def collect_source_chunks(source_dir: Path) -> list[TextChunk]:
    chunks: list[TextChunk] = []
    for path in sorted(source_dir.rglob("*")):