from __future__ import annotations

import hashlib
import logging
import shutil
import subprocess
import tempfile
from dataclasses import dataclass
from pathlib import Path

from django.conf import settings
from docx import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import Table
from docx.text.paragraph import Paragraph
from openpyxl import load_workbook
from pypdf import PdfReader
from pptx import Presentation

from .rag_embedding import EmbeddingFunction

# Builds a Chroma vector index from a directory of regulatory source documents:
# text is extracted per file type, split into overlapping chunks, embedded and
# upserted into a persistent collection.

logger = logging.getLogger("review_agent.regulatory_review.rag_index")

# A file is skipped when its path (relative to the source directory) contains
# any of these substrings.
EXCLUDED_SOURCE_KEYWORDS = ("模拟题二", "试剂盒临床注册文件准备与审核Agent")


@dataclass(frozen=True)
class TextChunk:
    """A slice of extracted text plus the metadata stored alongside it."""

    # The chunk content that gets embedded and stored as the document.
    text: str
    # Written by chunk_text() with the keys "source" and "chunk_index".
    metadata: dict[str, object]


def chunk_text(text: str, *, source: str, chunk_size: int = 900, overlap: int = 120) -> list[TextChunk]:
    """Split *text* into overlapping character windows.

    Lines are stripped and blank lines dropped before windowing. Windows are
    ``chunk_size`` characters long and start every ``chunk_size - overlap``
    characters (at least 1). Windows that are empty after stripping are
    skipped and do not consume a chunk index.

    Args:
        text: Raw extracted text.
        source: Identifier stored under ``metadata["source"]``.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared by consecutive windows.

    Returns:
        The chunks in document order; empty if *text* has no non-blank lines.
    """
    normalized = "\n".join(line.strip() for line in text.splitlines() if line.strip())
    if not normalized:
        return []

    step = max(1, chunk_size - overlap)
    chunks: list[TextChunk] = []
    # NOTE(review): when the text ends within `overlap` characters of the last
    # window start, that final chunk is fully contained in the previous one.
    # Left as-is because trimming it would change chunk indexes and hence the
    # IDs already stored in existing indexes.
    for start in range(0, len(normalized), step):
        part = normalized[start : start + chunk_size].strip()
        if part:
            # len(chunks) equals the running index: it only grows on append.
            chunks.append(TextChunk(text=part, metadata={"source": source, "chunk_index": len(chunks)}))
    return chunks


def extract_text_from_path(path: Path) -> str:
    """Return the plain text of *path*, dispatching on its file suffix.

    Supported suffixes (case-insensitive): ``.txt``, ``.md``, ``.pdf``,
    ``.docx``, ``.pptx``, ``.xlsx`` and ``.doc``. Any other suffix yields an
    empty string.

    Raises:
        RuntimeError: If a legacy ``.doc`` file cannot be converted by any of
            the available converters. Errors raised by the underlying parsing
            libraries are not caught here.
    """
    suffix = path.suffix.lower()
    if suffix in {".txt", ".md"}:
        return path.read_text(encoding="utf-8", errors="ignore")
    if suffix == ".pdf":
        # extract_text() may return None/empty for a page; treat that as "".
        return "\n".join(page.extract_text() or "" for page in PdfReader(str(path)).pages)
    if suffix == ".docx":
        return _extract_docx_text(path)
    if suffix == ".pptx":
        return _extract_pptx_text(path)
    if suffix == ".xlsx":
        return _extract_xlsx_text(path)
    if suffix == ".doc":
        return _extract_legacy_doc_with_libreoffice(path)
    return ""


def _extract_pptx_text(path: Path) -> str:
    """Join the text of every shape exposing a ``text`` attribute, slide by slide."""
    presentation = Presentation(str(path))
    lines = []
    for slide in presentation.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                lines.append(shape.text)
    return "\n".join(lines)


def _extract_xlsx_text(path: Path) -> str:
    """Return one tab-separated line per non-empty row across all worksheets."""
    workbook = load_workbook(path, data_only=True, read_only=True)
    try:
        lines = []
        for sheet in workbook.worksheets:
            for row in sheet.iter_rows(values_only=True):
                values = [str(cell) for cell in row if cell not in {None, ""}]
                if values:
                    lines.append("\t".join(values))
    finally:
        # Read-only workbooks keep the source file open until closed explicitly.
        workbook.close()
    return "\n".join(lines)


def _extract_docx_text(path: Path) -> str:
    """Extract paragraphs and tables from a .docx file in body order.

    Each non-empty paragraph becomes one line; each table row becomes one
    tab-separated line built from its non-empty cells.
    """
    document = Document(str(path))
    lines: list[str] = []
    for block in _iter_docx_blocks(document):
        if isinstance(block, Paragraph):
            text = block.text.strip()
            if text:
                lines.append(text)
        elif isinstance(block, Table):
            for row in block.rows:
                values = [cell.text.strip() for cell in row.cells if cell.text.strip()]
                if values:
                    lines.append("\t".join(values))
    return "\n".join(lines)


def _iter_docx_blocks(document):
    """Yield top-level paragraphs and tables of *document* in their original order.

    Only direct children of the body element are visited; other element types
    are ignored.
    """
    body = document.element.body
    for child in body.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, document)
        elif isinstance(child, CT_Tbl):
            yield Table(child, document)


def _extract_legacy_doc_with_libreoffice(path: Path) -> str:
    """Extract text from a legacy .doc file via conversion to .docx.

    A pre-converted file at the cache location is used when present. Otherwise
    the converters are tried in order: LibreOffice, Word COM (pywin32), then
    Word COM through PowerShell.

    NOTE(review): nothing in this module writes to the cache location, so it
    is presumably populated elsewhere - confirm.

    Raises:
        RuntimeError: If all three converters fail; the message carries each
            converter's error.
    """
    cached = _cached_docx_path(path)
    if cached.exists():
        return extract_text_from_path(cached)
    try:
        return _extract_legacy_doc_with_libreoffice_convert(path)
    except RuntimeError as libreoffice_error:
        try:
            return _extract_legacy_doc_with_word_com(path)
        except RuntimeError as word_error:
            try:
                return _extract_legacy_doc_with_powershell_word_com(path)
            except RuntimeError as powershell_error:
                raise RuntimeError(
                    f"无法转换法规 .doc 材料:{path.name};"
                    f"LibreOffice 错误:{libreoffice_error};"
                    f"Word COM 错误:{word_error};"
                    f"PowerShell Word COM 错误:{powershell_error}"
                ) from powershell_error


def _cached_docx_path(path: Path) -> Path:
    """Return the cache location of the .docx conversion of *path*.

    The file name is the original stem plus the first 12 hex digits of the
    SHA-256 of the resolved path, under
    ``MEDIA_ROOT/regulatory_review/docx_cache``.
    """
    digest = hashlib.sha256(str(path.resolve()).encode("utf-8")).hexdigest()[:12]
    cache_dir = Path(settings.MEDIA_ROOT) / "regulatory_review" / "docx_cache"
    return cache_dir / f"{path.stem}-{digest}.docx"


def _extract_legacy_doc_with_libreoffice_convert(path: Path) -> str:
    """Convert *path* to .docx with headless ``soffice`` and extract its text.

    Raises:
        RuntimeError: If ``soffice`` is missing, exits non-zero, exceeds the
            60-second timeout, or produces no output file.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        target_dir = Path(tmp_dir)
        command = [
            "soffice",
            "--headless",
            "--convert-to",
            "docx",
            "--outdir",
            str(target_dir),
            str(path),
        ]
        try:
            subprocess.run(command, check=True, capture_output=True, text=True, timeout=60)
        except (FileNotFoundError, subprocess.CalledProcessError, subprocess.TimeoutExpired) as exc:
            raise RuntimeError(f"无法通过 LibreOffice 转换法规 .doc 材料:{path.name}") from exc
        converted = target_dir / f"{path.stem}.docx"
        if not converted.exists():
            raise RuntimeError(f"LibreOffice 未生成 docx:{path.name}")
        # Must read inside the `with`: the temp directory is removed on exit.
        return extract_text_from_path(converted)


def _extract_legacy_doc_with_word_com(path: Path) -> str:
    """Convert *path* to .docx through Word COM automation (pywin32) and extract its text.

    Raises:
        RuntimeError: If pywin32 is unavailable, any COM call fails, or no
            output file is produced.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        target_dir = Path(tmp_dir)
        converted = target_dir / f"{path.stem}.docx"
        word = None
        com_initialized = False
        try:
            import pythoncom
            import win32com.client

            pythoncom.CoInitialize()
            com_initialized = True
            word = win32com.client.DispatchEx("Word.Application")
            word.Visible = False
            document = word.Documents.Open(str(path.resolve()), ReadOnly=True)
            # 16 is Word's wdFormatDocumentDefault, i.e. .docx.
            document.SaveAs(str(converted.resolve()), FileFormat=16)
            document.Close(False)
        except Exception as exc:
            raise RuntimeError(f"无法通过 Word COM 转换法规 .doc 材料:{path.name}") from exc
        finally:
            # Cleanup is best-effort: a failure here must not mask the
            # conversion result or the conversion error.
            if word is not None:
                try:
                    word.Quit()
                except Exception:
                    logger.debug("Word COM Quit() failed during cleanup", exc_info=True)
            # Only balance a CoInitialize() that actually succeeded; otherwise
            # `pythoncom` may not even be bound.
            if com_initialized:
                try:
                    pythoncom.CoUninitialize()
                except Exception:
                    logger.debug("CoUninitialize() failed during cleanup", exc_info=True)
        if not converted.exists():
            raise RuntimeError(f"Word COM 未生成 docx:{path.name}")
        return extract_text_from_path(converted)


def _extract_legacy_doc_with_powershell_word_com(path: Path) -> str:
    """Convert *path* to .docx by driving Word COM from PowerShell, then extract its text.

    Raises:
        RuntimeError: If neither ``powershell`` nor ``pwsh`` is on PATH, the
            script exits non-zero or exceeds the 90-second timeout, or no
            output file is produced.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        target_dir = Path(tmp_dir)
        converted = target_dir / f"{path.stem}.docx"
        # Paths are embedded in single-quoted PowerShell strings, where a
        # literal quote is written as two quotes.
        source_path = str(path.resolve()).replace("'", "''")
        target_path = str(converted.resolve()).replace("'", "''")
        script = (
            "$ErrorActionPreference = 'Stop';"
            "$word = New-Object -ComObject Word.Application;"
            "$word.Visible = $false;"
            "try {"
            f"$doc = $word.Documents.Open('{source_path}', $false, $true);"
            f"$doc.SaveAs([ref]'{target_path}', [ref]16);"
            "$doc.Close([ref]$false);"
            "} finally { $word.Quit() }"
        )
        powershell = shutil.which("powershell") or shutil.which("pwsh")
        if not powershell:
            raise RuntimeError("PowerShell 不可用,无法调用 Word COM。")
        try:
            subprocess.run(
                [powershell, "-NoProfile", "-ExecutionPolicy", "Bypass", "-Command", script],
                check=True,
                capture_output=True,
                text=True,
                timeout=90,
            )
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as exc:
            raise RuntimeError(f"无法通过 PowerShell Word COM 转换法规 .doc 材料:{path.name}") from exc
        if not converted.exists():
            raise RuntimeError(f"PowerShell Word COM 未生成 docx:{path.name}")
        return extract_text_from_path(converted)


def collect_source_chunks(source_dir: Path) -> list[TextChunk]:
    """Extract and chunk every eligible file under *source_dir*.

    Files are visited recursively in sorted path order. Paths matching
    ``EXCLUDED_SOURCE_KEYWORDS`` are skipped. A ``RuntimeError`` during
    extraction is logged and the file skipped, unless the file is the
    "Attachment 4" core document, in which case the failure is fatal.

    Raises:
        RuntimeError: If extraction of the Attachment 4 document fails.
    """
    chunks: list[TextChunk] = []
    for path in sorted(source_dir.rglob("*")):
        if not path.is_file():
            continue
        relative = path.relative_to(source_dir)
        if is_excluded_source_path(relative):
            continue
        try:
            text = extract_text_from_path(path)
        except RuntimeError as exc:
            if _is_attachment4(path):
                raise RuntimeError(f"附件 4 核心法规材料抽取失败:{path.name}") from exc
            logger.warning("Regulatory source extraction skipped", extra={"path": str(path), "error": str(exc)})
            continue
        chunks.extend(chunk_text(text, source=str(relative)))
    return chunks


def is_excluded_source_path(path: Path | str) -> bool:
    """Return True if *path* contains any of ``EXCLUDED_SOURCE_KEYWORDS``."""
    normalized = str(path)
    return any(keyword in normalized for keyword in EXCLUDED_SOURCE_KEYWORDS)


def _is_attachment4(path: Path) -> bool:
    """Return True if the file name (spaces removed) carries both Attachment 4 markers."""
    normalized = path.name.replace(" ", "")
    return "附件4" in normalized and "体外诊断试剂注册申报资料要求及说明" in normalized


def build_chroma_index(
    *,
    source_dir: Path,
    embedding_provider: EmbeddingFunction,
    persist_path: Path | None = None,
    collection_name: str | None = None,
    reset: bool = False,
) -> int:
    """Embed all chunks under *source_dir* and upsert them into a Chroma collection.

    Args:
        source_dir: Root directory of the source documents.
        embedding_provider: Callable invoked once with the list of chunk texts;
            its result is passed to Chroma as the embeddings.
        persist_path: Index directory; defaults to
            ``settings.REGULATORY_RAG_CHROMA_PATH``.
        collection_name: Collection to write; defaults to
            ``settings.REGULATORY_RAG_COLLECTION``.
        reset: If True, the existing collection is deleted first (best-effort),
            and an index directory that cannot be opened is wiped and recreated.

    Returns:
        The number of chunks upserted (0 if no text was found).

    Raises:
        RuntimeError: If ``chromadb`` is not installed, or for the failures
            documented on ``collect_source_chunks`` and
            ``clear_chroma_index_dir``.
    """
    try:
        import chromadb
    except ImportError as exc:
        raise RuntimeError("chromadb 未安装,请先安装 requirements.txt。") from exc

    persist_path = persist_path or Path(settings.REGULATORY_RAG_CHROMA_PATH)
    collection_name = collection_name or settings.REGULATORY_RAG_COLLECTION
    persist_path.mkdir(parents=True, exist_ok=True)
    chunks = collect_source_chunks(source_dir)

    try:
        client = chromadb.PersistentClient(path=str(persist_path))
    except Exception:
        # An unopenable index is only discarded when the caller asked for a reset.
        if not reset:
            raise
        clear_chroma_system_cache()
        clear_chroma_index_dir(persist_path)
        persist_path.mkdir(parents=True, exist_ok=True)
        client = chromadb.PersistentClient(path=str(persist_path))

    if reset:
        try:
            client.delete_collection(collection_name)
            clear_chroma_system_cache()
            client = chromadb.PersistentClient(path=str(persist_path))
        except Exception:
            # Best-effort: typically the collection simply does not exist yet.
            logger.debug("Chroma collection reset skipped", exc_info=True)

    collection = client.get_or_create_collection(collection_name)
    if not chunks:
        return 0

    texts = [chunk.text for chunk in chunks]
    embeddings = embedding_provider(texts)
    # IDs are derived from source and chunk index so re-indexing overwrites in place.
    ids = [
        hashlib.sha256(f"{chunk.metadata['source']}:{chunk.metadata['chunk_index']}".encode("utf-8")).hexdigest()
        for chunk in chunks
    ]
    collection.upsert(
        ids=ids,
        documents=texts,
        metadatas=[chunk.metadata for chunk in chunks],
        embeddings=embeddings,
    )
    return len(chunks)


def clear_chroma_index_dir(persist_path: Path | str | None = None) -> None:
    """Delete the Chroma index directory, refusing paths outside ``MEDIA_ROOT``.

    Args:
        persist_path: Directory to remove; defaults to
            ``settings.REGULATORY_RAG_CHROMA_PATH``.

    Raises:
        RuntimeError: If the resolved directory is not inside the resolved
            ``settings.MEDIA_ROOT``.
    """
    chroma_path = Path(persist_path or settings.REGULATORY_RAG_CHROMA_PATH).resolve()
    media_root = Path(settings.MEDIA_ROOT).resolve()
    try:
        # Safety guard before rmtree: relative_to() raises ValueError when
        # chroma_path is not located under media_root.
        chroma_path.relative_to(media_root)
    except ValueError as exc:
        raise RuntimeError("法规 RAG 索引目录必须位于 MEDIA_ROOT 内。") from exc
    if chroma_path.exists():
        shutil.rmtree(chroma_path)


def clear_chroma_system_cache() -> None:
    """Clear Chroma's shared system-client cache; a no-op if it cannot be imported."""
    try:
        from chromadb.api.shared_system_client import SharedSystemClient
    except Exception:
        return
    SharedSystemClient.clear_system_cache()