from __future__ import annotations import subprocess from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path from zipfile import ZipFile from xml.etree import ElementTree from review_agent.models import RegulatoryInfoPackageBatch from review_agent.regulatory_info_package.constants import GENERATED_FILE_FAILED from review_agent.regulatory_info_package.schemas import GeneratedFileResult, MergedField, TemplateSpec from review_agent.regulatory_info_package.services.docx_document import write_docx_from_template from review_agent.regulatory_info_package.services.legacy_doc_document import write_legacy_doc_or_fallback from review_agent.regulatory_info_package.services.template_repository import copy_template_to_batch, template_specs from review_agent.regulatory_info_package.storage import ensure_batch_subdir def generate_package_documents( batch: RegulatoryInfoPackageBatch, config: dict, merged_fields: dict[str, MergedField], ) -> list[GeneratedFileResult]: specs = template_specs(config) directory_specs = [spec for spec in specs if spec.code == "ch1_2_directory"] content_specs = [spec for spec in specs if spec.code != "ch1_2_directory"] results: list[GeneratedFileResult] = [] with ThreadPoolExecutor(max_workers=min(4, len(content_specs) or 1)) as executor: futures = [executor.submit(_generate_one, batch, config, spec, merged_fields) for spec in content_specs] results.extend(future.result() for future in as_completed(futures)) page_numbers = _directory_page_numbers(results) for spec in directory_specs: results.append(_generate_one(batch, config, spec, merged_fields, directory_page_numbers=page_numbers)) return results def _generate_one( batch: RegulatoryInfoPackageBatch, config: dict, spec: TemplateSpec, merged_fields: dict[str, MergedField], *, directory_page_numbers: dict[str, str] | None = None, ) -> GeneratedFileResult: try: template_path = copy_template_to_batch(batch, config, spec) generated_dir = ensure_batch_subdir(batch, "generated") output_path = generated_dir / spec.output_name adapter_summary = {} if spec.file_format == "doc": actual_path, status, adapter_summary = write_legacy_doc_or_fallback(template_path, output_path, merged_fields) actual_format = actual_path.suffix.lower().lstrip(".") highlight_count = missing_count = llm_only_count = 0 else: highlight_count, missing_count, llm_only_count = write_docx_from_template( template_path, output_path, merged_fields, template_code=spec.code, directory_page_numbers=directory_page_numbers, ) actual_path = output_path actual_format = "docx" status = "success" return GeneratedFileResult( template_code=spec.code, file_name=actual_path.name, requested_format=spec.file_format, actual_format=actual_format, status=status, path=str(actual_path), highlight_count=highlight_count, missing_count=missing_count, llm_only_count=llm_only_count, ) except Exception as exc: return GeneratedFileResult( template_code=spec.code, file_name=spec.output_name, requested_format=spec.file_format, actual_format=spec.file_format, status=GENERATED_FILE_FAILED, error_message=str(exc), ) def _directory_page_numbers(results: list[GeneratedFileResult]) -> dict[str, str]: page_numbers = {"CH1.2": "1"} for result in results: if result.status not in {"success", "fallback_success"} or not result.path: continue code = _directory_code_from_file_name(result.file_name) if not code: continue page_numbers[code] = str(count_document_pages(result.path)) return page_numbers def _directory_code_from_file_name(file_name: str) -> str: stem = Path(file_name).stem.strip() return stem.split()[0] if stem.startswith("CH") else "" def count_document_pages(path: str | Path) -> int: file_path = Path(path) if not file_path.exists(): return 1 pages = _count_pages_from_docx_properties(file_path) if pages: return pages pages = _count_pages_with_pywin32(file_path) if pages: return pages pages = _count_pages_with_powershell_word(file_path) if pages: return pages return 1 def _count_pages_from_docx_properties(file_path: Path) -> int: if file_path.suffix.lower() != ".docx": return 0 try: with ZipFile(file_path) as archive: root = ElementTree.fromstring(archive.read("docProps/app.xml")) namespace = {"ep": "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"} pages = root.find("ep:Pages", namespace) return max(int((pages.text or "").strip()), 1) if pages is not None else 0 except Exception: return 0 def _count_pages_with_pywin32(file_path: Path) -> int: try: import win32com.client word = win32com.client.DispatchEx("Word.Application") word.Visible = False document = None try: document = word.Documents.Open(str(file_path.resolve()), ReadOnly=True) document.Repaginate() return max(int(document.ComputeStatistics(2)), 1) finally: if document is not None: document.Close(False) word.Quit() except Exception: return 0 def _count_pages_with_powershell_word(file_path: Path) -> int: script = r""" param([string]$Path) $word = $null $doc = $null try { $word = New-Object -ComObject Word.Application $word.Visible = $false $doc = $word.Documents.Open($Path, $false, $true) $doc.Repaginate() [Console]::Out.Write($doc.ComputeStatistics(2)) exit 0 } catch { [Console]::Error.Write($_.Exception.Message) exit 1 } finally { if ($doc -ne $null) { $doc.Close($false) | Out-Null } if ($word -ne $null) { $word.Quit() | Out-Null } } """ try: completed = subprocess.run( ["powershell.exe", "-NoProfile", "-ExecutionPolicy", "Bypass", "-Command", script, str(file_path.resolve())], capture_output=True, check=False, text=True, timeout=8, ) except Exception: return 0 if completed.returncode != 0: return 0 try: return max(int(completed.stdout.strip()), 1) except ValueError: return 0