Files
DEMO-AGENT/review_agent/regulatory_info_package/services/package_generate.py

187 lines
6.6 KiB
Python

from __future__ import annotations
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from zipfile import ZipFile
from xml.etree import ElementTree
from review_agent.models import RegulatoryInfoPackageBatch
from review_agent.regulatory_info_package.constants import GENERATED_FILE_FAILED
from review_agent.regulatory_info_package.schemas import GeneratedFileResult, MergedField, TemplateSpec
from review_agent.regulatory_info_package.services.docx_document import write_docx_from_template
from review_agent.regulatory_info_package.services.legacy_doc_document import write_legacy_doc_or_fallback
from review_agent.regulatory_info_package.services.template_repository import copy_template_to_batch, template_specs
from review_agent.regulatory_info_package.storage import ensure_batch_subdir
def generate_package_documents(
    batch: RegulatoryInfoPackageBatch,
    config: dict,
    merged_fields: dict[str, MergedField],
) -> list[GeneratedFileResult]:
    """Generate every document of the package and return one result per template.

    Templates whose code is ``"ch1_2_directory"`` are generated last, on the
    calling thread, because they receive the page numbers computed from the
    other (content) documents. Content documents are generated concurrently
    on up to four worker threads.

    Args:
        batch: Batch the documents are generated for.
        config: Configuration passed through to ``template_specs`` and to the
            per-template generator.
        merged_fields: Field values handed to the document writers.

    Returns:
        Results for the content templates in the order ``template_specs``
        returned them, followed by the results for the directory templates.
    """
    specs = template_specs(config)
    directory_specs = [spec for spec in specs if spec.code == "ch1_2_directory"]
    content_specs = [spec for spec in specs if spec.code != "ch1_2_directory"]

    results: list[GeneratedFileResult] = []
    if content_specs:
        # as_completed() yields futures in completion order, which varies from
        # run to run; remember each future's submission index so the returned
        # list (and the page-number mapping built from it) is deterministic.
        ordered: list = [None] * len(content_specs)
        with ThreadPoolExecutor(max_workers=min(4, len(content_specs))) as executor:
            positions = {
                executor.submit(_generate_one, batch, config, spec, merged_fields): index
                for index, spec in enumerate(content_specs)
            }
            for future in as_completed(positions):
                ordered[positions[future]] = future.result()
        results.extend(ordered)

    # The directory needs the page counts of the already generated documents.
    page_numbers = _directory_page_numbers(results)
    for spec in directory_specs:
        results.append(
            _generate_one(batch, config, spec, merged_fields, directory_page_numbers=page_numbers)
        )
    return results
def _generate_one(
    batch: RegulatoryInfoPackageBatch,
    config: dict,
    spec: TemplateSpec,
    merged_fields: dict[str, MergedField],
    *,
    directory_page_numbers: dict[str, str] | None = None,
) -> GeneratedFileResult:
    """Render a single template into the batch's ``generated`` directory.

    Any exception raised while copying the template or writing the document
    is converted into a result with status ``GENERATED_FILE_FAILED`` whose
    ``error_message`` is ``str(exc)``; this function does not raise for
    generation failures.

    Args:
        batch: Batch that owns the template copy and the output directory.
        config: Configuration forwarded to ``copy_template_to_batch``.
        spec: Template to render; ``spec.file_format == "doc"`` selects the
            legacy writer, anything else the docx writer.
        merged_fields: Field values handed to the writer.
        directory_page_numbers: Optional mapping forwarded to the docx writer
            only.

    Returns:
        A ``GeneratedFileResult`` describing the written file or the failure.
    """
    try:
        source_template = copy_template_to_batch(batch, config, spec)
        target_path = ensure_batch_subdir(batch, "generated") / spec.output_name

        if spec.file_format != "doc":
            highlighted, missing, llm_only = write_docx_from_template(
                source_template,
                target_path,
                merged_fields,
                template_code=spec.code,
                directory_page_numbers=directory_page_numbers,
            )
            written_path = target_path
            written_format = "docx"
            outcome = "success"
        else:
            # The legacy writer reports where it actually wrote and with which
            # status; its third return value (a summary) is not used here.
            written_path, outcome, _summary = write_legacy_doc_or_fallback(
                source_template, target_path, merged_fields
            )
            written_format = written_path.suffix.lower().lstrip(".")
            # No highlight/missing/LLM-only counts are reported for this branch.
            highlighted = missing = llm_only = 0

        return GeneratedFileResult(
            template_code=spec.code,
            file_name=written_path.name,
            requested_format=spec.file_format,
            actual_format=written_format,
            status=outcome,
            path=str(written_path),
            highlight_count=highlighted,
            missing_count=missing,
            llm_only_count=llm_only,
        )
    except Exception as exc:
        # Boundary handler: one failing template must not abort the package.
        return GeneratedFileResult(
            template_code=spec.code,
            file_name=spec.output_name,
            requested_format=spec.file_format,
            actual_format=spec.file_format,
            status=GENERATED_FILE_FAILED,
            error_message=str(exc),
        )
def _directory_page_numbers(results: list[GeneratedFileResult]) -> dict[str, str]:
page_numbers = {"CH1.2": "1"}
for result in results:
if result.status not in {"success", "fallback_success"} or not result.path:
continue
code = _directory_code_from_file_name(result.file_name)
if not code:
continue
page_numbers[code] = str(count_document_pages(result.path))
return page_numbers
def _directory_code_from_file_name(file_name: str) -> str:
stem = Path(file_name).stem.strip()
return stem.split()[0] if stem.startswith("CH") else ""
def count_document_pages(path: str | Path) -> int:
    """Return the page count of a document, falling back to ``1``.

    Three strategies are tried in order and the first non-zero answer wins:
    the docx extended properties, Word automation via pywin32, and Word
    automation via PowerShell. ``1`` is returned when the file does not exist
    or when every strategy reports ``0``.

    Args:
        path: Location of the document.

    Returns:
        The page count, never less than ``1``.
    """
    target = Path(path)
    if target.exists():
        strategies = (
            _count_pages_from_docx_properties,
            _count_pages_with_pywin32,
            _count_pages_with_powershell_word,
        )
        for strategy in strategies:
            found = strategy(target)
            if found:
                return found
    return 1
def _count_pages_from_docx_properties(file_path: Path) -> int:
if file_path.suffix.lower() != ".docx":
return 0
try:
with ZipFile(file_path) as archive:
root = ElementTree.fromstring(archive.read("docProps/app.xml"))
namespace = {"ep": "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"}
pages = root.find("ep:Pages", namespace)
return max(int((pages.text or "").strip()), 1) if pages is not None else 0
except Exception:
return 0
def _count_pages_with_pywin32(file_path: Path) -> int:
try:
import win32com.client
word = win32com.client.DispatchEx("Word.Application")
word.Visible = False
document = None
try:
document = word.Documents.Open(str(file_path.resolve()), ReadOnly=True)
document.Repaginate()
return max(int(document.ComputeStatistics(2)), 1)
finally:
if document is not None:
document.Close(False)
word.Quit()
except Exception:
return 0
def _count_pages_with_powershell_word(file_path: Path) -> int:
script = r"""
param([string]$Path)
$word = $null
$doc = $null
try {
$word = New-Object -ComObject Word.Application
$word.Visible = $false
$doc = $word.Documents.Open($Path, $false, $true)
$doc.Repaginate()
[Console]::Out.Write($doc.ComputeStatistics(2))
exit 0
} catch {
[Console]::Error.Write($_.Exception.Message)
exit 1
} finally {
if ($doc -ne $null) { $doc.Close($false) | Out-Null }
if ($word -ne $null) { $word.Quit() | Out-Null }
}
"""
try:
completed = subprocess.run(
["powershell.exe", "-NoProfile", "-ExecutionPolicy", "Bypass", "-Command", script, str(file_path.resolve())],
capture_output=True,
check=False,
text=True,
timeout=8,
)
except Exception:
return 0
if completed.returncode != 0:
return 0
try:
return max(int(completed.stdout.strip()), 1)
except ValueError:
return 0