187 lines
6.6 KiB
Python
187 lines
6.6 KiB
Python
from __future__ import annotations
|
|
|
|
import subprocess
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from pathlib import Path
|
|
from zipfile import ZipFile
|
|
from xml.etree import ElementTree
|
|
|
|
from review_agent.models import RegulatoryInfoPackageBatch
|
|
from review_agent.regulatory_info_package.constants import GENERATED_FILE_FAILED
|
|
from review_agent.regulatory_info_package.schemas import GeneratedFileResult, MergedField, TemplateSpec
|
|
from review_agent.regulatory_info_package.services.docx_document import write_docx_from_template
|
|
from review_agent.regulatory_info_package.services.legacy_doc_document import write_legacy_doc_or_fallback
|
|
from review_agent.regulatory_info_package.services.template_repository import copy_template_to_batch, template_specs
|
|
from review_agent.regulatory_info_package.storage import ensure_batch_subdir
|
|
|
|
|
|
def generate_package_documents(
    batch: RegulatoryInfoPackageBatch,
    config: dict,
    merged_fields: dict[str, MergedField],
) -> list[GeneratedFileResult]:
    """Generate every configured document for ``batch``.

    Templates whose code is ``"ch1_2_directory"`` are rendered last because
    they receive page numbers computed from the other generated files. All
    other templates are rendered concurrently on a small thread pool.

    Args:
        batch: Batch the generated files belong to.
        config: Configuration passed through to the template helpers.
        merged_fields: Field values to write into the templates.

    Returns:
        One result per template: content templates first, in the order
        returned by ``template_specs``, followed by the directory templates.
    """
    specs = template_specs(config)
    directory_specs = [spec for spec in specs if spec.code == "ch1_2_directory"]
    content_specs = [spec for spec in specs if spec.code != "ch1_2_directory"]

    results: list[GeneratedFileResult] = []
    # ``len(...) or 1`` keeps max_workers positive when there is nothing to render.
    with ThreadPoolExecutor(max_workers=min(4, len(content_specs) or 1)) as executor:
        futures = [
            executor.submit(_generate_one, batch, config, spec, merged_fields)
            for spec in content_specs
        ]
        # Collect in submission order rather than completion order so the
        # returned list (and any later "last one wins" lookups built from it)
        # is deterministic across runs. The work still runs concurrently.
        results.extend(future.result() for future in futures)

    page_numbers = _directory_page_numbers(results)
    for spec in directory_specs:
        results.append(
            _generate_one(batch, config, spec, merged_fields, directory_page_numbers=page_numbers)
        )
    return results
|
|
|
|
|
|
def _generate_one(
    batch: RegulatoryInfoPackageBatch,
    config: dict,
    spec: TemplateSpec,
    merged_fields: dict[str, MergedField],
    *,
    directory_page_numbers: dict[str, str] | None = None,
) -> GeneratedFileResult:
    """Render a single template into the batch's ``generated`` directory.

    Any exception raised while rendering is converted into a result with
    status ``GENERATED_FILE_FAILED`` instead of propagating.

    Args:
        batch: Batch the output belongs to.
        config: Configuration forwarded to ``copy_template_to_batch``.
        spec: Template to render.
        merged_fields: Field values to write into the template.
        directory_page_numbers: Optional page-number mapping, forwarded only
            to the docx writer.

    Returns:
        A result describing the produced file, or the failure.
    """
    try:
        source_template = copy_template_to_batch(batch, config, spec)
        target = ensure_batch_subdir(batch, "generated") / spec.output_name

        if spec.file_format != "doc":
            highlight_count, missing_count, llm_only_count = write_docx_from_template(
                source_template,
                target,
                merged_fields,
                template_code=spec.code,
                directory_page_numbers=directory_page_numbers,
            )
            written_path, written_format, outcome = target, "docx", "success"
        else:
            # The legacy writer reports the path it actually wrote and its own
            # status; its third return value is not used here.
            written_path, outcome, _adapter_summary = write_legacy_doc_or_fallback(
                source_template, target, merged_fields
            )
            written_format = written_path.suffix.lower().lstrip(".")
            # No highlight/missing/LLM-only counts are reported for this path.
            highlight_count = missing_count = llm_only_count = 0

        return GeneratedFileResult(
            template_code=spec.code,
            file_name=written_path.name,
            requested_format=spec.file_format,
            actual_format=written_format,
            status=outcome,
            path=str(written_path),
            highlight_count=highlight_count,
            missing_count=missing_count,
            llm_only_count=llm_only_count,
        )
    except Exception as exc:
        # Best-effort: report the failure for this template without aborting
        # the rest of the package.
        return GeneratedFileResult(
            template_code=spec.code,
            file_name=spec.output_name,
            requested_format=spec.file_format,
            actual_format=spec.file_format,
            status=GENERATED_FILE_FAILED,
            error_message=str(exc),
        )
|
|
|
|
|
|
def _directory_page_numbers(results: list[GeneratedFileResult]) -> dict[str, str]:
    """Build the mapping of directory code to page-count string.

    The mapping always starts with ``{"CH1.2": "1"}``. Each result whose
    status is ``"success"`` or ``"fallback_success"``, that has a path, and
    whose file name yields a directory code contributes the page count of its
    file (as a string). Later results overwrite earlier ones for the same code.
    """
    numbers_by_code = {"CH1.2": "1"}
    usable_statuses = {"success", "fallback_success"}
    for item in results:
        if item.status in usable_statuses and item.path:
            chapter_code = _directory_code_from_file_name(item.file_name)
            if chapter_code:
                numbers_by_code[chapter_code] = str(count_document_pages(item.path))
    return numbers_by_code
|
|
|
|
|
|
def _directory_code_from_file_name(file_name: str) -> str:
    """Return the leading ``CH…`` token of a file name's stem, or ``""``.

    The stem (file name without its final suffix) is stripped of surrounding
    whitespace; if it starts with ``"CH"`` the first whitespace-delimited
    token is returned, otherwise an empty string.
    """
    base_name = Path(file_name).stem.strip()
    if not base_name.startswith("CH"):
        return ""
    first_token = base_name.split(maxsplit=1)[0]
    return first_token
|
|
|
|
|
|
def count_document_pages(path: str | Path) -> int:
    """Return the page count of the document at ``path``, never less than 1.

    Three strategies are tried in order and the first non-zero answer wins:
    the page count stored in the docx properties, Word automation through
    pywin32, and Word automation through PowerShell. If the file does not
    exist, or every strategy reports 0, the result is 1.
    """
    document = Path(path)
    if not document.exists():
        return 1

    strategies = (
        _count_pages_from_docx_properties,
        _count_pages_with_pywin32,
        _count_pages_with_powershell_word,
    )
    for strategy in strategies:
        page_total = strategy(document)
        if page_total:
            return page_total
    return 1
|
|
|
|
|
|
def _count_pages_from_docx_properties(file_path: Path) -> int:
    """Read the page count stored in a ``.docx`` file's ``docProps/app.xml``.

    Returns 0 when the file is not a ``.docx``, when the ``Pages`` element is
    absent, or when anything goes wrong while reading or parsing. Otherwise
    returns the stored value, raised to at least 1. The value is whatever is
    recorded in the file; it is not recomputed here.
    """
    if file_path.suffix.lower() != ".docx":
        return 0

    extended_ns = {"ep": "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"}
    try:
        with ZipFile(file_path) as package:
            app_xml = package.read("docProps/app.xml")
        pages_node = ElementTree.fromstring(app_xml).find("ep:Pages", extended_ns)
        if pages_node is None:
            return 0
        stored_value = (pages_node.text or "").strip()
        return max(int(stored_value), 1)
    except Exception:
        # Unreadable archive, missing part, malformed XML or non-numeric
        # value: signal "unknown" so the caller can try another strategy.
        return 0
|
|
|
|
|
|
def _count_pages_with_pywin32(file_path: Path) -> int:
    """Count pages by automating Microsoft Word through pywin32.

    Opens the document read-only in a dedicated Word instance, repaginates
    it and asks Word for its page statistic.

    Args:
        file_path: Document to measure.

    Returns:
        The page count (at least 1), or 0 if pywin32 or Word is unavailable
        or any step of the automation fails.
    """
    try:
        import win32com.client

        word = win32com.client.DispatchEx("Word.Application")
        # From here on Word is running: every exit path must reach Quit(),
        # otherwise a hidden WINWORD.EXE process is left behind.
        try:
            word.Visible = False
            document = word.Documents.Open(str(file_path.resolve()), ReadOnly=True)
            try:
                document.Repaginate()
                # 2 is Word's wdStatisticPages constant.
                return max(int(document.ComputeStatistics(2)), 1)
            finally:
                # Close without saving; if this raises, the outer finally
                # still shuts Word down.
                document.Close(False)
        finally:
            word.Quit()
    except Exception:
        # Best-effort strategy: report "unknown" so the caller can fall back.
        return 0
|
|
|
|
|
|
def _count_pages_with_powershell_word(file_path: Path) -> int:
    """Count pages by automating Microsoft Word from a PowerShell subprocess.

    The document path is handed to the script through an environment
    variable. Arguments placed after ``-Command <script>`` are appended to
    the command text by PowerShell rather than bound to a ``param()`` block,
    so passing the path as a trailing argument leaves ``$Path`` empty.

    Args:
        file_path: Document to measure.

    Returns:
        The page count (at least 1), or 0 if PowerShell cannot be started,
        the script times out or exits non-zero, or its output is not a number.
    """
    import os  # local import: only this helper needs the process environment

    script = r"""
$Path = $env:REVIEW_AGENT_PAGE_COUNT_PATH
$word = $null
$doc = $null
try {
    $word = New-Object -ComObject Word.Application
    $word.Visible = $false
    $doc = $word.Documents.Open($Path, $false, $true)
    $doc.Repaginate()
    [Console]::Out.Write($doc.ComputeStatistics(2))
    exit 0
} catch {
    [Console]::Error.Write($_.Exception.Message)
    exit 1
} finally {
    if ($doc -ne $null) { $doc.Close($false) | Out-Null }
    if ($word -ne $null) { $word.Quit() | Out-Null }
}
"""
    try:
        child_env = {**os.environ, "REVIEW_AGENT_PAGE_COUNT_PATH": str(file_path.resolve())}
        completed = subprocess.run(
            ["powershell.exe", "-NoProfile", "-ExecutionPolicy", "Bypass", "-Command", script],
            capture_output=True,
            check=False,
            text=True,
            timeout=8,
            env=child_env,
        )
    except Exception:
        # powershell.exe missing (non-Windows), timeout, or spawn failure.
        return 0

    if completed.returncode != 0:
        return 0
    try:
        return max(int(completed.stdout.strip()), 1)
    except ValueError:
        return 0
|