feat(regulatory-info-package): 完善目录页码与组成成分填充

This commit is contained in:
2026-06-10 23:56:40 +08:00
parent 3bcf9647a1
commit 1bf8634373
13 changed files with 296 additions and 82 deletions

View File

@@ -1,7 +1,10 @@
from __future__ import annotations
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from zipfile import ZipFile
from xml.etree import ElementTree
from review_agent.models import RegulatoryInfoPackageBatch
from review_agent.regulatory_info_package.constants import GENERATED_FILE_FAILED
@@ -18,9 +21,16 @@ def generate_package_documents(
merged_fields: dict[str, MergedField],
) -> list[GeneratedFileResult]:
specs = template_specs(config)
with ThreadPoolExecutor(max_workers=min(4, len(specs) or 1)) as executor:
futures = [executor.submit(_generate_one, batch, config, spec, merged_fields) for spec in specs]
return [future.result() for future in as_completed(futures)]
directory_specs = [spec for spec in specs if spec.code == "ch1_2_directory"]
content_specs = [spec for spec in specs if spec.code != "ch1_2_directory"]
results: list[GeneratedFileResult] = []
with ThreadPoolExecutor(max_workers=min(4, len(content_specs) or 1)) as executor:
futures = [executor.submit(_generate_one, batch, config, spec, merged_fields) for spec in content_specs]
results.extend(future.result() for future in as_completed(futures))
page_numbers = _directory_page_numbers(results)
for spec in directory_specs:
results.append(_generate_one(batch, config, spec, merged_fields, directory_page_numbers=page_numbers))
return results
def _generate_one(
@@ -28,6 +38,8 @@ def _generate_one(
config: dict,
spec: TemplateSpec,
merged_fields: dict[str, MergedField],
*,
directory_page_numbers: dict[str, str] | None = None,
) -> GeneratedFileResult:
try:
template_path = copy_template_to_batch(batch, config, spec)
@@ -44,6 +56,7 @@ def _generate_one(
output_path,
merged_fields,
template_code=spec.code,
directory_page_numbers=directory_page_numbers,
)
actual_path = output_path
actual_format = "docx"
@@ -68,3 +81,106 @@ def _generate_one(
status=GENERATED_FILE_FAILED,
error_message=str(exc),
)
def _directory_page_numbers(results: list[GeneratedFileResult]) -> dict[str, str]:
page_numbers = {"CH1.2": "1"}
for result in results:
if result.status not in {"success", "fallback_success"} or not result.path:
continue
code = _directory_code_from_file_name(result.file_name)
if not code:
continue
page_numbers[code] = str(count_document_pages(result.path))
return page_numbers
def _directory_code_from_file_name(file_name: str) -> str:
stem = Path(file_name).stem.strip()
return stem.split()[0] if stem.startswith("CH") else ""
def count_document_pages(path: str | Path) -> int:
    """Return the page count of the document at *path*, never less than 1.

    Three probes are tried in order and the first non-zero answer wins:
    the ``.docx`` extended properties, Word automation through pywin32,
    and Word automation through PowerShell. When the file does not exist
    or every probe reports 0, the result is 1.
    """
    target = Path(path)
    if target.exists():
        # NOTE(review): the first probe reads a page count stored inside the
        # package; presumably that value is only as fresh as the last save
        # by an application that maintains it -- confirm for generated files.
        probes = (
            _count_pages_from_docx_properties,
            _count_pages_with_pywin32,
            _count_pages_with_powershell_word,
        )
        for probe in probes:
            counted = probe(target)
            if counted:
                return counted
    return 1
def _count_pages_from_docx_properties(file_path: Path) -> int:
if file_path.suffix.lower() != ".docx":
return 0
try:
with ZipFile(file_path) as archive:
root = ElementTree.fromstring(archive.read("docProps/app.xml"))
namespace = {"ep": "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"}
pages = root.find("ep:Pages", namespace)
return max(int((pages.text or "").strip()), 1) if pages is not None else 0
except Exception:
return 0
def _count_pages_with_pywin32(file_path: Path) -> int:
try:
import win32com.client
word = win32com.client.DispatchEx("Word.Application")
word.Visible = False
document = None
try:
document = word.Documents.Open(str(file_path.resolve()), ReadOnly=True)
document.Repaginate()
return max(int(document.ComputeStatistics(2)), 1)
finally:
if document is not None:
document.Close(False)
word.Quit()
except Exception:
return 0
def _count_pages_with_powershell_word(file_path: Path) -> int:
script = r"""
param([string]$Path)
$word = $null
$doc = $null
try {
$word = New-Object -ComObject Word.Application
$word.Visible = $false
$doc = $word.Documents.Open($Path, $false, $true)
$doc.Repaginate()
[Console]::Out.Write($doc.ComputeStatistics(2))
exit 0
} catch {
[Console]::Error.Write($_.Exception.Message)
exit 1
} finally {
if ($doc -ne $null) { $doc.Close($false) | Out-Null }
if ($word -ne $null) { $word.Quit() | Out-Null }
}
"""
try:
completed = subprocess.run(
["powershell.exe", "-NoProfile", "-ExecutionPolicy", "Bypass", "-Command", script, str(file_path.resolve())],
capture_output=True,
check=False,
text=True,
timeout=8,
)
except Exception:
return 0
if completed.returncode != 0:
return 0
try:
return max(int(completed.stdout.strip()), 1)
except ValueError:
return 0