pack

#!/usr/bin/env python3
"""
Tool to pack a directory into a .docx, .pptx, or .xlsx file with XML formatting undone.

Example usage:
    python pack.py <input_directory> <office_file> [--force]
"""

import argparse
import shutil
import subprocess
import sys
import tempfile
import zipfile
from pathlib import Path

import defusedxml.minidom

def main():
    """Command-line entry point: parse the arguments and pack the directory.

    Exits with status 1 when validation rejects the packed document, and with
    an "Error: ..." message when pack_document raises ValueError.
    """
    cli = argparse.ArgumentParser(description="Pack a directory into an Office file")
    cli.add_argument("input_directory", help="Unpacked Office document directory")
    cli.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)")
    cli.add_argument("--force", action="store_true", help="Skip validation")
    options = cli.parse_args()
    skip_validation = options.force

    try:
        packed_ok = pack_document(
            options.input_directory,
            options.output_file,
            validate=not skip_validation,
        )

        if skip_validation:
            # Nothing was checked, so tell the user the result is unverified.
            print("Warning: Skipped validation, file may be corrupt", file=sys.stderr)
        elif not packed_ok:
            # Validation ran and rejected the document: explain and fail.
            failure_lines = (
                "Contents would produce a corrupt file.",
                "Please validate XML before repacking.",
                "Use --force to skip validation and pack anyway.",
            )
            for line in failure_lines:
                print(line, file=sys.stderr)
            sys.exit(1)
    except ValueError as exc:
        sys.exit(f"Error: {exc}")

def pack_document(input_dir, output_file, validate=False):
    """Pack a directory into an Office file (.docx/.pptx/.xlsx).

    The directory is copied to a temporary location, every *.xml / *.rels
    file in the copy is condensed, and the copy is zipped. The archive is
    staged inside the temporary directory and only copied to output_file once
    it has been accepted, so a failed validation never overwrites or deletes
    a file that already exists at output_file.

    Args:
        input_dir: Path to unpacked Office document directory
        output_file: Path to output Office file
        validate: If True, validates with soffice (default: False)

    Returns:
        bool: True if successful, False if validation failed

    Raises:
        ValueError: If input_dir is not a directory or output_file does not
            end in .docx, .pptx, or .xlsx.
    """
    input_dir = Path(input_dir)
    output_file = Path(output_file)

    if not input_dir.is_dir():
        raise ValueError(f"{input_dir} is not a directory")
    if output_file.suffix.lower() not in {".docx", ".pptx", ".xlsx"}:
        raise ValueError(f"{output_file} must be a .docx, .pptx, or .xlsx file")

    # Work in temporary directory to avoid modifying original
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_root = Path(temp_dir)
        temp_content_dir = temp_root / "content"
        shutil.copytree(input_dir, temp_content_dir)

        # Process XML files to remove pretty-printing whitespace
        for pattern in ("*.xml", "*.rels"):
            for xml_file in temp_content_dir.rglob(pattern):
                # A directory may happen to match the pattern; only files are XML
                if xml_file.is_file():
                    condense_xml(xml_file)

        # Build the archive next to the content copy, keeping the final file
        # name so validation sees the right extension.
        staged_file = temp_root / output_file.name
        with zipfile.ZipFile(staged_file, "w", zipfile.ZIP_DEFLATED) as zf:
            # Sorted so the member order does not depend on filesystem order
            for f in sorted(temp_content_dir.rglob("*")):
                if f.is_file():
                    zf.write(f, f.relative_to(temp_content_dir).as_posix())

        # Validate the staged archive; the destination is untouched on failure
        if validate and not validate_document(staged_file):
            return False

        # Publish the accepted archive
        output_file.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(staged_file, output_file)

    return True

def validate_document(doc_path, timeout=10):
    """Validate document by converting to HTML with soffice.

    Args:
        doc_path: Path (or path string) of the .docx/.pptx/.xlsx file to check.
        timeout: Seconds to wait for the soffice conversion before treating
            it as a failure (default: 10).

    Returns:
        bool: True if soffice produced the HTML output, or if soffice is not
        installed (validation is skipped with a warning). False if the
        extension is unsupported, no output was produced, the conversion
        timed out, or running it raised an error.
    """
    doc_path = Path(doc_path)

    # Determine the correct filter based on file extension
    export_filters = {
        ".docx": "html:HTML",
        ".pptx": "html:impress_html_Export",
        ".xlsx": "html:HTML (StarCalc)",
    }
    filter_name = export_filters.get(doc_path.suffix.lower())
    if filter_name is None:
        # Report the real cause instead of failing later on an unset filter
        print(
            f"Validation error: Unsupported file type '{doc_path.suffix}'",
            file=sys.stderr,
        )
        return False

    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            result = subprocess.run(
                [
                    "soffice",
                    "--headless",
                    "--convert-to",
                    filter_name,
                    "--outdir",
                    temp_dir,
                    str(doc_path),
                ],
                capture_output=True,
                timeout=timeout,
                text=True,
            )
            # Success is judged by the converted file existing, not by the
            # process exit status.
            if not (Path(temp_dir) / f"{doc_path.stem}.html").exists():
                error_msg = result.stderr.strip() or "Document validation failed"
                print(f"Validation error: {error_msg}", file=sys.stderr)
                return False
            return True
        except FileNotFoundError:
            # No soffice executable: validation is best-effort, so let it pass
            print("Warning: soffice not found. Skipping validation.", file=sys.stderr)
            return True
        except subprocess.TimeoutExpired:
            print("Validation error: Timeout during conversion", file=sys.stderr)
            return False
        except Exception as e:
            # Any other failure to run the conversion counts as invalid
            print(f"Validation error: {e}", file=sys.stderr)
            return False

def condense_xml(xml_file):
    """Strip unnecessary whitespace and remove comments, rewriting the file in place.

    Whitespace-only text nodes and comment nodes are removed from every
    element except those that hold literal text: elements whose local name is
    "t" (with or without a namespace prefix, e.g. "w:t", "a:t", or the
    unprefixed "t" that SpreadsheetML shared strings use) and elements marked
    xml:space="preserve". The result is written back as UTF-8.

    Args:
        xml_file: Path to the XML file to condense.
    """
    # Parse from bytes so the parser honors the encoding declared in the file
    with open(xml_file, "rb") as f:
        dom = defusedxml.minidom.parse(f)

    # Process each element to remove whitespace and comments
    for element in dom.getElementsByTagName("*"):
        # Skip text elements: their whitespace is document content, not
        # pretty-printing. Match on the local name so unprefixed <t> counts.
        local_name = element.tagName.rpartition(":")[2]
        if local_name == "t" or element.getAttribute("xml:space") == "preserve":
            continue

        # Iterate over a copy because children are removed during the loop
        for child in list(element.childNodes):
            is_blank_text = (
                child.nodeType == child.TEXT_NODE
                and child.nodeValue
                and child.nodeValue.strip() == ""
            )
            if is_blank_text or child.nodeType == child.COMMENT_NODE:
                element.removeChild(child)

    # Write back the condensed XML
    with open(xml_file, "wb") as f:
        f.write(dom.toxml(encoding="UTF-8"))

# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()

Search AI Tools

Install this agent skill to your Project

SKILL.md