"""Batch driver: run ``main.py`` on every PDF of a dataset split.

For each ``*.pdf`` under ``<split>/PDFs`` a child Python process is started
with ``--pdf`` and ``--out`` arguments, and the result is expected at
``<split>/classifier_output/<stem>.json``.  PDFs that already have an entry
with the same stem in the output directory are skipped, so an interrupted run
can be resumed.
"""
import shutil
import subprocess
import sys
from pathlib import Path

import tqdm

# Directory containing this script; the dataset is expected right next to it.
BASE = Path(__file__).resolve().parent
DATASET_DIR = BASE / "dataset"


def find_split_dir() -> Path:
    """Return the dataset split directory to process.

    Returns:
        ``DATASET_DIR / name`` for the hard-coded split ``name``.

    Raises:
        FileNotFoundError: If that path does not exist or is not a directory.
    """
    name = "eval"  # eval or test
    split_dir = DATASET_DIR / name
    # is_dir() is already False for a missing path, so no separate exists().
    if split_dir.is_dir():
        return split_dir
    # Name the directory that was actually probed instead of listing
    # alternatives that are never tried.
    raise FileNotFoundError(
        f"No split directory found under {DATASET_DIR}. Expected: {name}"
    )


def run_for_pdf(pdf_path: Path, out_path: Path) -> int:
    """Run ``main.py`` on one PDF in a child process.

    Args:
        pdf_path: PDF passed to the child via ``--pdf``.
        out_path: Destination passed to the child via ``--out``; its parent
            directory is created if missing.

    Returns:
        The child's return code (``0`` means success).  On failure the
        child's captured stdout and stderr are printed.
    """
    # Ensure output parent exists
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # NOTE: "main.py" is a relative path, so the child interpreter looks it
    # up in the current working directory, not necessarily in BASE.
    cmd = [sys.executable, "main.py", "--pdf", str(pdf_path), "--out", str(out_path)]
    print(f"Running: {' '.join(cmd)}")
    proc = subprocess.run(cmd, capture_output=True, text=True)

    if proc.returncode != 0:
        print(f"ERROR: main.py failed for {pdf_path.name} (rc={proc.returncode})")
        print(proc.stdout)
        print(proc.stderr)
    else:
        print(f"OK: saved -> {out_path}")
    return proc.returncode


def main():
    """Process every not-yet-handled PDF of the split and print a summary.

    Raises:
        FileNotFoundError: If the split directory or its ``PDFs``
            sub-directory is missing.
    """
    split_dir = find_split_dir()

    pdf_dir = split_dir / "PDFs"
    if not pdf_dir.exists():
        raise FileNotFoundError(f"PDFs directory not found: {pdf_dir}")

    out_dir = split_dir / "classifier_output"
    out_dir.mkdir(parents=True, exist_ok=True)

    pdf_files = sorted(p for p in pdf_dir.iterdir() if p.suffix.lower() == ".pdf")
    if not pdf_files:
        print(f"No PDF files found in {pdf_dir}")
        return

    print(f"Found {len(pdf_files)} PDFs in {pdf_dir}; outputs -> {out_dir}")

    # Snapshot the stems already present once, instead of re-listing the
    # whole output directory for every PDF (quadratic in the file count).
    existing_stems = {entry.stem for entry in out_dir.iterdir()}

    failures = 0
    skipped = 0
    for pdf in tqdm.tqdm(pdf_files, total=len(pdf_files)):
        stem = pdf.stem
        out_path = out_dir / f"{stem}.json"
        # The exists() check also catches outputs created after the snapshot
        # (e.g. by an earlier iteration for a PDF sharing the same stem).
        if stem in existing_stems or out_path.exists():
            skipped += 1
            continue
        rc = run_for_pdf(pdf, out_path)
        if rc != 0:
            failures += 1

    print(
        f"\nDone. Processed: {len(pdf_files)} skipped: {skipped} failures: {failures}"
    )


if __name__ == "__main__":
    main()