Spaces:
Sleeping
Sleeping
| import subprocess | |
| from pathlib import Path | |
| import sys | |
| import shutil | |
| import tqdm | |
# Absolute directory containing this script; every dataset path is derived
# from it rather than from the current working directory.
BASE = Path(__file__).resolve().parent
# Folder named "dataset" located directly next to this script.
DATASET_DIR = BASE.joinpath("dataset")
def find_split_dir(name: str = "eval") -> Path:
    """Return the dataset split directory to run the batch over.

    Args:
        name: Sub-directory of ``DATASET_DIR`` to use. Defaults to ``"eval"``,
            the split that was previously hard-coded; pass e.g. ``"test"`` to
            target another split.

    Returns:
        ``DATASET_DIR / name`` when it exists and is a directory.

    Raises:
        FileNotFoundError: If that path does not exist or is not a directory.
    """
    split_dir = DATASET_DIR / name
    # Path.is_dir() is already False for a missing path, so a separate
    # exists() check is not needed.
    if split_dir.is_dir():
        return split_dir
    # Name the directory that was actually checked; the previous message listed
    # candidates ("val", "validation") that were never looked for.
    raise FileNotFoundError(
        f"No split directory found under {DATASET_DIR}. Expected: {name}"
    )
def run_for_pdf(pdf_path: Path, out_path: Path) -> int:
    """Run ``main.py`` on a single PDF in a child Python process.

    The child is started with the same interpreter as this script and is
    invoked as ``main.py --pdf <pdf_path> --out <out_path>``. ``main.py`` is
    passed as a relative path and no working directory is set, so it is
    looked up relative to the current working directory.

    Args:
        pdf_path: PDF file handed to ``main.py`` via ``--pdf``.
        out_path: Destination handed to ``main.py`` via ``--out``; its parent
            directory is created if it does not exist yet.

    Returns:
        The exit code of the child process (``0`` means success).
    """
    # The output folder must exist before the child process is started.
    out_path.parent.mkdir(parents=True, exist_ok=True)

    command = [
        sys.executable,
        "main.py",
        "--pdf",
        str(pdf_path),
        "--out",
        str(out_path),
    ]
    print(f"Running: {' '.join(command)}")

    # Capture both streams as text so they can be shown only when the run fails.
    result = subprocess.run(command, capture_output=True, text=True)
    status = result.returncode

    if status == 0:
        print(f"OK: saved -> {out_path}")
        return status

    print(f"ERROR: main.py failed for {pdf_path.name} (rc={status})")
    print(result.stdout)
    print(result.stderr)
    return status
def main():
    """Run ``main.py`` over every PDF of the split that has no output yet.

    PDFs are read from ``<split>/PDFs`` and results are written to
    ``<split>/classifier_output/<stem>.json``. A PDF is skipped when the output
    directory already contains an entry with the same stem (any extension), so
    an interrupted run can be resumed.

    Raises:
        FileNotFoundError: If the split directory or its ``PDFs`` sub-directory
            is missing.
    """
    split_dir = find_split_dir()
    pdf_dir = split_dir / "PDFs"
    if not pdf_dir.exists():
        raise FileNotFoundError(f"PDFs directory not found: {pdf_dir}")

    out_dir = split_dir / "classifier_output"
    out_dir.mkdir(parents=True, exist_ok=True)

    # Sorted so the processing order is stable between runs.
    pdf_files = sorted(p for p in pdf_dir.iterdir() if p.suffix.lower() == ".pdf")
    if not pdf_files:
        print(f"No PDF files found in {pdf_dir}")
        return

    print(f"Found {len(pdf_files)} PDFs in {pdf_dir}; outputs -> {out_dir}")

    # List the output directory once instead of once per PDF; the set is kept
    # current below as new outputs appear.
    done_stems = {entry.stem for entry in out_dir.iterdir()}

    processed = 0
    skipped = 0
    failures = 0
    for pdf in tqdm.tqdm(pdf_files, total=len(pdf_files)):
        stem = pdf.stem
        if stem in done_stems:
            skipped += 1
            continue

        out_path = out_dir / f"{stem}.json"
        processed += 1
        if run_for_pdf(pdf, out_path) != 0:
            failures += 1

        # Two inputs can share a stem (e.g. "a.pdf" and "a.PDF"); record the
        # fresh output so the second one is skipped, as a re-listing would do.
        if out_path.exists():
            done_stems.add(stem)

    # Report what was actually run; skipped PDFs are counted separately rather
    # than being included in the processed total.
    print(f"\nDone. Processed: {processed} skipped: {skipped} failures: {failures}")
# Start the batch run only when this file is executed as a script, so that
# importing it has no side effects.
if __name__ == "__main__":
    main()