Spaces:
Sleeping
Sleeping
File size: 1,976 Bytes
4a76722 df0e7a4 4a76722 df0e7a4 4a76722 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
import subprocess
from pathlib import Path
import sys
import shutil
import tqdm
# Directory containing this script; dataset paths are resolved against it
# rather than against the current working directory.
BASE = Path(__file__).resolve().parent
# Folder under BASE in which the split directories are looked up.
DATASET_DIR = BASE.joinpath("dataset")
def find_split_dir(name: str = "eval", dataset_dir: "Path | None" = None) -> Path:
    """Return the dataset split directory that should be processed.

    Args:
        name: Name of the split directory to look for (e.g. "eval" or "test").
        dataset_dir: Directory that contains the split directories. When
            omitted, the module-level ``DATASET_DIR`` is used.

    Returns:
        Path of the existing split directory.

    Raises:
        FileNotFoundError: if ``<dataset_dir>/<name>`` is missing or is not a
            directory.
    """
    root = DATASET_DIR if dataset_dir is None else dataset_dir
    split = root / name
    # is_dir() is already False for a missing path, so no separate exists().
    if split.is_dir():
        return split
    # Report the directory that was actually checked, so the message cannot
    # drift away from the lookup logic.
    raise FileNotFoundError(
        f"No split directory found under {root}. Expected directory: {split}"
    )
def run_for_pdf(pdf_path: Path, out_path: Path) -> int:
    """Invoke ``main.py`` on a single PDF in a child Python process.

    Args:
        pdf_path: PDF handed to ``main.py`` through ``--pdf``.
        out_path: Destination handed to ``main.py`` through ``--out``; its
            parent directory is created beforehand if it does not exist.

    Returns:
        The exit code of the child process; 0 is treated as success.
    """
    # Create the destination folder up front.
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Same interpreter as the current process; "main.py" is resolved relative
    # to the current working directory.
    cmd = [sys.executable, "main.py"]
    cmd += ["--pdf", str(pdf_path)]
    cmd += ["--out", str(out_path)]
    print(f"Running: {' '.join(cmd)}")

    # Both streams are captured as text and only echoed when the run fails.
    proc = subprocess.run(cmd, capture_output=True, text=True)
    if proc.returncode == 0:
        print(f"OK: saved -> {out_path}")
        return proc.returncode

    print(f"ERROR: main.py failed for {pdf_path.name} (rc={proc.returncode})")
    print(proc.stdout)
    print(proc.stderr)
    return proc.returncode
def main() -> None:
    """Run ``main.py`` on every PDF of the split that has no output yet.

    PDFs are read from ``<split>/PDFs`` and one ``<stem>.json`` per PDF is
    requested in ``<split>/classifier_output``. A PDF is skipped when a file
    with the same stem already exists in the output directory, so an
    interrupted run can be resumed.

    Raises:
        FileNotFoundError: if the split directory (raised by
            ``find_split_dir``) or its ``PDFs`` directory is missing.
    """
    split_dir = find_split_dir()
    pdf_dir = split_dir / "PDFs"
    # is_dir() also rejects a regular file named "PDFs", which would otherwise
    # fail later inside iterdir().
    if not pdf_dir.is_dir():
        raise FileNotFoundError(f"PDFs directory not found: {pdf_dir}")

    out_dir = split_dir / "classifier_output"
    out_dir.mkdir(parents=True, exist_ok=True)

    # Only regular files count; a directory named "x.pdf" is ignored.
    pdf_files = sorted(
        p for p in pdf_dir.iterdir() if p.is_file() and p.suffix.lower() == ".pdf"
    )
    if not pdf_files:
        print(f"No PDF files found in {pdf_dir}")
        return

    print(f"Found {len(pdf_files)} PDFs in {pdf_dir}; outputs -> {out_dir}")

    # Snapshot the finished stems once instead of re-listing the output
    # directory for every PDF.
    done_stems = {p.stem for p in out_dir.iterdir()}

    processed = 0
    skipped = 0
    failures = 0
    for pdf in tqdm.tqdm(pdf_files, total=len(pdf_files)):
        stem = pdf.stem
        if stem in done_stems:
            skipped += 1
            continue

        rc = run_for_pdf(pdf, out_dir / f"{stem}.json")
        processed += 1
        if rc != 0:
            failures += 1
        else:
            # A later PDF with the same stem (e.g. differing only in suffix
            # case) targets the same output file, so mark the stem as done.
            done_stems.add(stem)

    # Report what actually ran; skipped PDFs are counted separately.
    print(f"\nDone. Processed: {processed} skipped: {skipped} failures: {failures}")
# Script entry point: run the batch only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|