File size: 1,976 Bytes
4a76722
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df0e7a4
 
4a76722
df0e7a4
4a76722
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import subprocess
from pathlib import Path
import sys
import shutil
import tqdm

# Absolute directory that contains this script; the dataset folder is located relative to it.
BASE = Path(__file__).resolve().parent
# Folder under BASE that holds the split directories (see find_split_dir).
DATASET_DIR = BASE.joinpath("dataset")


def find_split_dir(name: str = "eval", dataset_dir: "Path | None" = None) -> Path:
    """Return the directory of one dataset split.

    Args:
        name: Name of the split directory to look for, e.g. ``"eval"`` or
            ``"test"``. Defaults to ``"eval"``, the value that was previously
            hard-coded.
        dataset_dir: Directory that contains the split directories. Defaults
            to the module-level ``DATASET_DIR``.

    Returns:
        The path ``dataset_dir / name``.

    Raises:
        FileNotFoundError: If that path does not exist or is not a directory.
    """
    root = DATASET_DIR if dataset_dir is None else Path(dataset_dir)
    split = root / name
    # is_dir() is already False for a missing path, so no separate exists() check.
    if split.is_dir():
        return split
    # Report the name that was actually looked up; the old message listed
    # candidates ("val", "validation") that were never checked.
    raise FileNotFoundError(f"Split directory {name!r} not found under {root}")


def run_for_pdf(pdf_path: Path, out_path: Path) -> int:
    """Run ``main.py`` on a single PDF in a child process.

    The child is started with the current interpreter and receives the PDF
    path via ``--pdf`` and the destination via ``--out``. ``main.py`` is
    given as a relative path, so it is resolved against the current working
    directory. The child's stdout and stderr are captured and only printed
    when it exits with a non-zero status.

    Args:
        pdf_path: PDF file handed to ``main.py``.
        out_path: Output file path handed to ``main.py``; its parent
            directory is created if it does not exist yet.

    Returns:
        The exit status of the child process (``0`` means success).
    """
    # Create the destination folder up front.
    out_path.parent.mkdir(parents=True, exist_ok=True)

    argv = [sys.executable, "main.py", "--pdf", str(pdf_path), "--out", str(out_path)]
    print("Running: " + " ".join(argv))

    result = subprocess.run(argv, capture_output=True, text=True)
    status = result.returncode

    if status == 0:
        print(f"OK: saved -> {out_path}")
        return status

    # Failure path: surface everything the child wrote to help diagnosis.
    print(f"ERROR: main.py failed for {pdf_path.name} (rc={status})")
    print(result.stdout)
    print(result.stderr)
    return status


def main():
    """Run ``main.py`` over every PDF of the split that has no output yet.

    PDFs are read from ``<split>/PDFs`` and results are written to
    ``<split>/classifier_output/<stem>.json``. A PDF is skipped when the
    output directory already contains an entry with the same stem (any
    extension), so an interrupted run can be resumed.

    Raises:
        FileNotFoundError: If the split directory or its ``PDFs`` folder is
            missing.
    """
    split_dir = find_split_dir()
    pdf_dir = split_dir / "PDFs"
    if not pdf_dir.exists():
        raise FileNotFoundError(f"PDFs directory not found: {pdf_dir}")

    out_dir = split_dir / "classifier_output"
    out_dir.mkdir(parents=True, exist_ok=True)

    # Only regular files: a directory that happens to be named "*.pdf" must
    # not be handed to main.py.
    pdf_files = sorted(
        p for p in pdf_dir.iterdir() if p.is_file() and p.suffix.lower() == ".pdf"
    )
    if not pdf_files:
        print(f"No PDF files found in {pdf_dir}")
        return

    print(f"Found {len(pdf_files)} PDFs in {pdf_dir}; outputs -> {out_dir}")

    # List the output directory once instead of once per PDF; a set gives
    # O(1) membership tests.
    done_stems = {entry.stem for entry in out_dir.iterdir()}

    processed = 0
    skipped = 0
    failures = 0
    for pdf in tqdm.tqdm(pdf_files):
        stem = pdf.stem
        if stem in done_stems:
            skipped += 1
            continue

        out_path = out_dir / f"{stem}.json"
        processed += 1
        if run_for_pdf(pdf, out_path) != 0:
            failures += 1

        # Keep the set in sync with the directory so a later PDF with the
        # same stem (e.g. "a.pdf" and "a.PDF") is still skipped.
        if out_path.exists():
            done_stems.add(stem)

    # Report what was actually run; skipped files are counted separately.
    print(f"\nDone. Processed: {processed}  skipped: {skipped}  failures: {failures}")


# Start the batch run only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()