import marimo

__generated_with = "0.10.9"
app = marimo.App(width="medium")


@app.cell
def _():
    import marimo as mo
    return (mo,)


@app.cell
def _(mo):
    mo.md(
        """
        # VLM vs Text: Extracting Metadata from Book Covers

        **The Task**: Libraries and archives have millions of digitized book covers with incomplete or missing metadata. Can we use AI to automatically extract titles and other metadata?

        **The Question**: Should we use Vision-Language Models (VLMs) that "see" the cover image, or extract the text first and send it to a standard LLM?

        **The Answer**: VLMs win decisively for this task.

        ---

        This evaluation uses the [DOAB (Directory of Open Access Books)](https://huggingface.co/datasets/biglam/doab-metadata-extraction) dataset of academic book covers. We compare two approaches:

        | Approach | How it works |
        |----------|--------------|
        | **VLM**  | Send the cover image directly to a Vision-Language Model |
        | **Text** | Extract text from the image first (OCR), then send it to an LLM |
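
        As a rough sketch of how the two conditions differ, using Inspect AI's chat content types (the prompt wording and file name here are illustrative, not the exact ones used in the evals):

        ```python
        from inspect_ai.model import ChatMessageUser, ContentImage, ContentText

        prompt = "Extract the title of this book."      # illustrative prompt
        extracted_text = "OCR output for the cover..."  # stand-in page text

        # VLM condition: the model sees the cover image itself
        vlm_input = [
            ChatMessageUser(
                content=[ContentImage(image="cover.png"), ContentText(text=prompt)]
            )
        ]

        # Text condition: the model sees only the pre-extracted page text
        text_input = [
            ChatMessageUser(
                content=[ContentText(text=prompt), ContentText(text=extracted_text)]
            )
        ]
        ```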

        ---

        ## Evaluation Results

        Select a task below to see how different models performed:
        """
    )
    return


@app.cell
def _():
    import pandas as pd
    import altair as alt
    from inspect_ai.analysis import evals_df
    return alt, evals_df, pd


@app.cell
def _(evals_df, mo):
    # Load evaluation results with persistent caching
    with mo.persistent_cache(name="doab_evals"):
        df_raw = evals_df(
            "hf://datasets/davanstrien/doab-title-extraction-evals", quiet=True
        )

    # Add metadata columns: task names containing "vlm" are the vision runs
    df_raw["approach"] = df_raw["task_name"].apply(
        lambda x: "VLM" if "vlm" in x else "Text"
    )
    df_raw["model_short"] = df_raw["model"].apply(lambda x: x.split("/")[-1])

    # Determine task category from the task name
    def get_task_category(task_name):
        if "llm_judge" in task_name:
            return "Full Metadata"
        return "Title Extraction"

    df_raw["task_category"] = df_raw["task_name"].apply(get_task_category)

    # Convert the headline score to a percentage
    df_raw["accuracy"] = df_raw["score_headline_value"] * 100

    # Parameter sizes and model card URLs
    model_info = {
        "hf-inference-providers/Qwen/Qwen3-VL-8B-Instruct": {
            "params": 8,
            "url": "https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct",
        },
        "hf-inference-providers/Qwen/Qwen3-VL-30B-A3B-Thinking": {
            "params": 30,
"url": "https://huggingface.co/Qwen/Qwen3-VL-30B-A3B"
        },
        "hf-inference-providers/zai-org/GLM-4.6V-Flash": {
            "params": 9,
            "url": "https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking",
        },
        "hf-inference-providers/openai/gpt-oss-20b": {
            "params": 20,
            "url": "https://huggingface.co/openai/gpt-oss-20b",
        },
        "hf-inference-providers/Qwen/Qwen3-4B-Instruct-2507": {
            "params": 4,
"url": "https://huggingface.co/Qwen/Qwen3-4B"
        },
        "hf-inference-providers/allenai/Olmo-3-7B-Instruct": {
            "params": 7,
"url": "https://huggingface.co/allenai/OLMo-2-0325-32B-Instruct"
        },
    }
df_raw["param_size_b"] = df_raw["model"].apply(lambda x: model_info.get(x, {}).get("params"))
df_raw["model_url"] = df_raw["model"].apply(lambda x: model_info.get(x, {}).get("url", ""))
df_raw
return df_raw, get_task_category, model_info


@app.cell
def _(alt, df_raw, mo):
    def make_task_content(task_name):
        """Generate the complete results view for a task."""
        df = df_raw[df_raw["task_category"] == task_name].copy()

        # Calculate summary stats
        vlm_avg = df[df["approach"] == "VLM"]["accuracy"].mean()
        text_avg = df[df["approach"] == "Text"]["accuracy"].mean()
        diff = vlm_avg - text_avg

        task_desc = (
            "book titles"
            if task_name == "Title Extraction"
            else "full metadata (title, subtitle, publisher, year, ISBN)"
        )

        # Results summary
        results_md = mo.md(
            f"""
        ### Summary

        | Approach | Average Accuracy |
        |----------|------------------|
        | **VLM (Vision)** | **{vlm_avg:.0f}%** |
        | Text Extraction | {text_avg:.0f}% |

        **VLM advantage: +{diff:.0f} percentage points**

        VLMs {'significantly ' if diff > 15 else ''}outperform the text approach when extracting {task_desc}.
        """
        )

        # Scatter plot
        chart = alt.Chart(df).mark_circle(size=200, opacity=0.8).encode(
            x=alt.X(
                "param_size_b:Q",
                title="Parameters (Billions)",
                scale=alt.Scale(zero=False),
            ),
            y=alt.Y(
                "accuracy:Q",
                title="Accuracy (%)",
                scale=alt.Scale(domain=[50, 105]),
            ),
            color=alt.Color(
                "approach:N",
                title="Approach",
                scale=alt.Scale(domain=["VLM", "Text"], range=["#1f77b4", "#ff7f0e"]),
            ),
            tooltip=[
                alt.Tooltip("model_short:N", title="Model"),
                alt.Tooltip("approach:N", title="Approach"),
                alt.Tooltip("param_size_b:Q", title="Params (B)"),
                alt.Tooltip("accuracy:Q", title="Accuracy", format=".1f"),
            ],
        ).properties(
            width=500,
            height=300,
            title="Model Size vs Accuracy",
        ).configure_axis(
            labelFontSize=12,
            titleFontSize=14,
        )

        # Leaderboard
        leaderboard_md = (
            "### Model Leaderboard\n\n"
            "| Model | Approach | Params (B) | Accuracy (%) |\n"
            "|-------|----------|------------|--------------|\n"
        )
        for _, row in df.sort_values("accuracy", ascending=False).iterrows():
            model_link = (
                f"[{row['model_short']}]({row['model_url']})"
                if row["model_url"]
                else row["model_short"]
            )
            leaderboard_md += (
                f"| {model_link} | {row['approach']} "
                f"| {row['param_size_b']} | {row['accuracy']:.1f} |\n"
            )

        return mo.vstack([
            results_md,
            mo.md("### Model Size vs Accuracy"),
            mo.as_html(chart),
            mo.md("*Hover over points to see model details*"),
            mo.md(leaderboard_md),
        ])

    # Create tabs
    tabs = mo.ui.tabs({
        "Title Extraction": make_task_content("Title Extraction"),
        "Full Metadata": make_task_content("Full Metadata"),
    })
    tabs
    return make_task_content, tabs


@app.cell
def _(mo):
    mo.md(
        """
        ---

        ## Why VLMs Win

        Book covers are **visually structured** documents:

        - **Spatial layout**: Titles appear in predictable locations (usually top or center)
        - **Typography**: Larger text signals greater importance (the largest text is usually the title)
        - **Visual hierarchy**: Authors, publishers, and other details have distinct styling

        When you extract the text first (OCR), you **flatten this structure** into a linear sequence. For example, a cover might come back as `POLITICS OF MEMORY A Critical Study Jane Doe University Press`, with nothing marking where the title ends and the subtitle or author begins. The model loses the visual cues that make it obvious what is a title, a subtitle, or an author name.

        **Interesting finding**: Qwen3-VL-8B achieves 94% even when used as a text-only model, suggesting it has strong general text understanding. But it still does better (98%) when given the actual images.
        """
    )
    return


@app.cell
def _(mo):
    mo.md(
        """
        ## The Dataset

        We use the [DOAB Metadata Extraction](https://huggingface.co/datasets/biglam/doab-metadata-extraction) dataset: academic book covers from the Directory of Open Access Books.

        Each sample has:

        - Cover image (rendered from PDF)
        - Pre-extracted page text
        - Ground-truth metadata (title, subtitle, publisher, year, ISBN)
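
        To browse the data yourself, the standard `datasets` API works (a minimal sketch; the split name matches the viewer below):

        ```python
        from datasets import load_dataset

        # Load the DOAB metadata-extraction dataset from the Hugging Face Hub
        ds = load_dataset("biglam/doab-metadata-extraction", split="train")
        print(ds)            # shows the available columns
        print(ds[0].keys())  # inspect one sample's fields
        ```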
"""
)
return


@app.cell
def _(mo):
    mo.Html(
        """
        <iframe
            src="https://huggingface.co/datasets/biglam/doab-metadata-extraction/embed/viewer/default/train"
            frameborder="0"
            width="100%"
            height="400px"
        ></iframe>
        """
    )
    return


@app.cell
def _(mo):
    mo.md(
        """
        ## Methodology

        **Evaluation Framework**: [Inspect AI](https://inspect.aisi.org.uk/), an open-source framework for evaluating language models

        **Sample Size**: 50 books, randomly sampled with a fixed seed for reproducibility
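
        A minimal sketch of that sampling step, using the `datasets` library (the seed value here is illustrative, not necessarily the one used):

        ```python
        from datasets import load_dataset

        ds = load_dataset("biglam/doab-metadata-extraction", split="train")
        # Shuffle with a fixed seed, then keep the first 50 books
        sample = ds.shuffle(seed=42).select(range(50))
        ```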

        **Scoring Methods**:

        - *Title Extraction*: custom flexible-matching scorer (sketched after this list)
            - Case-insensitive comparison
            - Accepts the answer if the ground-truth title is a substring of the prediction (handles appended subtitles)
            - More robust than exact match for this task
        - *Full Metadata*: LLM-as-judge with partial credit
            - Correct (1.0): Title + year + at least one other field
            - Partial (0.5): Some fields correct
            - Incorrect (0.0): Mostly wrong
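
        A minimal sketch of what such a flexible scorer can look like in Inspect AI (not the exact implementation used for these runs):

        ```python
        from inspect_ai.scorer import CORRECT, INCORRECT, Score, Target, accuracy, scorer
        from inspect_ai.solver import TaskState


        @scorer(metrics=[accuracy()])
        def flexible_title_match():
            async def score(state: TaskState, target: Target) -> Score:
                prediction = state.output.completion.strip().lower()
                truth = target.text.strip().lower()
                # Case-insensitive substring check: a prediction that appends
                # a subtitle to the correct title still counts as correct.
                return Score(
                    value=CORRECT if truth in prediction else INCORRECT,
                    answer=state.output.completion,
                )

            return score
        ```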

        **Models via**: [HuggingFace Inference Providers](https://huggingface.co/docs/inference-providers)

        ---

        ## Replicate This

        The evaluation logs are stored on HuggingFace and can be loaded directly:

        ```python
        from inspect_ai.analysis import evals_df

        df = evals_df("hf://datasets/davanstrien/doab-title-extraction-evals")
        ```
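
        From there, the headline comparison in this notebook reduces to a small groupby (continuing the snippet above; the column names match those used in this notebook's own code):

        ```python
        # Label each run by approach, then compare mean headline scores (as %)
        df["approach"] = df["task_name"].apply(lambda x: "VLM" if "vlm" in x else "Text")
        print(df.groupby("approach")["score_headline_value"].mean() * 100)
        ```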

        ---

        *Built with [Marimo](https://marimo.io) | Evaluation framework: [Inspect AI](https://inspect.aisi.org.uk/) | Dataset: [biglam/doab-metadata-extraction](https://huggingface.co/datasets/biglam/doab-metadata-extraction)*
        """
    )
    return


if __name__ == "__main__":
    app.run()