import marimo

__generated_with = "0.10.9"
app = marimo.App(width="medium")


@app.cell
def _():
    import marimo as mo
    return (mo,)


@app.cell
def _(mo):
    mo.md(
        """
        # VLM vs Text: Extracting Metadata from Book Covers

        **The Task**: Libraries and archives have millions of digitized book covers with incomplete or missing metadata. Can we use AI to automatically extract titles and other metadata?

        **The Question**: Should we use Vision-Language Models (VLMs) that "see" the cover image, or extract the text first and send it to a standard LLM?

        **The Answer**: VLMs win decisively for this task.

        ---

        This evaluation uses the [DOAB (Directory of Open Access Books)](https://huggingface.co/datasets/biglam/doab-metadata-extraction) dataset of academic book covers. We compare two approaches:

        | Approach | How it works |
        |----------|--------------|
        | **VLM**  | Send the cover image directly to a Vision-Language Model |
        | **Text** | Extract text from the image first (OCR), then send it to an LLM |
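
        As a rough sketch of how the two conditions differ, using Inspect AI's chat content types (the prompt wording and file name here are illustrative, not the exact ones used in the evals):

        ```python
        from inspect_ai.model import ChatMessageUser, ContentImage, ContentText

        prompt = "Extract the title of this book."      # illustrative prompt
        extracted_text = "OCR output for the cover..."  # stand-in page text

        # VLM condition: the model sees the cover image itself
        vlm_input = [
            ChatMessageUser(
                content=[ContentImage(image="cover.png"), ContentText(text=prompt)]
            )
        ]

        # Text condition: the model sees only the pre-extracted page text
        text_input = [
            ChatMessageUser(
                content=[ContentText(text=prompt), ContentText(text=extracted_text)]
            )
        ]
        ```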

        ---

        ## Evaluation Results

        Select a task below to see how different models performed:
        """
    )
    return


@app.cell
def _():
    import pandas as pd
    import altair as alt
    from inspect_ai.analysis import evals_df
    return alt, evals_df, pd


@app.cell
def _(evals_df, mo):
    # Load evaluation results with persistent caching
    with mo.persistent_cache(name="doab_evals"):
        df_raw = evals_df(
            "hf://datasets/davanstrien/doab-title-extraction-evals", quiet=True
        )

    # Add metadata columns: task names containing "vlm" are the vision runs
    df_raw["approach"] = df_raw["task_name"].apply(
        lambda x: "VLM" if "vlm" in x else "Text"
    )
    df_raw["model_short"] = df_raw["model"].apply(lambda x: x.split("/")[-1])

    # Determine task category from the task name
    def get_task_category(task_name):
        if "llm_judge" in task_name:
            return "Full Metadata"
        return "Title Extraction"

    df_raw["task_category"] = df_raw["task_name"].apply(get_task_category)

    # Convert the headline score to a percentage
    df_raw["accuracy"] = df_raw["score_headline_value"] * 100

    # Parameter sizes and model card URLs
    model_info = {
        "hf-inference-providers/Qwen/Qwen3-VL-8B-Instruct": {
            "params": 8,
            "url": "https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct",
        },
        "hf-inference-providers/Qwen/Qwen3-VL-30B-A3B-Thinking": {
            "params": 30,
"url": "https://huggingface.co/Qwen/Qwen3-VL-30B-A3B"
        },
        "hf-inference-providers/zai-org/GLM-4.6V-Flash": {
            "params": 9,
            "url": "https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking",
        },
        "hf-inference-providers/openai/gpt-oss-20b": {
            "params": 20,
            "url": "https://huggingface.co/openai/gpt-oss-20b",
        },
        "hf-inference-providers/Qwen/Qwen3-4B-Instruct-2507": {
            "params": 4,
"url": "https://huggingface.co/Qwen/Qwen3-4B"
        },
        "hf-inference-providers/allenai/Olmo-3-7B-Instruct": {
            "params": 7,
"url": "https://huggingface.co/allenai/OLMo-2-0325-32B-Instruct"
        },
    }
df_raw["param_size_b"] = df_raw["model"].apply(lambda x: model_info.get(x, {}).get("params"))
df_raw["model_url"] = df_raw["model"].apply(lambda x: model_info.get(x, {}).get("url", ""))
df_raw
return df_raw, get_task_category, model_info


@app.cell
def _(alt, df_raw, mo):
    def make_task_content(task_name):
        """Generate the complete results view for a task."""
        df = df_raw[df_raw["task_category"] == task_name].copy()

        # Calculate summary stats
        vlm_avg = df[df["approach"] == "VLM"]["accuracy"].mean()
        text_avg = df[df["approach"] == "Text"]["accuracy"].mean()
        diff = vlm_avg - text_avg

        task_desc = (
            "book titles"
            if task_name == "Title Extraction"
            else "full metadata (title, subtitle, publisher, year, ISBN)"
        )

        # Results summary
        results_md = mo.md(
            f"""
        ### Summary

        | Approach | Average Accuracy |
        |----------|------------------|
        | **VLM (Vision)** | **{vlm_avg:.0f}%** |
        | Text Extraction | {text_avg:.0f}% |

        **VLM advantage: +{diff:.0f} percentage points**

        VLMs {'significantly ' if diff > 15 else ''}outperform the text approach when extracting {task_desc}.
        """
        )

        # Scatter plot
        chart = alt.Chart(df).mark_circle(size=200, opacity=0.8).encode(
            x=alt.X(
                "param_size_b:Q",
                title="Parameters (Billions)",
                scale=alt.Scale(zero=False),
            ),
            y=alt.Y(
                "accuracy:Q",
                title="Accuracy (%)",
                scale=alt.Scale(domain=[50, 105]),
            ),
            color=alt.Color(
                "approach:N",
                title="Approach",
                scale=alt.Scale(domain=["VLM", "Text"], range=["#1f77b4", "#ff7f0e"]),
            ),
            tooltip=[
                alt.Tooltip("model_short:N", title="Model"),
                alt.Tooltip("approach:N", title="Approach"),
                alt.Tooltip("param_size_b:Q", title="Params (B)"),
                alt.Tooltip("accuracy:Q", title="Accuracy", format=".1f"),
            ],
        ).properties(
            width=500,
            height=300,
            title="Model Size vs Accuracy",
        ).configure_axis(
            labelFontSize=12,
            titleFontSize=14,
        )

        # Leaderboard
        leaderboard_md = (
            "### Model Leaderboard\n\n"
            "| Model | Approach | Params (B) | Accuracy (%) |\n"
            "|-------|----------|------------|--------------|\n"
        )
        for _, row in df.sort_values("accuracy", ascending=False).iterrows():
            model_link = (
                f"[{row['model_short']}]({row['model_url']})"
                if row["model_url"]
                else row["model_short"]
            )
            leaderboard_md += (
                f"| {model_link} | {row['approach']} "
                f"| {row['param_size_b']} | {row['accuracy']:.1f} |\n"
            )

        return mo.vstack([
            results_md,
            mo.md("### Model Size vs Accuracy"),
            mo.as_html(chart),
            mo.md("*Hover over points to see model details*"),
            mo.md(leaderboard_md),
        ])

    # Create tabs
    tabs = mo.ui.tabs({
        "Title Extraction": make_task_content("Title Extraction"),
        "Full Metadata": make_task_content("Full Metadata"),
    })
    tabs
    return make_task_content, tabs


@app.cell
def _(mo):
    mo.md(
        """
        ---

        ## Why VLMs Win

        Book covers are **visually structured** documents:

        - **Spatial layout**: Titles appear in predictable locations (usually top or center)
        - **Typography**: Larger text signals greater importance (the largest text is usually the title)
        - **Visual hierarchy**: Authors, publishers, and other details have distinct styling

        When you extract the text first (OCR), you **flatten this structure** into a linear sequence. For example, a cover might come back as `POLITICS OF MEMORY A Critical Study Jane Doe University Press`, with nothing marking where the title ends and the subtitle or author begins. The model loses the visual cues that make it obvious what is a title, a subtitle, or an author name.

        **Interesting finding**: Qwen3-VL-8B achieves 94% even when used as a text-only model, suggesting it has strong general text understanding. But it still does better (98%) when given the actual images.
        """
    )
    return


@app.cell
def _(mo):
    mo.md(
        """
        ## The Dataset

        We use the [DOAB Metadata Extraction](https://huggingface.co/datasets/biglam/doab-metadata-extraction) dataset: academic book covers from the Directory of Open Access Books.

        Each sample has:

        - Cover image (rendered from PDF)
        - Pre-extracted page text
        - Ground-truth metadata (title, subtitle, publisher, year, ISBN)
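
        To browse the data yourself, the standard `datasets` API works (a minimal sketch; the split name matches the viewer below):

        ```python
        from datasets import load_dataset

        # Load the DOAB metadata-extraction dataset from the Hugging Face Hub
        ds = load_dataset("biglam/doab-metadata-extraction", split="train")
        print(ds)            # shows the available columns
        print(ds[0].keys())  # inspect one sample's fields
        ```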
"""
)
return


@app.cell
def _(mo):
    mo.Html(
        """
        <iframe
            src="https://huggingface.co/datasets/biglam/doab-metadata-extraction/embed/viewer/default/train"
            frameborder="0"
            width="100%"
            height="400px"
        ></iframe>
        """
    )
    return


@app.cell
def _(mo):
    mo.md(
        """
        ## Methodology

        **Evaluation Framework**: [Inspect AI](https://inspect.aisi.org.uk/), an open-source framework for evaluating language models

        **Sample Size**: 50 books, randomly sampled with a fixed seed for reproducibility
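
        A minimal sketch of that sampling step, using the `datasets` library (the seed value here is illustrative, not necessarily the one used):

        ```python
        from datasets import load_dataset

        ds = load_dataset("biglam/doab-metadata-extraction", split="train")
        # Shuffle with a fixed seed, then keep the first 50 books
        sample = ds.shuffle(seed=42).select(range(50))
        ```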

        **Scoring Methods**:

        - *Title Extraction*: custom flexible-matching scorer (sketched after this list)
            - Case-insensitive comparison
            - Accepts the answer if the ground-truth title is a substring of the prediction (handles appended subtitles)
            - More robust than exact match for this task
        - *Full Metadata*: LLM-as-judge with partial credit
            - Correct (1.0): Title + year + at least one other field
            - Partial (0.5): Some fields correct
            - Incorrect (0.0): Mostly wrong
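
        A minimal sketch of what such a flexible scorer can look like in Inspect AI (not the exact implementation used for these runs):

        ```python
        from inspect_ai.scorer import CORRECT, INCORRECT, Score, Target, accuracy, scorer
        from inspect_ai.solver import TaskState


        @scorer(metrics=[accuracy()])
        def flexible_title_match():
            async def score(state: TaskState, target: Target) -> Score:
                prediction = state.output.completion.strip().lower()
                truth = target.text.strip().lower()
                # Case-insensitive substring check: a prediction that appends
                # a subtitle to the correct title still counts as correct.
                return Score(
                    value=CORRECT if truth in prediction else INCORRECT,
                    answer=state.output.completion,
                )

            return score
        ```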

        **Models via**: [HuggingFace Inference Providers](https://huggingface.co/docs/inference-providers)

        ---

        ## Replicate This

        The evaluation logs are stored on HuggingFace and can be loaded directly:

        ```python
        from inspect_ai.analysis import evals_df

        df = evals_df("hf://datasets/davanstrien/doab-title-extraction-evals")
        ```
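
        From there, the headline comparison in this notebook reduces to a small groupby (continuing the snippet above; the column names match those used in this notebook's own code):

        ```python
        # Label each run by approach, then compare mean headline scores (as %)
        df["approach"] = df["task_name"].apply(lambda x: "VLM" if "vlm" in x else "Text")
        print(df.groupby("approach")["score_headline_value"].mean() * 100)
        ```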

        ---

        *Built with [Marimo](https://marimo.io) | Evaluation framework: [Inspect AI](https://inspect.aisi.org.uk/) | Dataset: [biglam/doab-metadata-extraction](https://huggingface.co/datasets/biglam/doab-metadata-extraction)*
        """
    )
    return


if __name__ == "__main__":
    app.run()