snake11235 committed on
Commit a8f90b0 · 1 Parent(s): 2bf7e8d

feat: refactor code organization and add shared logging utilities


Extract common constants and utilities into shared modules for better code organization. Add a generic model-response logging function to reduce code duplication across backends; a sketch of the resulting call pattern follows the change list below. Move the olmOCR functionality to a dedicated module and add an image conversion utility.

- Add common.py with shared constants (OPENAI_PRICING, MODEL_GEMINI, MODEL_OLMOCR)
- Add image_utils.py with the _pil_image_to_base64_jpeg() helper
- Add _log_model_response() to logging_helper.py
- Add olm_ocr.py module that runs olmOCR through a Hugging Face inference endpoint
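
The duplicated logging tail that this removes is replaced by one shared call shape; a minimal sketch of the pattern, assembled from the diffs below with placeholder values:

import time
from logging_helper import _log_model_response
from common import OPENAI_PRICING

start = time.perf_counter()
content = "col1,col2\na,b"  # placeholder for the model's reply
duration = time.perf_counter() - start

# Real backends pass completion.usage / response.usage; with usage=None the
# helper logs only the model name and timing and returns None.
_log_model_response(
    model_name="gpt-4o",  # olm_ocr.py passes its MODEL_ID here
    content=content,
    duration=duration,
    usage=None,
    pricing=OPENAI_PRICING,
)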

Files changed (7)
  1. app.py +4 -65
  2. common.py +53 -0
  3. image_utils.py +11 -0
  4. logging_helper.py +41 -0
  5. olm_ocr.py +62 -0
  6. openai_backend.py +13 -81
  7. requirements.txt +2 -1
app.py CHANGED
@@ -5,7 +5,7 @@ load_dotenv()
 
 import base64
 from io import BytesIO
-from typing import Tuple, Optional
+from typing import Optional
 import time
 
 import gradio as gr
@@ -13,22 +13,18 @@ from PIL import Image
 
 from olmocr.data.renderpdf import render_pdf_to_base64png
 
-from openai_backend import OPENAI_PRICING, _run_openai_vision
+from openai_backend import _run_openai_vision
+from common import OPENAI_PRICING, MODEL_GEMINI, MODEL_OLMOCR
 from logging_helper import log as _log, log_debug as _log_debug
+from olm_ocr import _run_olmocr
 
 try:
     import google.generativeai as genai
 except ImportError:  # pragma: no cover
     genai = None  # type: ignore
 
-import torch
-from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
-
-
 APP_TITLE = "words2doc"
 APP_DESCRIPTION = "Upload a PDF or image with (handwritten) text and convert it to CSV using different LLM backends."
-MODEL_GEMINI = "Gemini 3 Pro"
-MODEL_OLMOCR = "olmOCR-2-7B-1025-FP8"
 
 
 # -------- Utility helpers -------- #
@@ -101,63 +97,6 @@ def _run_gemini_vision(image: Image.Image, prompt: str) -> str:
     return response.text or ""
 
 
-_olmocr_model: Optional[Qwen2_5_VLForConditionalGeneration] = None
-_olmocr_processor: Optional[AutoProcessor] = None
-
-
-def _ensure_olmocr_loaded() -> Tuple[Qwen2_5_VLForConditionalGeneration, AutoProcessor]:
-    global _olmocr_model, _olmocr_processor
-
-    if _olmocr_model is None or _olmocr_processor is None:
-        _olmocr_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-            "allenai/olmOCR-2-7B-1025-FP8", device_map="auto"
-        ).eval()
-        _olmocr_processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
-
-    return _olmocr_model, _olmocr_processor
-
-
-def _run_olmocr(image: Image.Image, prompt: str) -> str:
-    model, processor = _ensure_olmocr_loaded()
-
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model.to(device)
-
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "text", "text": prompt},
-                {"type": "image", "image": image},
-            ],
-        }
-    ]
-
-    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-    inputs = processor(
-        text=[text],
-        images=[image],
-        padding=True,
-        return_tensors="pt",
-    )
-    inputs = {key: value.to(device) for (key, value) in inputs.items()}
-
-    output = model.generate(
-        **inputs,
-        temperature=0.1,
-        max_new_tokens=1024,
-        num_return_sequences=1,
-        do_sample=True,
-    )
-
-    prompt_length = inputs["input_ids"].shape[1]
-    new_tokens = output[:, prompt_length:]
-
-    text_output = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
-    return text_output[0] if text_output else ""
-
-
 # -------- Main processing function -------- #
common.py ADDED
@@ -0,0 +1,53 @@
+MODEL_GEMINI = "Gemini 3 Pro"
+MODEL_OLMOCR = "olmOCR-2-7B-1025-FP8"
+
+
+OPENAI_PRICING = {
+    # GPT-5.2 family
+    "gpt-5.2": {"input": 1.75, "output": 14.00},
+    "gpt-5.2-chat-latest": {"input": 1.75, "output": 14.00},
+    "gpt-5.2-pro": {"input": 21.00, "output": 168.00},
+
+    # GPT-5.1 / GPT-5 family
+    "gpt-5.1": {"input": 1.25, "output": 10.00},
+    "gpt-5": {"input": 1.25, "output": 10.00},
+    "gpt-5-mini": {"input": 0.25, "output": 2.00},
+    "gpt-5-nano": {"input": 0.05, "output": 0.40},
+    "gpt-5.1-chat-latest": {"input": 1.25, "output": 10.00},
+    "gpt-5-chat-latest": {"input": 1.25, "output": 10.00},
+    "gpt-5.1-codex-max": {"input": 1.25, "output": 10.00},
+    "gpt-5.1-codex": {"input": 1.25, "output": 10.00},
+    "gpt-5-codex": {"input": 1.25, "output": 10.00},
+    "gpt-5.1-codex-mini": {"input": 0.25, "output": 2.00},
+    "gpt-5-pro": {"input": 15.00, "output": 120.00},
+    "gpt-5-search-api": {"input": 1.25, "output": 10.00},
+
+    # GPT-4.1 family
+    "gpt-4.1": {"input": 2.00, "output": 8.00},
+    "gpt-4.1-mini": {"input": 0.40, "output": 1.60},
+    "gpt-4.1-nano": {"input": 0.10, "output": 0.40},
+
+    # GPT-4o family
+    "gpt-4o": {"input": 2.50, "output": 10.00},
+    "gpt-4o-2024-05-13": {"input": 5.00, "output": 15.00},
+    "gpt-4o-mini": {"input": 0.15, "output": 0.60},
+    "chatgpt-4o-latest": {"input": 5.00, "output": 15.00},
+
+    # GPT-4 Turbo / GPT-4 legacy family (from legacy models table)
+    "gpt-4-turbo": {"input": 10.00, "output": 30.00},
+    "gpt-4-turbo-2024-04-09": {"input": 10.00, "output": 30.00},
+    "gpt-4-0125-preview": {"input": 10.00, "output": 30.00},
+    "gpt-4-1106-preview": {"input": 10.00, "output": 30.00},
+    "gpt-4-1106-vision-preview": {"input": 10.00, "output": 30.00},
+    "gpt-4-0613": {"input": 30.00, "output": 60.00},
+    "gpt-4-0314": {"input": 30.00, "output": 60.00},
+    "gpt-4": {"input": 30.00, "output": 60.00},
+    "gpt-4-32k": {"input": 60.00, "output": 120.00},
+
+    # Default
+    "default": {"input": 2.50, "output": 10.00},
+
+    # Other backends (mock rates)
+    MODEL_GEMINI: {"input": 1.00, "output": 1.00},
+    MODEL_OLMOCR: {"input": 1.35, "output": 0.30},
+}
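
The rates read as USD per 1M tokens, matching the / 1_000_000 arithmetic in logging_helper.py below; unknown model names fall back to the "default" row. A small lookup sketch (the model name is hypothetical):

from common import OPENAI_PRICING

row = OPENAI_PRICING.get("gpt-99-turbo", OPENAI_PRICING["default"])  # unknown name -> default row
cost = (120_000 / 1_000_000) * row["input"] + (4_000 / 1_000_000) * row["output"]
print(f"${cost:.4f}")  # $0.3400 at the default 2.50/10.00 rates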
image_utils.py ADDED
@@ -0,0 +1,11 @@
+import base64
+from io import BytesIO
+
+from PIL import Image
+
+
+def _pil_image_to_base64_jpeg(image: Image.Image) -> str:
+    """Encode a PIL Image as base64 JPEG string (without data: URL prefix)."""
+    buffered = BytesIO()
+    image.save(buffered, format="JPEG")
+    return base64.b64encode(buffered.getvalue()).decode("utf-8")
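
A quick usage sketch (the filename is a placeholder). One hedge: Pillow's JPEG encoder rejects images with an alpha channel, so the convert("RGB") here is an assumption about the inputs, not something the helper enforces:

from PIL import Image
from image_utils import _pil_image_to_base64_jpeg

img = Image.open("page.png").convert("RGB")  # JPEG cannot carry alpha; convert first
b64 = _pil_image_to_base64_jpeg(img)
data_url = f"data:image/jpeg;base64,{b64}"  # callers add the data: prefix, as olm_ocr.py does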
logging_helper.py CHANGED
@@ -1,4 +1,5 @@
 import os
+from typing import Any, Mapping, Optional
 
 
 def log(message: str) -> None:
@@ -8,3 +9,43 @@ def log(message: str) -> None:
 def log_debug(message: str) -> None:
     if os.getenv("WORDS2CSV_DEBUG"):
         print(f"[WORDS2CSV-DEBUG] {message}")
+
+
+def _log_model_response(
+    *,
+    model_name: str,
+    content: str,
+    duration: float,
+    usage: Optional[Any] = None,
+    pricing: Optional[Mapping[str, Mapping[str, float]]] = None,
+    default_pricing_key: str = "default",
+) -> Optional[float]:
+    """Log model usage, cost (if pricing and usage are provided), and response details.
+
+    Returns the calculated cost if pricing and usage are provided, otherwise None.
+    """
+    cost: Optional[float] = None
+
+    if usage is not None and pricing is not None:
+        pricing_row = pricing.get(model_name, pricing[default_pricing_key])
+        input_cost = (usage.prompt_tokens / 1_000_000) * pricing_row["input"]
+        output_cost = (usage.completion_tokens / 1_000_000) * pricing_row["output"]
+        cost = input_cost + output_cost
+
+        log(f"Model: {model_name}")
+        log(
+            "Token usage: Input={usage.prompt_tokens}, "
+            "Output={usage.completion_tokens}, Total={usage.total_tokens}".format(usage=usage)
+        )
+        log(f"Estimated cost: ${cost:.6f}")
+        log(f"Execution time: {duration:.3f} seconds")
+    else:
+        log(f"Model: {model_name}")
+        log(f"Execution time: {duration:.3f} seconds")
+
+    log("Model response received")
+    log_debug(f"Response length: {len(content)} characters")
+    log_debug(f"Result: {content}")
+    log_debug("End of result")
+
+    return cost
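
The usage parameter is duck-typed: any object exposing prompt_tokens, completion_tokens, and total_tokens works, which is what makes the helper shareable across backends. A self-contained sketch with a stand-in usage object:

from types import SimpleNamespace
from logging_helper import _log_model_response
from common import OPENAI_PRICING

# Stand-in for an OpenAI-style usage object; only these three attributes are read.
usage = SimpleNamespace(prompt_tokens=1200, completion_tokens=300, total_tokens=1500)

cost = _log_model_response(
    model_name="gpt-4o-mini",
    content="col1,col2\na,b",
    duration=1.234,
    usage=usage,
    pricing=OPENAI_PRICING,
)
# cost == 1200/1e6 * 0.15 + 300/1e6 * 0.60 == 0.00036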
olm_ocr.py ADDED
@@ -0,0 +1,62 @@
+import os
+import time
+from typing import Optional
+from PIL import Image
+from huggingface_hub import InferenceClient
+from image_utils import _pil_image_to_base64_jpeg
+from logging_helper import _log_model_response
+from common import OPENAI_PRICING
+
+
+MODEL_ID = "allenai/olmOCR-2-7B-1025-FP8"
+HF_ENDPOINT_URL = "https://wsy54j97qbvg7mua.us-east-1.aws.endpoints.huggingface.cloud"
+
+
+def _build_messages(image_base64: str, prompt: str):
+    return [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
+                },
+            ],
+        }
+    ]
+
+
+def _run_olmocr(image: Image.Image, prompt: str) -> str:
+    image_base64 = _pil_image_to_base64_jpeg(image)
+    messages = _build_messages(image_base64, prompt)
+
+    hf_token: Optional[str] = os.getenv("HF_TOKEN")
+
+    client = InferenceClient(
+        base_url=HF_ENDPOINT_URL,
+        token=hf_token,
+    )
+
+    start_time = time.perf_counter()
+
+    completion = client.chat.completions.create(
+        model=MODEL_ID,
+        messages=messages,
+        max_tokens=512,
+        temperature=0.1,
+    )
+
+    duration = time.perf_counter() - start_time
+
+    content = str(completion.choices[0].message.content)
+
+    _log_model_response(
+        model_name=MODEL_ID,
+        content=content,
+        duration=duration,
+        usage=completion.usage,
+        pricing=OPENAI_PRICING,
+    )
+
+    return content
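
Calling the backend is then a one-liner; a hedged sketch (assumes HF_TOKEN is exported, the dedicated endpoint above is running, and the filename and prompt are placeholders):

from PIL import Image
from olm_ocr import _run_olmocr

page = Image.open("scan.jpg").convert("RGB")
csv_text = _run_olmocr(page, "Transcribe the handwritten table on this page as CSV.")
print(csv_text)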
openai_backend.py CHANGED
@@ -2,77 +2,17 @@ import os
 import base64
 import time
 from io import BytesIO
-
 from typing import Optional
-
 from PIL import Image
-
-from logging_helper import log as _log, log_debug as _log_debug
+from logging_helper import log as _log, log_debug as _log_debug, _log_model_response
+from image_utils import _pil_image_to_base64_jpeg
+from common import OPENAI_PRICING
 
 try:
     from openai import OpenAI
 except ImportError:  # pragma: no cover
     OpenAI = None  # type: ignore
 
-
-OPENAI_PRICING = {
-    # GPT-5.2 family
-    "gpt-5.2": {"input": 1.75, "output": 14.00},
-    "gpt-5.2-chat-latest": {"input": 1.75, "output": 14.00},
-    "gpt-5.2-pro": {"input": 21.00, "output": 168.00},
-
-    # GPT-5.1 / GPT-5 family
-    "gpt-5.1": {"input": 1.25, "output": 10.00},
-    "gpt-5": {"input": 1.25, "output": 10.00},
-    "gpt-5-mini": {"input": 0.25, "output": 2.00},
-    "gpt-5-nano": {"input": 0.05, "output": 0.40},
-    "gpt-5.1-chat-latest": {"input": 1.25, "output": 10.00},
-    "gpt-5-chat-latest": {"input": 1.25, "output": 10.00},
-    "gpt-5.1-codex-max": {"input": 1.25, "output": 10.00},
-    "gpt-5.1-codex": {"input": 1.25, "output": 10.00},
-    "gpt-5-codex": {"input": 1.25, "output": 10.00},
-    "gpt-5.1-codex-mini": {"input": 0.25, "output": 2.00},
-    "gpt-5-pro": {"input": 15.00, "output": 120.00},
-    "gpt-5-search-api": {"input": 1.25, "output": 10.00},
-
-    # GPT-4.1 family
-    "gpt-4.1": {"input": 2.00, "output": 8.00},
-    "gpt-4.1-mini": {"input": 0.40, "output": 1.60},
-    "gpt-4.1-nano": {"input": 0.10, "output": 0.40},
-
-    # GPT-4o family
-    "gpt-4o": {"input": 2.50, "output": 10.00},
-    "gpt-4o-2024-05-13": {"input": 5.00, "output": 15.00},
-    "gpt-4o-mini": {"input": 0.15, "output": 0.60},
-    "chatgpt-4o-latest": {"input": 5.00, "output": 15.00},
-
-    # GPT-4 Turbo / GPT-4 legacy family (from legacy models table)
-    "gpt-4-turbo": {"input": 10.00, "output": 30.00},
-    "gpt-4-turbo-2024-04-09": {"input": 10.00, "output": 30.00},
-    "gpt-4-0125-preview": {"input": 10.00, "output": 30.00},
-    "gpt-4-1106-preview": {"input": 10.00, "output": 30.00},
-    "gpt-4-1106-vision-preview": {"input": 10.00, "output": 30.00},
-    "gpt-4-0613": {"input": 30.00, "output": 60.00},
-    "gpt-4-0314": {"input": 30.00, "output": 60.00},
-    "gpt-4": {"input": 30.00, "output": 60.00},
-    "gpt-4-32k": {"input": 60.00, "output": 120.00},
-
-    # Default
-    "default": {"input": 2.50, "output": 10.00},
-}
-
-
-def _calculate_openai_cost(usage, model_name: str) -> float:
-    """Calculate cost based on token usage and model pricing (per 1M tokens)."""
-    if not usage:
-        return 0.0
-
-    pricing = OPENAI_PRICING.get(model_name, OPENAI_PRICING["default"])
-    input_cost = (usage.prompt_tokens / 1_000_000) * pricing["input"]
-    output_cost = (usage.completion_tokens / 1_000_000) * pricing["output"]
-    return input_cost + output_cost
-
-
 def _run_openai_vision(image: Image.Image, prompt: str, model_name: str) -> str:
     if OpenAI is None:
         raise RuntimeError("openai package is not installed. Please install it to use ChatGPT 5.2 backend.")
@@ -83,9 +23,7 @@ def _run_openai_vision(image: Image.Image, prompt: str, model_name: str) -> str:
 
     client = OpenAI(api_key=api_key)
 
-    buffered = BytesIO()
-    image.save(buffered, format="JPEG")
-    img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
+    img_b64 = _pil_image_to_base64_jpeg(image)
 
     _log_debug(f"Using OpenAI model: {model_name}")
     _log_debug(f"Input image size: {image.size}")
@@ -111,20 +49,14 @@ def _run_openai_vision(image: Image.Image, prompt: str, model_name: str) -> str:
 
     duration = time.perf_counter() - start_time
 
-    usage = response.usage
-    if usage:
-        cost = _calculate_openai_cost(usage, model_name)
-        _log(f"Model: {model_name}")
-        _log(
-            "Token usage: Input={usage.prompt_tokens}, "
-            "Output={usage.completion_tokens}, Total={usage.total_tokens}".format(usage=usage)
-        )
-        _log(f"Estimated cost: ${cost:.6f}")
-        _log(f"Execution time: {duration:.3f} seconds")
-
     content = response.choices[0].message.content or ""
-    _log("OpenAI vision response received")
-    _log_debug(f"Response length: {len(content)} characters")
-    _log_debug(f"Result: {content}")
-    _log_debug("End of result")
+
+    _log_model_response(
+        model_name=model_name,
+        content=content,
+        duration=duration,
+        usage=response.usage,
+        pricing=OPENAI_PRICING,
+    )
+
    return content
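
After the refactor both backends share the same (image, prompt) call shape and the same logging tail; a hedged sketch (assumes OPENAI_API_KEY is set and the filename and prompt are placeholders):

from PIL import Image
from openai_backend import _run_openai_vision

page = Image.open("scan.jpg").convert("RGB")
csv_text = _run_openai_vision(page, "Transcribe this page as CSV.", "gpt-4o-mini")
print(csv_text)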
requirements.txt CHANGED
@@ -6,4 +6,5 @@ torch>=2.2.0
 transformers>=4.42.0
 pillow>=10.3.0
 python-dotenv>=1.0.0
-compressed-tensors>=0.0.0
+compressed-tensors>=0.0.0
+accelerate>=0.22.0