snake11235 committed on
Commit a8f90b0 · 1 Parent(s): 2bf7e8d

feat: refactor code organization and add shared logging utilities


Extract common constants and utilities into shared modules for better code organization. Add a generic model-response logging function to reduce code duplication across backends; a sketch of the resulting call pattern follows the change list below. Move the olmOCR functionality to a dedicated module and add an image conversion utility.

- Add common.py with shared constants (OPENAI_PRICING, MODEL_GEMINI, MODEL_OLMOCR)
- Add image_utils.py with the _pil_image_to_base64_jpeg() helper
- Add _log_model_response() to logging_helper.py
- Add olm_ocr.py module that runs olmOCR through a Hugging Face inference endpoint
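
The duplicated logging tail that this removes is replaced by one shared call shape; a minimal sketch of the pattern, assembled from the diffs below with placeholder values:

import time
from logging_helper import _log_model_response
from common import OPENAI_PRICING

start = time.perf_counter()
content = "col1,col2\na,b"  # placeholder for the model's reply
duration = time.perf_counter() - start

# Real backends pass completion.usage / response.usage; with usage=None the
# helper logs only the model name and timing and returns None.
_log_model_response(
    model_name="gpt-4o",  # olm_ocr.py passes its MODEL_ID here
    content=content,
    duration=duration,
    usage=None,
    pricing=OPENAI_PRICING,
)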

Files changed (7)
  1. app.py +4 -65
  2. common.py +53 -0
  3. image_utils.py +11 -0
  4. logging_helper.py +41 -0
  5. olm_ocr.py +62 -0
  6. openai_backend.py +13 -81
  7. requirements.txt +2 -1
app.py CHANGED
@@ -5,7 +5,7 @@ load_dotenv()
 
 import base64
 from io import BytesIO
-from typing import Tuple, Optional
+from typing import Optional
 import time
 
 import gradio as gr
@@ -13,22 +13,18 @@ from PIL import Image
 
 from olmocr.data.renderpdf import render_pdf_to_base64png
 
-from openai_backend import OPENAI_PRICING, _run_openai_vision
+from openai_backend import _run_openai_vision
+from common import OPENAI_PRICING, MODEL_GEMINI, MODEL_OLMOCR
 from logging_helper import log as _log, log_debug as _log_debug
+from olm_ocr import _run_olmocr
 
 try:
     import google.generativeai as genai
 except ImportError:  # pragma: no cover
     genai = None  # type: ignore
 
-import torch
-from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
-
-
 APP_TITLE = "words2doc"
 APP_DESCRIPTION = "Upload a PDF or image with (handwritten) text and convert it to CSV using different LLM backends."
-MODEL_GEMINI = "Gemini 3 Pro"
-MODEL_OLMOCR = "olmOCR-2-7B-1025-FP8"
 
 
 # -------- Utility helpers -------- #
@@ -101,63 +97,6 @@ def _run_gemini_vision(image: Image.Image, prompt: str) -> str:
     return response.text or ""
 
 
-_olmocr_model: Optional[Qwen2_5_VLForConditionalGeneration] = None
-_olmocr_processor: Optional[AutoProcessor] = None
-
-
-def _ensure_olmocr_loaded() -> Tuple[Qwen2_5_VLForConditionalGeneration, AutoProcessor]:
-    global _olmocr_model, _olmocr_processor
-
-    if _olmocr_model is None or _olmocr_processor is None:
-        _olmocr_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-            "allenai/olmOCR-2-7B-1025-FP8", device_map="auto"
-        ).eval()
-        _olmocr_processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
-
-    return _olmocr_model, _olmocr_processor
-
-
-def _run_olmocr(image: Image.Image, prompt: str) -> str:
-    model, processor = _ensure_olmocr_loaded()
-
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model.to(device)
-
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "text", "text": prompt},
-                {"type": "image", "image": image},
-            ],
-        }
-    ]
-
-    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-    inputs = processor(
-        text=[text],
-        images=[image],
-        padding=True,
-        return_tensors="pt",
-    )
-    inputs = {key: value.to(device) for (key, value) in inputs.items()}
-
-    output = model.generate(
-        **inputs,
-        temperature=0.1,
-        max_new_tokens=1024,
-        num_return_sequences=1,
-        do_sample=True,
-    )
-
-    prompt_length = inputs["input_ids"].shape[1]
-    new_tokens = output[:, prompt_length:]
-
-    text_output = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
-    return text_output[0] if text_output else ""
-
-
 # -------- Main processing function -------- #
common.py ADDED
@@ -0,0 +1,53 @@
+MODEL_GEMINI = "Gemini 3 Pro"
+MODEL_OLMOCR = "olmOCR-2-7B-1025-FP8"
+
+
+OPENAI_PRICING = {
+    # GPT-5.2 family
+    "gpt-5.2": {"input": 1.75, "output": 14.00},
+    "gpt-5.2-chat-latest": {"input": 1.75, "output": 14.00},
+    "gpt-5.2-pro": {"input": 21.00, "output": 168.00},
+
+    # GPT-5.1 / GPT-5 family
+    "gpt-5.1": {"input": 1.25, "output": 10.00},
+    "gpt-5": {"input": 1.25, "output": 10.00},
+    "gpt-5-mini": {"input": 0.25, "output": 2.00},
+    "gpt-5-nano": {"input": 0.05, "output": 0.40},
+    "gpt-5.1-chat-latest": {"input": 1.25, "output": 10.00},
+    "gpt-5-chat-latest": {"input": 1.25, "output": 10.00},
+    "gpt-5.1-codex-max": {"input": 1.25, "output": 10.00},
+    "gpt-5.1-codex": {"input": 1.25, "output": 10.00},
+    "gpt-5-codex": {"input": 1.25, "output": 10.00},
+    "gpt-5.1-codex-mini": {"input": 0.25, "output": 2.00},
+    "gpt-5-pro": {"input": 15.00, "output": 120.00},
+    "gpt-5-search-api": {"input": 1.25, "output": 10.00},
+
+    # GPT-4.1 family
+    "gpt-4.1": {"input": 2.00, "output": 8.00},
+    "gpt-4.1-mini": {"input": 0.40, "output": 1.60},
+    "gpt-4.1-nano": {"input": 0.10, "output": 0.40},
+
+    # GPT-4o family
+    "gpt-4o": {"input": 2.50, "output": 10.00},
+    "gpt-4o-2024-05-13": {"input": 5.00, "output": 15.00},
+    "gpt-4o-mini": {"input": 0.15, "output": 0.60},
+    "chatgpt-4o-latest": {"input": 5.00, "output": 15.00},
+
+    # GPT-4 Turbo / GPT-4 legacy family (from legacy models table)
+    "gpt-4-turbo": {"input": 10.00, "output": 30.00},
+    "gpt-4-turbo-2024-04-09": {"input": 10.00, "output": 30.00},
+    "gpt-4-0125-preview": {"input": 10.00, "output": 30.00},
+    "gpt-4-1106-preview": {"input": 10.00, "output": 30.00},
+    "gpt-4-1106-vision-preview": {"input": 10.00, "output": 30.00},
+    "gpt-4-0613": {"input": 30.00, "output": 60.00},
+    "gpt-4-0314": {"input": 30.00, "output": 60.00},
+    "gpt-4": {"input": 30.00, "output": 60.00},
+    "gpt-4-32k": {"input": 60.00, "output": 120.00},
+
+    # Default
+    "default": {"input": 2.50, "output": 10.00},
+
+    # Other backends (mock rates)
+    MODEL_GEMINI: {"input": 1.00, "output": 1.00},
+    MODEL_OLMOCR: {"input": 1.35, "output": 0.30},
+}
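
The rates read as USD per 1M tokens, matching the / 1_000_000 arithmetic in logging_helper.py below; unknown model names fall back to the "default" row. A small lookup sketch (the model name is hypothetical):

from common import OPENAI_PRICING

row = OPENAI_PRICING.get("gpt-99-turbo", OPENAI_PRICING["default"])  # unknown name -> default row
cost = (120_000 / 1_000_000) * row["input"] + (4_000 / 1_000_000) * row["output"]
print(f"${cost:.4f}")  # $0.3400 at the default 2.50/10.00 rates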
image_utils.py ADDED
@@ -0,0 +1,11 @@
+import base64
+from io import BytesIO
+
+from PIL import Image
+
+
+def _pil_image_to_base64_jpeg(image: Image.Image) -> str:
+    """Encode a PIL Image as base64 JPEG string (without data: URL prefix)."""
+    buffered = BytesIO()
+    image.save(buffered, format="JPEG")
+    return base64.b64encode(buffered.getvalue()).decode("utf-8")
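
A quick usage sketch (the filename is a placeholder). One hedge: Pillow's JPEG encoder rejects images with an alpha channel, so the convert("RGB") here is an assumption about the inputs, not something the helper enforces:

from PIL import Image
from image_utils import _pil_image_to_base64_jpeg

img = Image.open("page.png").convert("RGB")  # JPEG cannot carry alpha; convert first
b64 = _pil_image_to_base64_jpeg(img)
data_url = f"data:image/jpeg;base64,{b64}"  # callers add the data: prefix, as olm_ocr.py does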
logging_helper.py CHANGED
@@ -1,4 +1,5 @@
 import os
+from typing import Any, Mapping, Optional
 
 
 def log(message: str) -> None:
@@ -8,3 +9,43 @@ def log(message: str) -> None:
 def log_debug(message: str) -> None:
     if os.getenv("WORDS2CSV_DEBUG"):
         print(f"[WORDS2CSV-DEBUG] {message}")
+
+
+def _log_model_response(
+    *,
+    model_name: str,
+    content: str,
+    duration: float,
+    usage: Optional[Any] = None,
+    pricing: Optional[Mapping[str, Mapping[str, float]]] = None,
+    default_pricing_key: str = "default",
+) -> Optional[float]:
+    """Log model usage, cost (if pricing and usage are provided), and response details.
+
+    Returns the calculated cost if pricing and usage are provided, otherwise None.
+    """
+    cost: Optional[float] = None
+
+    if usage is not None and pricing is not None:
+        pricing_row = pricing.get(model_name, pricing[default_pricing_key])
+        input_cost = (usage.prompt_tokens / 1_000_000) * pricing_row["input"]
+        output_cost = (usage.completion_tokens / 1_000_000) * pricing_row["output"]
+        cost = input_cost + output_cost
+
+        log(f"Model: {model_name}")
+        log(
+            "Token usage: Input={usage.prompt_tokens}, "
+            "Output={usage.completion_tokens}, Total={usage.total_tokens}".format(usage=usage)
+        )
+        log(f"Estimated cost: ${cost:.6f}")
+        log(f"Execution time: {duration:.3f} seconds")
+    else:
+        log(f"Model: {model_name}")
+        log(f"Execution time: {duration:.3f} seconds")
+
+    log("Model response received")
+    log_debug(f"Response length: {len(content)} characters")
+    log_debug(f"Result: {content}")
+    log_debug("End of result")
+
+    return cost
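
The usage parameter is duck-typed: any object exposing prompt_tokens, completion_tokens, and total_tokens works, which is what makes the helper shareable across backends. A self-contained sketch with a stand-in usage object:

from types import SimpleNamespace
from logging_helper import _log_model_response
from common import OPENAI_PRICING

# Stand-in for an OpenAI-style usage object; only these three attributes are read.
usage = SimpleNamespace(prompt_tokens=1200, completion_tokens=300, total_tokens=1500)

cost = _log_model_response(
    model_name="gpt-4o-mini",
    content="col1,col2\na,b",
    duration=1.234,
    usage=usage,
    pricing=OPENAI_PRICING,
)
# cost == 1200/1e6 * 0.15 + 300/1e6 * 0.60 == 0.00036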
olm_ocr.py ADDED
@@ -0,0 +1,62 @@
+import os
+import time
+from typing import Optional
+from PIL import Image
+from huggingface_hub import InferenceClient
+from image_utils import _pil_image_to_base64_jpeg
+from logging_helper import _log_model_response
+from common import OPENAI_PRICING
+
+
+MODEL_ID = "allenai/olmOCR-2-7B-1025-FP8"
+HF_ENDPOINT_URL = "https://wsy54j97qbvg7mua.us-east-1.aws.endpoints.huggingface.cloud"
+
+
+def _build_messages(image_base64: str, prompt: str):
+    return [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
+                },
+            ],
+        }
+    ]
+
+
+def _run_olmocr(image: Image.Image, prompt: str) -> str:
+    image_base64 = _pil_image_to_base64_jpeg(image)
+    messages = _build_messages(image_base64, prompt)
+
+    hf_token: Optional[str] = os.getenv("HF_TOKEN")
+
+    client = InferenceClient(
+        base_url=HF_ENDPOINT_URL,
+        token=hf_token,
+    )
+
+    start_time = time.perf_counter()
+
+    completion = client.chat.completions.create(
+        model=MODEL_ID,
+        messages=messages,
+        max_tokens=512,
+        temperature=0.1,
+    )
+
+    duration = time.perf_counter() - start_time
+
+    content = str(completion.choices[0].message.content)
+
+    _log_model_response(
+        model_name=MODEL_ID,
+        content=content,
+        duration=duration,
+        usage=completion.usage,
+        pricing=OPENAI_PRICING,
+    )
+
+    return content
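
Calling the backend is then a one-liner; a hedged sketch (assumes HF_TOKEN is exported, the dedicated endpoint above is running, and the filename and prompt are placeholders):

from PIL import Image
from olm_ocr import _run_olmocr

page = Image.open("scan.jpg").convert("RGB")
csv_text = _run_olmocr(page, "Transcribe the handwritten table on this page as CSV.")
print(csv_text)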
openai_backend.py CHANGED
@@ -2,77 +2,17 @@ import os
 import base64
 import time
 from io import BytesIO
-
 from typing import Optional
-
 from PIL import Image
-
-from logging_helper import log as _log, log_debug as _log_debug
+from logging_helper import log as _log, log_debug as _log_debug, _log_model_response
+from image_utils import _pil_image_to_base64_jpeg
+from common import OPENAI_PRICING
 
 try:
     from openai import OpenAI
 except ImportError:  # pragma: no cover
     OpenAI = None  # type: ignore
 
-
-OPENAI_PRICING = {
-    # GPT-5.2 family
-    "gpt-5.2": {"input": 1.75, "output": 14.00},
-    "gpt-5.2-chat-latest": {"input": 1.75, "output": 14.00},
-    "gpt-5.2-pro": {"input": 21.00, "output": 168.00},
-
-    # GPT-5.1 / GPT-5 family
-    "gpt-5.1": {"input": 1.25, "output": 10.00},
-    "gpt-5": {"input": 1.25, "output": 10.00},
-    "gpt-5-mini": {"input": 0.25, "output": 2.00},
-    "gpt-5-nano": {"input": 0.05, "output": 0.40},
-    "gpt-5.1-chat-latest": {"input": 1.25, "output": 10.00},
-    "gpt-5-chat-latest": {"input": 1.25, "output": 10.00},
-    "gpt-5.1-codex-max": {"input": 1.25, "output": 10.00},
-    "gpt-5.1-codex": {"input": 1.25, "output": 10.00},
-    "gpt-5-codex": {"input": 1.25, "output": 10.00},
-    "gpt-5.1-codex-mini": {"input": 0.25, "output": 2.00},
-    "gpt-5-pro": {"input": 15.00, "output": 120.00},
-    "gpt-5-search-api": {"input": 1.25, "output": 10.00},
-
-    # GPT-4.1 family
-    "gpt-4.1": {"input": 2.00, "output": 8.00},
-    "gpt-4.1-mini": {"input": 0.40, "output": 1.60},
-    "gpt-4.1-nano": {"input": 0.10, "output": 0.40},
-
-    # GPT-4o family
-    "gpt-4o": {"input": 2.50, "output": 10.00},
-    "gpt-4o-2024-05-13": {"input": 5.00, "output": 15.00},
-    "gpt-4o-mini": {"input": 0.15, "output": 0.60},
-    "chatgpt-4o-latest": {"input": 5.00, "output": 15.00},
-
-    # GPT-4 Turbo / GPT-4 legacy family (from legacy models table)
-    "gpt-4-turbo": {"input": 10.00, "output": 30.00},
-    "gpt-4-turbo-2024-04-09": {"input": 10.00, "output": 30.00},
-    "gpt-4-0125-preview": {"input": 10.00, "output": 30.00},
-    "gpt-4-1106-preview": {"input": 10.00, "output": 30.00},
-    "gpt-4-1106-vision-preview": {"input": 10.00, "output": 30.00},
-    "gpt-4-0613": {"input": 30.00, "output": 60.00},
-    "gpt-4-0314": {"input": 30.00, "output": 60.00},
-    "gpt-4": {"input": 30.00, "output": 60.00},
-    "gpt-4-32k": {"input": 60.00, "output": 120.00},
-
-    # Default
-    "default": {"input": 2.50, "output": 10.00},
-}
-
-
-def _calculate_openai_cost(usage, model_name: str) -> float:
-    """Calculate cost based on token usage and model pricing (per 1M tokens)."""
-    if not usage:
-        return 0.0
-
-    pricing = OPENAI_PRICING.get(model_name, OPENAI_PRICING["default"])
-    input_cost = (usage.prompt_tokens / 1_000_000) * pricing["input"]
-    output_cost = (usage.completion_tokens / 1_000_000) * pricing["output"]
-    return input_cost + output_cost
-
-
 def _run_openai_vision(image: Image.Image, prompt: str, model_name: str) -> str:
     if OpenAI is None:
         raise RuntimeError("openai package is not installed. Please install it to use ChatGPT 5.2 backend.")
@@ -83,9 +23,7 @@ def _run_openai_vision(image: Image.Image, prompt: str, model_name: str) -> str:
 
     client = OpenAI(api_key=api_key)
 
-    buffered = BytesIO()
-    image.save(buffered, format="JPEG")
-    img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
+    img_b64 = _pil_image_to_base64_jpeg(image)
 
     _log_debug(f"Using OpenAI model: {model_name}")
     _log_debug(f"Input image size: {image.size}")
@@ -111,20 +49,14 @@ def _run_openai_vision(image: Image.Image, prompt: str, model_name: str) -> str:
 
     duration = time.perf_counter() - start_time
 
-    usage = response.usage
-    if usage:
-        cost = _calculate_openai_cost(usage, model_name)
-        _log(f"Model: {model_name}")
-        _log(
-            "Token usage: Input={usage.prompt_tokens}, "
-            "Output={usage.completion_tokens}, Total={usage.total_tokens}".format(usage=usage)
-        )
-        _log(f"Estimated cost: ${cost:.6f}")
-        _log(f"Execution time: {duration:.3f} seconds")
-
     content = response.choices[0].message.content or ""
-    _log("OpenAI vision response received")
-    _log_debug(f"Response length: {len(content)} characters")
-    _log_debug(f"Result: {content}")
-    _log_debug("End of result")
+
+    _log_model_response(
+        model_name=model_name,
+        content=content,
+        duration=duration,
+        usage=response.usage,
+        pricing=OPENAI_PRICING,
+    )
+
    return content
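
After the refactor both backends share the same (image, prompt) call shape and the same logging tail; a hedged sketch (assumes OPENAI_API_KEY is set and the filename and prompt are placeholders):

from PIL import Image
from openai_backend import _run_openai_vision

page = Image.open("scan.jpg").convert("RGB")
csv_text = _run_openai_vision(page, "Transcribe this page as CSV.", "gpt-4o-mini")
print(csv_text)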
requirements.txt CHANGED
@@ -6,4 +6,5 @@ torch>=2.2.0
 transformers>=4.42.0
 pillow>=10.3.0
 python-dotenv>=1.0.0
-compressed-tensors>=0.0.0
+compressed-tensors>=0.0.0
+accelerate>=0.22.0