Commit a8f90b0 · Parent(s): 2bf7e8d
feat: refactor code organization and add shared logging utilities
Extract common constants and utilities into shared modules for better code organization. Add generic model response logging function to reduce code duplication across backends. Move olmOCR functionality to dedicated module and add image conversion utility.
- Add common.py with shared constants (OPENAI_PRICING, MODEL_GEMINI, MODEL_OLMOCR)
- Add image_utils.py with _pil_image_to_base64_jpeg() helper function
- Add olm_ocr.py module
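
As an illustration (not part of the commit), a minimal sketch of how a backend is expected to call the shared _log_model_response helper together with the OPENAI_PRICING table from common.py after this refactor; the usage object and token counts are made-up stand-ins for a real completion's usage data:

from types import SimpleNamespace

from common import OPENAI_PRICING
from logging_helper import _log_model_response

# Hypothetical stand-in for the `usage` object returned by an OpenAI / HF chat completion.
usage = SimpleNamespace(prompt_tokens=1_200, completion_tokens=300, total_tokens=1_500)

# Logs model name, token usage, estimated cost, and execution time; returns the cost estimate.
cost = _log_model_response(
    model_name="gpt-4o",
    content="name,amount\nAlice,12.50",  # dummy CSV output
    duration=2.3,
    usage=usage,
    pricing=OPENAI_PRICING,
)

If usage or pricing is omitted, the helper only logs the model name and execution time and returns None.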
- app.py +4 -65
- common.py +53 -0
- image_utils.py +11 -0
- logging_helper.py +41 -0
- olm_ocr.py +62 -0
- openai_backend.py +13 -81
- requirements.txt +2 -1
app.py
CHANGED
@@ -5,7 +5,7 @@ load_dotenv()
 
 import base64
 from io import BytesIO
-from typing import
+from typing import Optional
 import time
 
 import gradio as gr
@@ -13,22 +13,18 @@ from PIL import Image
 
 from olmocr.data.renderpdf import render_pdf_to_base64png
 
-from openai_backend import
+from openai_backend import _run_openai_vision
+from common import OPENAI_PRICING, MODEL_GEMINI, MODEL_OLMOCR
 from logging_helper import log as _log, log_debug as _log_debug
+from olm_ocr import _run_olmocr
 
 try:
     import google.generativeai as genai
 except ImportError:  # pragma: no cover
     genai = None  # type: ignore
 
-import torch
-from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
-
-
 APP_TITLE = "words2doc"
 APP_DESCRIPTION = "Upload a PDF or image with (handwritten) text and convert it to CSV using different LLM backends."
-MODEL_GEMINI = "Gemini 3 Pro"
-MODEL_OLMOCR = "olmOCR-2-7B-1025-FP8"
 
 
 # -------- Utility helpers -------- #
@@ -101,63 +97,6 @@ def _run_gemini_vision(image: Image.Image, prompt: str) -> str:
     return response.text or ""
 
 
-_olmocr_model: Optional[Qwen2_5_VLForConditionalGeneration] = None
-_olmocr_processor: Optional[AutoProcessor] = None
-
-
-def _ensure_olmocr_loaded() -> Tuple[Qwen2_5_VLForConditionalGeneration, AutoProcessor]:
-    global _olmocr_model, _olmocr_processor
-
-    if _olmocr_model is None or _olmocr_processor is None:
-        _olmocr_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-            "allenai/olmOCR-2-7B-1025-FP8", device_map="auto"
-        ).eval()
-        _olmocr_processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
-
-    return _olmocr_model, _olmocr_processor
-
-
-def _run_olmocr(image: Image.Image, prompt: str) -> str:
-    model, processor = _ensure_olmocr_loaded()
-
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model.to(device)
-
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "text", "text": prompt},
-                {"type": "image", "image": image},
-            ],
-        }
-    ]
-
-    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-    inputs = processor(
-        text=[text],
-        images=[image],
-        padding=True,
-        return_tensors="pt",
-    )
-    inputs = {key: value.to(device) for (key, value) in inputs.items()}
-
-    output = model.generate(
-        **inputs,
-        temperature=0.1,
-        max_new_tokens=1024,
-        num_return_sequences=1,
-        do_sample=True,
-    )
-
-    prompt_length = inputs["input_ids"].shape[1]
-    new_tokens = output[:, prompt_length:]
-
-    text_output = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
-    return text_output[0] if text_output else ""
-
-
 # -------- Main processing function -------- #
 
 
common.py
ADDED
@@ -0,0 +1,53 @@
+MODEL_GEMINI = "Gemini 3 Pro"
+MODEL_OLMOCR = "olmOCR-2-7B-1025-FP8"
+
+
+OPENAI_PRICING = {
+    # GPT-5.2 family
+    "gpt-5.2": {"input": 1.75, "output": 14.00},
+    "gpt-5.2-chat-latest": {"input": 1.75, "output": 14.00},
+    "gpt-5.2-pro": {"input": 21.00, "output": 168.00},
+
+    # GPT-5.1 / GPT-5 family
+    "gpt-5.1": {"input": 1.25, "output": 10.00},
+    "gpt-5": {"input": 1.25, "output": 10.00},
+    "gpt-5-mini": {"input": 0.25, "output": 2.00},
+    "gpt-5-nano": {"input": 0.05, "output": 0.40},
+    "gpt-5.1-chat-latest": {"input": 1.25, "output": 10.00},
+    "gpt-5-chat-latest": {"input": 1.25, "output": 10.00},
+    "gpt-5.1-codex-max": {"input": 1.25, "output": 10.00},
+    "gpt-5.1-codex": {"input": 1.25, "output": 10.00},
+    "gpt-5-codex": {"input": 1.25, "output": 10.00},
+    "gpt-5.1-codex-mini": {"input": 0.25, "output": 2.00},
+    "gpt-5-pro": {"input": 15.00, "output": 120.00},
+    "gpt-5-search-api": {"input": 1.25, "output": 10.00},
+
+    # GPT-4.1 family
+    "gpt-4.1": {"input": 2.00, "output": 8.00},
+    "gpt-4.1-mini": {"input": 0.40, "output": 1.60},
+    "gpt-4.1-nano": {"input": 0.10, "output": 0.40},
+
+    # GPT-4o family
+    "gpt-4o": {"input": 2.50, "output": 10.00},
+    "gpt-4o-2024-05-13": {"input": 5.00, "output": 15.00},
+    "gpt-4o-mini": {"input": 0.15, "output": 0.60},
+    "chatgpt-4o-latest": {"input": 5.00, "output": 15.00},
+
+    # GPT-4 Turbo / GPT-4 legacy family (from legacy models table)
+    "gpt-4-turbo": {"input": 10.00, "output": 30.00},
+    "gpt-4-turbo-2024-04-09": {"input": 10.00, "output": 30.00},
+    "gpt-4-0125-preview": {"input": 10.00, "output": 30.00},
+    "gpt-4-1106-preview": {"input": 10.00, "output": 30.00},
+    "gpt-4-1106-vision-preview": {"input": 10.00, "output": 30.00},
+    "gpt-4-0613": {"input": 30.00, "output": 60.00},
+    "gpt-4-0314": {"input": 30.00, "output": 60.00},
+    "gpt-4": {"input": 30.00, "output": 60.00},
+    "gpt-4-32k": {"input": 60.00, "output": 120.00},
+
+    # Default
+    "default": {"input": 2.50, "output": 10.00},
+
+    # Other backends (mock rates)
+    MODEL_GEMINI: {"input": 1.00, "output": 1.00},
+    MODEL_OLMOCR: {"input": 1.35, "output": 0.30},
+}
image_utils.py
ADDED
@@ -0,0 +1,11 @@
+import base64
+from io import BytesIO
+
+from PIL import Image
+
+
+def _pil_image_to_base64_jpeg(image: Image.Image) -> str:
+    """Encode a PIL Image as base64 JPEG string (without data: URL prefix)."""
+    buffered = BytesIO()
+    image.save(buffered, format="JPEG")
+    return base64.b64encode(buffered.getvalue()).decode("utf-8")
logging_helper.py
CHANGED
@@ -1,4 +1,5 @@
 import os
+from typing import Any, Mapping, Optional
 
 
 def log(message: str) -> None:
@@ -8,3 +9,43 @@ def log(message: str) -> None:
 def log_debug(message: str) -> None:
     if os.getenv("WORDS2CSV_DEBUG"):
         print(f"[WORDS2CSV-DEBUG] {message}")
+
+
+def _log_model_response(
+    *,
+    model_name: str,
+    content: str,
+    duration: float,
+    usage: Optional[Any] = None,
+    pricing: Optional[Mapping[str, Mapping[str, float]]] = None,
+    default_pricing_key: str = "default",
+) -> Optional[float]:
+    """Log model usage, cost (if pricing and usage are provided), and response details.
+
+    Returns the calculated cost if pricing and usage are provided, otherwise None.
+    """
+    cost: Optional[float] = None
+
+    if usage is not None and pricing is not None:
+        pricing_row = pricing.get(model_name, pricing[default_pricing_key])
+        input_cost = (usage.prompt_tokens / 1_000_000) * pricing_row["input"]
+        output_cost = (usage.completion_tokens / 1_000_000) * pricing_row["output"]
+        cost = input_cost + output_cost
+
+        log(f"Model: {model_name}")
+        log(
+            "Token usage: Input={usage.prompt_tokens}, "
+            "Output={usage.completion_tokens}, Total={usage.total_tokens}".format(usage=usage)
+        )
+        log(f"Estimated cost: ${cost:.6f}")
+        log(f"Execution time: {duration:.3f} seconds")
+    else:
+        log(f"Model: {model_name}")
+        log(f"Execution time: {duration:.3f} seconds")
+
+    log("Model response received")
+    log_debug(f"Response length: {len(content)} characters")
+    log_debug(f"Result: {content}")
+    log_debug("End of result")
+
+    return cost
olm_ocr.py
ADDED
@@ -0,0 +1,62 @@
+import os
+import time
+from typing import Optional
+from PIL import Image
+from huggingface_hub import InferenceClient
+from image_utils import _pil_image_to_base64_jpeg
+from logging_helper import _log_model_response
+from common import OPENAI_PRICING
+
+
+MODEL_ID = "allenai/olmOCR-2-7B-1025-FP8"
+HF_ENDPOINT_URL = "https://wsy54j97qbvg7mua.us-east-1.aws.endpoints.huggingface.cloud"
+
+
+def _build_messages(image_base64: str, prompt: str):
+    return [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
+                },
+            ],
+        }
+    ]
+
+
+def _run_olmocr(image: Image.Image, prompt: str) -> str:
+    image_base64 = _pil_image_to_base64_jpeg(image)
+    messages = _build_messages(image_base64, prompt)
+
+    hf_token: Optional[str] = os.getenv("HF_TOKEN")
+
+    client = InferenceClient(
+        base_url=HF_ENDPOINT_URL,
+        token=hf_token,
+    )
+
+    start_time = time.perf_counter()
+
+    completion = client.chat.completions.create(
+        model=MODEL_ID,
+        messages=messages,
+        max_tokens=512,
+        temperature=0.1,
+    )
+
+    duration = time.perf_counter() - start_time
+
+    content = str(completion.choices[0].message.content)
+
+    _log_model_response(
+        model_name=MODEL_ID,
+        content=content,
+        duration=duration,
+        usage=completion.usage,
+        pricing=OPENAI_PRICING,
+    )
+
+    return content
openai_backend.py
CHANGED
@@ -2,77 +2,17 @@ import os
 import base64
 import time
 from io import BytesIO
-
 from typing import Optional
-
 from PIL import Image
-
-from
+from logging_helper import log as _log, log_debug as _log_debug, _log_model_response
+from image_utils import _pil_image_to_base64_jpeg
+from common import OPENAI_PRICING
 
 try:
     from openai import OpenAI
 except ImportError:  # pragma: no cover
     OpenAI = None  # type: ignore
 
-
-OPENAI_PRICING = {
-    # GPT-5.2 family
-    "gpt-5.2": {"input": 1.75, "output": 14.00},
-    "gpt-5.2-chat-latest": {"input": 1.75, "output": 14.00},
-    "gpt-5.2-pro": {"input": 21.00, "output": 168.00},
-
-    # GPT-5.1 / GPT-5 family
-    "gpt-5.1": {"input": 1.25, "output": 10.00},
-    "gpt-5": {"input": 1.25, "output": 10.00},
-    "gpt-5-mini": {"input": 0.25, "output": 2.00},
-    "gpt-5-nano": {"input": 0.05, "output": 0.40},
-    "gpt-5.1-chat-latest": {"input": 1.25, "output": 10.00},
-    "gpt-5-chat-latest": {"input": 1.25, "output": 10.00},
-    "gpt-5.1-codex-max": {"input": 1.25, "output": 10.00},
-    "gpt-5.1-codex": {"input": 1.25, "output": 10.00},
-    "gpt-5-codex": {"input": 1.25, "output": 10.00},
-    "gpt-5.1-codex-mini": {"input": 0.25, "output": 2.00},
-    "gpt-5-pro": {"input": 15.00, "output": 120.00},
-    "gpt-5-search-api": {"input": 1.25, "output": 10.00},
-
-    # GPT-4.1 family
-    "gpt-4.1": {"input": 2.00, "output": 8.00},
-    "gpt-4.1-mini": {"input": 0.40, "output": 1.60},
-    "gpt-4.1-nano": {"input": 0.10, "output": 0.40},
-
-    # GPT-4o family
-    "gpt-4o": {"input": 2.50, "output": 10.00},
-    "gpt-4o-2024-05-13": {"input": 5.00, "output": 15.00},
-    "gpt-4o-mini": {"input": 0.15, "output": 0.60},
-    "chatgpt-4o-latest": {"input": 5.00, "output": 15.00},
-
-    # GPT-4 Turbo / GPT-4 legacy family (from legacy models table)
-    "gpt-4-turbo": {"input": 10.00, "output": 30.00},
-    "gpt-4-turbo-2024-04-09": {"input": 10.00, "output": 30.00},
-    "gpt-4-0125-preview": {"input": 10.00, "output": 30.00},
-    "gpt-4-1106-preview": {"input": 10.00, "output": 30.00},
-    "gpt-4-1106-vision-preview": {"input": 10.00, "output": 30.00},
-    "gpt-4-0613": {"input": 30.00, "output": 60.00},
-    "gpt-4-0314": {"input": 30.00, "output": 60.00},
-    "gpt-4": {"input": 30.00, "output": 60.00},
-    "gpt-4-32k": {"input": 60.00, "output": 120.00},
-
-    # Default
-    "default": {"input": 2.50, "output": 10.00},
-}
-
-
-def _calculate_openai_cost(usage, model_name: str) -> float:
-    """Calculate cost based on token usage and model pricing (per 1M tokens)."""
-    if not usage:
-        return 0.0
-
-    pricing = OPENAI_PRICING.get(model_name, OPENAI_PRICING["default"])
-    input_cost = (usage.prompt_tokens / 1_000_000) * pricing["input"]
-    output_cost = (usage.completion_tokens / 1_000_000) * pricing["output"]
-    return input_cost + output_cost
-
-
 def _run_openai_vision(image: Image.Image, prompt: str, model_name: str) -> str:
     if OpenAI is None:
         raise RuntimeError("openai package is not installed. Please install it to use ChatGPT 5.2 backend.")
@@ -83,9 +23,7 @@ def _run_openai_vision(image: Image.Image, prompt: str, model_name: str) -> str:
 
     client = OpenAI(api_key=api_key)
 
-
-    image.save(buffered, format="JPEG")
-    img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
+    img_b64 = _pil_image_to_base64_jpeg(image)
 
     _log_debug(f"Using OpenAI model: {model_name}")
     _log_debug(f"Input image size: {image.size}")
@@ -111,20 +49,14 @@ def _run_openai_vision(image: Image.Image, prompt: str, model_name: str) -> str:
 
     duration = time.perf_counter() - start_time
 
-    usage = response.usage
-    if usage:
-        cost = _calculate_openai_cost(usage, model_name)
-        _log(f"Model: {model_name}")
-        _log(
-            "Token usage: Input={usage.prompt_tokens}, "
-            "Output={usage.completion_tokens}, Total={usage.total_tokens}".format(usage=usage)
-        )
-        _log(f"Estimated cost: ${cost:.6f}")
-        _log(f"Execution time: {duration:.3f} seconds")
-
     content = response.choices[0].message.content or ""
-
-
-
-
+
+    _log_model_response(
+        model_name=model_name,
+        content=content,
+        duration=duration,
+        usage=response.usage,
+        pricing=OPENAI_PRICING,
+    )
+
     return content
requirements.txt
CHANGED
@@ -6,4 +6,5 @@ torch>=2.2.0
 transformers>=4.42.0
 pillow>=10.3.0
 python-dotenv>=1.0.0
-compressed-tensors>=0.0.0
+compressed-tensors>=0.0.0
+accelerate>=0.22.0