snake11235 commited on
Commit
081369c
·
1 Parent(s): 32b93c4

feat: add Gradio cache and Python cache directories to gitignore

Browse files

Add .gradio/ and __pycache__/ to gitignore to exclude auto-generated cache directories from version control. These directories are created during application runtime and should not be committed.

feat: migrate to google-genai SDK and add multi-model Gemini support with unified usage tracking

Refactor Gemini backend to use new google-genai SDK instead of google-generativeai. Add support for multiple Gemini models (2.5-flash, 2.5-pro, 3-pro-preview) with accurate pricing. Enhance logging helper to handle different usage metadata formats from OpenAI and Gemini providers.

- Extract _run_gemini_vision() to gemini_backend.py module
- Update to google-genai>=0.1.0 SDK with new API patterns
- Add model pricing entries for gemini-2.5-flash, gemini-2.5-pro, and gemini-3-pro-preview to MODELS_MAP

feat: add manual vocabulary CSV with Dutch-English-Russian translations

Add manual_vocab.csv containing Dutch language learning vocabulary with English and Russian translations. Includes common words, phrases, and household activities organized by date (August-September 2023).

- Add 38 vocabulary entries with directional words (links/rechts, boven/beneden)
- Add household vocabulary (de gang, de tuin, dingen/spullen)
- Add daily routine phrases (lunch, douche, tanden poetsen, afwas doen)
- Add frequency adverbs (vaak, meestal, nooit, zelden, altijd)

feat: add example image preview and remove deprecated OpenAI models from pricing map

Add Gradio Examples component with preview image for vocab.jpg example. Clean up MODELS_MAP by removing deprecated GPT-4 and GPT-4o family models, keeping only current GPT-5 and GPT-4.1 families.

- Add image_example_preview component with static/vocab.jpg preview
- Add gr.Examples with vocab.jpg example input
- Remove deprecated GPT-5 variants (chat-latest, codex, pro, search-api)
- Remove GPT-4o family models (gpt-4o, gpt-4o-mini, chatgpt-4o-latest) and legacy GPT-4/GPT-4 Turbo variants

feat: add execution logs display to UI with timestamp, model, duration, and cost tracking

Add logs output textbox to Gradio interface showing accumulated model execution history. Store each model run in global _MODEL_LOGS list with timestamp, model name, duration, and cost. Update process_document() to return logs as third output.

- Add _MODEL_LOGS global list to logging_helper.py for storing execution history
- Add get_latest_model_log() function to retrieve accumulated logs
- Store CSV-formatted log lines (timestamp, model name, duration, cost) in _MODEL_LOGS

removed vocab.jpg from index

added vocab.jpg into index

removed vocab.jpg from index

Files changed (7) hide show
  1. .gitignore +2 -0
  2. app.py +24 -27
  3. common.py +5 -31
  4. gemini_backend.py +61 -0
  5. logging_helper.py +67 -12
  6. manual_vocab.csv +38 -0
  7. requirements.txt +1 -1
.gitignore CHANGED
@@ -1,2 +1,4 @@
1
  .env
2
  *.pyc
 
 
 
1
  .env
2
  *.pyc
3
+ .gradio/
4
+ __pycache__/
app.py CHANGED
@@ -15,14 +15,10 @@ from olmocr.data.renderpdf import render_pdf_to_base64png
15
 
16
  from openai_backend import _run_openai_vision
17
  from common import MODELS_MAP, MODEL_GEMINI, MODEL_OLMOCR
18
- from logging_helper import log as _log, log_debug as _log_debug
 
19
  from olm_ocr import _run_olmocr
20
 
21
- try:
22
- import google.generativeai as genai
23
- except ImportError: # pragma: no cover
24
- genai = None # type: ignore
25
-
26
  APP_TITLE = "words2doc"
27
  APP_DESCRIPTION = "Upload a PDF or image with (handwritten) text and convert it to CSV using different LLM backends."
28
 
@@ -80,29 +76,12 @@ def _encode_image(image_path):
80
  return base64.b64encode(image_file.read()).decode("utf-8")
81
 
82
 
83
- def _run_gemini_vision(image: Image.Image, prompt: str) -> str:
84
- if genai is None:
85
- raise RuntimeError("google-generativeai package is not installed. Please install it to use Gemini backend.")
86
-
87
- api_key = os.getenv("GEMINI_API_KEY")
88
- if not api_key:
89
- raise RuntimeError("GEMINI_API_KEY environment variable is not set.")
90
-
91
- genai.configure(api_key=api_key)
92
- model_name = os.getenv("WORDS2DOC_GEMINI_MODEL", "gemini-1.5-flash")
93
- model = genai.GenerativeModel(model_name)
94
-
95
- # Gemini expects a PIL Image directly
96
- response = model.generate_content([prompt, image])
97
- return response.text or ""
98
-
99
-
100
  # -------- Main processing function -------- #
101
 
102
 
103
  def process_document(file_obj, model_choice: str, prompt: str):
104
  if file_obj is None:
105
- return "No file uploaded.", None
106
 
107
  file_path = getattr(file_obj, "name", None) or file_obj
108
  image = _image_from_any_file(file_path)
@@ -122,14 +101,17 @@ def process_document(file_obj, model_choice: str, prompt: str):
122
  if MODELS_MAP[model_choice]["backend"] == "openai":
123
  csv_text = _run_openai_vision(image, prompt, model_choice)
124
  elif MODELS_MAP[model_choice]["backend"] == "gemini":
125
- csv_text = _run_gemini_vision(image, prompt)
126
  elif MODELS_MAP[model_choice]["backend"] == "olmocr":
127
  csv_text = _run_olmocr(image, prompt)
128
  else:
129
  csv_text = f"Unknown model choice: {model_choice}"
130
 
131
  csv_file_path = _write_csv_to_temp_file(csv_text)
132
- return csv_text, csv_file_path
 
 
 
133
 
134
  # -------- Gradio UI -------- #
135
 
@@ -145,6 +127,16 @@ def build_interface() -> gr.Blocks:
145
  label="Upload PDF or image",
146
  file_types=[".pdf", ".png", ".jpg", ".jpeg", ".webp"],
147
  )
 
 
 
 
 
 
 
 
 
 
148
 
149
  model_selector = gr.Dropdown(
150
  label="LLM backend",
@@ -194,11 +186,16 @@ def build_interface() -> gr.Blocks:
194
  buttons=["copy"],
195
  )
196
  csv_file = gr.File(label="Download CSV file", interactive=False)
 
 
 
 
 
197
 
198
  run_button.click(
199
  fn=process_document,
200
  inputs=[file_input, model_selector, prompt_editor],
201
- outputs=[csv_output, csv_file],
202
  )
203
 
204
  return demo
 
15
 
16
  from openai_backend import _run_openai_vision
17
  from common import MODELS_MAP, MODEL_GEMINI, MODEL_OLMOCR
18
+ from gemini_backend import _run_gemini_vision
19
+ from logging_helper import log as _log, log_debug as _log_debug, get_latest_model_log as _get_latest_model_log
20
  from olm_ocr import _run_olmocr
21
 
 
 
 
 
 
22
  APP_TITLE = "words2doc"
23
  APP_DESCRIPTION = "Upload a PDF or image with (handwritten) text and convert it to CSV using different LLM backends."
24
 
 
76
  return base64.b64encode(image_file.read()).decode("utf-8")
77
 
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  # -------- Main processing function -------- #
80
 
81
 
82
  def process_document(file_obj, model_choice: str, prompt: str):
83
  if file_obj is None:
84
+ return "No file uploaded.", None, ""
85
 
86
  file_path = getattr(file_obj, "name", None) or file_obj
87
  image = _image_from_any_file(file_path)
 
101
  if MODELS_MAP[model_choice]["backend"] == "openai":
102
  csv_text = _run_openai_vision(image, prompt, model_choice)
103
  elif MODELS_MAP[model_choice]["backend"] == "gemini":
104
+ csv_text = _run_gemini_vision(image, prompt, model_choice)
105
  elif MODELS_MAP[model_choice]["backend"] == "olmocr":
106
  csv_text = _run_olmocr(image, prompt)
107
  else:
108
  csv_text = f"Unknown model choice: {model_choice}"
109
 
110
  csv_file_path = _write_csv_to_temp_file(csv_text)
111
+
112
+ latest_log = _get_latest_model_log() or ""
113
+
114
+ return csv_text, csv_file_path, latest_log
115
 
116
  # -------- Gradio UI -------- #
117
 
 
127
  label="Upload PDF or image",
128
  file_types=[".pdf", ".png", ".jpg", ".jpeg", ".webp"],
129
  )
130
+ image_example_preview = gr.Image(
131
+ label="Example image preview",
132
+ value="static/vocab.jpg", # adjust to real relative path
133
+ interactive=False,
134
+ )
135
+ gr.Examples(
136
+ examples=[["vocab.jpg", "vocab.jpg"]],
137
+ inputs=[file_input, image_example_preview],
138
+ label="Example image",
139
+ )
140
 
141
  model_selector = gr.Dropdown(
142
  label="LLM backend",
 
186
  buttons=["copy"],
187
  )
188
  csv_file = gr.File(label="Download CSV file", interactive=False)
189
+ with gr.Row():
190
+ logs_output = gr.Textbox(
191
+ label="Logs",
192
+ lines=4,
193
+ )
194
 
195
  run_button.click(
196
  fn=process_document,
197
  inputs=[file_input, model_selector, prompt_editor],
198
+ outputs=[csv_output, csv_file, logs_output],
199
  )
200
 
201
  return demo
common.py CHANGED
@@ -1,4 +1,4 @@
1
- MODEL_GEMINI = "Gemini 3 Pro"
2
  MODEL_OLMOCR = "olmOCR-2-7B-1025-FP8"
3
 
4
 
@@ -13,41 +13,15 @@ MODELS_MAP = {
13
  "gpt-5": {"input": 1.25, "output": 10.00, "backend": "openai"},
14
  "gpt-5-mini": {"input": 0.25, "output": 2.00, "backend": "openai"},
15
  "gpt-5-nano": {"input": 0.05, "output": 0.40, "backend": "openai"},
16
- "gpt-5.1-chat-latest": {"input": 1.25, "output": 10.00, "backend": "openai"},
17
- "gpt-5-chat-latest": {"input": 1.25, "output": 10.00, "backend": "openai"},
18
- "gpt-5.1-codex-max": {"input": 1.25, "output": 10.00, "backend": "openai"},
19
- "gpt-5.1-codex": {"input": 1.25, "output": 10.00, "backend": "openai"},
20
- "gpt-5-codex": {"input": 1.25, "output": 10.00, "backend": "openai"},
21
- "gpt-5.1-codex-mini": {"input": 0.25, "output": 2.00, "backend": "openai"},
22
- "gpt-5-pro": {"input": 15.00, "output": 120.00, "backend": "openai"},
23
- "gpt-5-search-api": {"input": 1.25, "output": 10.00, "backend": "openai"},
24
 
25
  # GPT-4.1 family
26
  "gpt-4.1": {"input": 2.00, "output": 8.00, "backend": "openai"},
27
  "gpt-4.1-mini": {"input": 0.40, "output": 1.60, "backend": "openai"},
28
  "gpt-4.1-nano": {"input": 0.10, "output": 0.40, "backend": "openai"},
29
-
30
- # GPT-4o family
31
- "gpt-4o": {"input": 2.50, "output": 10.00, "backend": "openai"},
32
- "gpt-4o-2024-05-13": {"input": 5.00, "output": 15.00, "backend": "openai"},
33
- "gpt-4o-mini": {"input": 0.15, "output": 0.60, "backend": "openai"},
34
- "chatgpt-4o-latest": {"input": 5.00, "output": 15.00, "backend": "openai"},
35
-
36
- # GPT-4 Turbo / GPT-4 legacy family (from legacy models table)
37
- "gpt-4-turbo": {"input": 10.00, "output": 30.00, "backend": "openai"},
38
- "gpt-4-turbo-2024-04-09": {"input": 10.00, "output": 30.00, "backend": "openai"},
39
- "gpt-4-0125-preview": {"input": 10.00, "output": 30.00, "backend": "openai"},
40
- "gpt-4-1106-preview": {"input": 10.00, "output": 30.00, "backend": "openai"},
41
- "gpt-4-1106-vision-preview": {"input": 10.00, "output": 30.00, "backend": "openai"},
42
- "gpt-4-0613": {"input": 30.00, "output": 60.00, "backend": "openai"},
43
- "gpt-4-0314": {"input": 30.00, "output": 60.00, "backend": "openai"},
44
- "gpt-4": {"input": 30.00, "output": 60.00, "backend": "openai"},
45
- "gpt-4-32k": {"input": 60.00, "output": 120.00, "backend": "openai"},
46
-
47
- # Default
48
- "default": {"input": 2.50, "output": 10.00, "backend": "openai"},
49
-
50
  # Other backends (mock rates)
51
- MODEL_GEMINI: {"input": 1.00, "output": 1.00, "backend": "gemini"},
 
52
  MODEL_OLMOCR: {"input": 1.35, "output": 0.30, "backend": "olmocr"},
 
 
53
  }
 
1
+ MODEL_GEMINI = "gemini-2.5-flash"
2
  MODEL_OLMOCR = "olmOCR-2-7B-1025-FP8"
3
 
4
 
 
13
  "gpt-5": {"input": 1.25, "output": 10.00, "backend": "openai"},
14
  "gpt-5-mini": {"input": 0.25, "output": 2.00, "backend": "openai"},
15
  "gpt-5-nano": {"input": 0.05, "output": 0.40, "backend": "openai"},
 
 
 
 
 
 
 
 
16
 
17
  # GPT-4.1 family
18
  "gpt-4.1": {"input": 2.00, "output": 8.00, "backend": "openai"},
19
  "gpt-4.1-mini": {"input": 0.40, "output": 1.60, "backend": "openai"},
20
  "gpt-4.1-nano": {"input": 0.10, "output": 0.40, "backend": "openai"},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  # Other backends (mock rates)
22
+ MODEL_GEMINI: {"input": 0.30, "output": 2.50, "backend": "gemini"},
23
+ "gemini-3-pro-preview": {"input": 2.00, "output": 12.00, "backend": "gemini"},
24
  MODEL_OLMOCR: {"input": 1.35, "output": 0.30, "backend": "olmocr"},
25
+ "gemini-2.5-pro": {"input": 1.25, "output": 10.00, "backend": "gemini"},
26
+ "default": {"input": 2.50, "output": 10.00, "backend": "openai"},
27
  }
gemini_backend.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ from typing import TYPE_CHECKING
4
+ from PIL import Image
5
+ from logging_helper import log as _log, log_debug as _log_debug, _log_model_response
6
+ from common import MODELS_MAP
7
+
8
+ try:
9
+ # New google-genai client library
10
+ from google import genai
11
+ except ImportError: # pragma: no cover
12
+ genai = None # type: ignore
13
+
14
+ if TYPE_CHECKING: # pragma: no cover
15
+ # Kept for type-checkers; Image is also imported at runtime above
16
+ from PIL import Image as _ImageType
17
+
18
+
19
+ def _run_gemini_vision(image: Image.Image, prompt: str, model_choice: str) -> str:
20
+ if genai is None:
21
+ raise RuntimeError("google-genai package is not installed. Please install it to use Gemini backend.")
22
+
23
+ api_key = os.getenv("GEMINI_API_KEY")
24
+ if not api_key:
25
+ raise RuntimeError("GEMINI_API_KEY environment variable is not set.")
26
+
27
+ model_name = model_choice
28
+
29
+ # Instantiate the google-genai client and call the model
30
+ client = genai.Client(api_key=api_key)
31
+
32
+ _log_debug(f"Using Gemini model: {model_name}")
33
+ _log_debug(f"Input image size: {image.size}")
34
+
35
+ start_time = time.perf_counter()
36
+
37
+ # google-genai accepts mixed text and image content
38
+ response = client.models.generate_content(
39
+ model=model_name,
40
+ contents=[prompt, image],
41
+ )
42
+
43
+ duration = time.perf_counter() - start_time
44
+ print(f"Response: {response}")
45
+ print(f"Response text: {getattr(response, 'text', 'No text attribute')}")
46
+
47
+ content = response.text or ""
48
+
49
+ usage = getattr(response, "usage_metadata", None)
50
+ if usage is None:
51
+ usage = getattr(response, "usage", None)
52
+
53
+ _log_model_response(
54
+ model_name=model_name,
55
+ content=content,
56
+ duration=duration,
57
+ usage=usage,
58
+ pricing=MODELS_MAP,
59
+ )
60
+
61
+ return content
logging_helper.py CHANGED
@@ -1,5 +1,9 @@
1
  import os
2
- from typing import Any, Mapping, Optional
 
 
 
 
3
 
4
 
5
  def log(message: str) -> None:
@@ -24,28 +28,79 @@ def _log_model_response(
24
 
25
  Returns the calculated cost if pricing and usage are provided, otherwise None.
26
  """
 
 
27
  cost: Optional[float] = None
28
 
29
  if usage is not None and pricing is not None:
30
- pricing_row = pricing.get(model_name, pricing[default_pricing_key])
31
- input_cost = (usage.prompt_tokens / 1_000_000) * pricing_row["input"]
32
- output_cost = (usage.completion_tokens / 1_000_000) * pricing_row["output"]
33
- cost = input_cost + output_cost
34
 
35
- log(f"Model: {model_name}")
36
- log(
37
- "Token usage: Input={usage.prompt_tokens}, "
38
- "Output={usage.completion_tokens}, Total={usage.total_tokens}".format(usage=usage)
39
- )
40
- log(f"Estimated cost: ${cost:.6f}")
41
- log(f"Execution time: {duration:.3f} seconds")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  else:
43
  log(f"Model: {model_name}")
44
  log(f"Execution time: {duration:.3f} seconds")
45
 
46
  log("Model response received")
 
 
 
 
 
 
 
47
  log_debug(f"Response length: {len(content)} characters")
48
  log_debug(f"Result: {content}")
49
  log_debug("End of result")
50
 
51
  return cost
 
 
 
 
 
 
 
 
1
  import os
2
+ from typing import Any, List, Mapping, Optional
3
+ from datetime import datetime
4
+
5
+
6
+ _MODEL_LOGS: List[str] = []
7
 
8
 
9
  def log(message: str) -> None:
 
28
 
29
  Returns the calculated cost if pricing and usage are provided, otherwise None.
30
  """
31
+ global _MODEL_LOGS
32
+
33
  cost: Optional[float] = None
34
 
35
  if usage is not None and pricing is not None:
36
+ # Normalize different provider usage formats into a common shape.
37
+ prompt_tokens: Optional[float] = None
38
+ completion_tokens: Optional[float] = None
39
+ total_tokens: Optional[float] = None
40
 
41
+ # Attribute-style access (e.g. OpenAI usage)
42
+ if hasattr(usage, "prompt_tokens") and hasattr(usage, "completion_tokens"):
43
+ prompt_tokens = float(getattr(usage, "prompt_tokens"))
44
+ completion_tokens = float(getattr(usage, "completion_tokens"))
45
+ total_tokens = float(getattr(usage, "total_tokens", prompt_tokens + completion_tokens))
46
+
47
+ # Gemini GenerateContentResponseUsageMetadata-style
48
+ elif hasattr(usage, "prompt_token_count") and hasattr(usage, "candidates_token_count"):
49
+ prompt_tokens = float(getattr(usage, "prompt_token_count"))
50
+ completion_tokens = float(getattr(usage, "candidates_token_count"))
51
+ total_tokens = float(getattr(usage, "total_token_count", prompt_tokens + completion_tokens))
52
+
53
+ # dict-style access fallback
54
+ elif isinstance(usage, Mapping):
55
+ if {"prompt_tokens", "completion_tokens"}.issubset(usage.keys()):
56
+ prompt_tokens = float(usage["prompt_tokens"])
57
+ completion_tokens = float(usage["completion_tokens"])
58
+ total_tokens = float(usage.get("total_tokens", prompt_tokens + completion_tokens))
59
+ elif {"prompt_token_count", "candidates_token_count"}.issubset(usage.keys()):
60
+ prompt_tokens = float(usage["prompt_token_count"])
61
+ completion_tokens = float(usage["candidates_token_count"])
62
+ total_tokens = float(usage.get("total_token_count", prompt_tokens + completion_tokens))
63
+
64
+ if prompt_tokens is not None and completion_tokens is not None and total_tokens is not None:
65
+ pricing_row = pricing.get(model_name, pricing[default_pricing_key])
66
+ input_cost = (prompt_tokens / 1_000_000) * pricing_row["input"]
67
+ output_cost = (completion_tokens / 1_000_000) * pricing_row["output"]
68
+ cost = input_cost + output_cost
69
+
70
+ log(f"Model: {model_name}")
71
+ log(
72
+ "Token usage: Input={prompt}, Output={completion}, Total={total}".format(
73
+ prompt=int(prompt_tokens), completion=int(completion_tokens), total=int(total_tokens)
74
+ )
75
+ )
76
+ log(f"Estimated cost: ${cost:.6f}")
77
+ log(f"Execution time: {duration:.3f} seconds")
78
+ else:
79
+ # Usage provided but in an unknown format – still log basic info.
80
+ log(f"Model: {model_name}")
81
+ log(f"Execution time: {duration:.3f} seconds")
82
+ log_debug(f"Unrecognized usage format: {usage!r}")
83
  else:
84
  log(f"Model: {model_name}")
85
  log(f"Execution time: {duration:.3f} seconds")
86
 
87
  log("Model response received")
88
+
89
+ # Store latest model log in a simple CSV-like line: timestamp, model name, duration, cost
90
+ timestamp = datetime.now().isoformat(timespec="seconds")
91
+ duration_s = float(duration)
92
+ cost_value = float(cost) if cost is not None else 0.0
93
+ line = f"{timestamp}, {model_name}, {duration_s:.3f} seconds, ${cost_value:.6f}"
94
+ _MODEL_LOGS.append(line)
95
  log_debug(f"Response length: {len(content)} characters")
96
  log_debug(f"Result: {content}")
97
  log_debug("End of result")
98
 
99
  return cost
100
+
101
+
102
+ def get_latest_model_log() -> Optional[str]:
103
+ """Return full accumulated model logs as a single string, or None if empty."""
104
+ if not _MODEL_LOGS:
105
+ return None
106
+ return "\n".join(_MODEL_LOGS)
manual_vocab.csv ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 22.08.2023,
2
+ links, left
3
+ rechts, right
4
+ boven, above
5
+ beneden, downstairs
6
+ allerlei, all kinds of stuff
7
+ de gang, коридор
8
+ dingen/spullen, stuff
9
+ de tuin,
10
+ gewoon, simple
11
+ Д/з урок p71,
12
+ 5.09.2023,
13
+ op zaterdag ben ik naar .. gekomen,
14
+ wandelen, to walk
15
+ verkocht, sold
16
+ gekocht, bought
17
+ tijd manier plaats,
18
+ waarvoor, for what
19
+ bezoeken, to visit
20
+ dingen,
21
+ bijvoorbeeld, for example
22
+ er is een verschil, there is a difference
23
+ tuin, garden
24
+ Д/з 3 72 диктант под диктовку,
25
+ 5. 75.
26
+ de afwas doen, do the dishes
27
+ Ik lunch tussen de middag, I have a lunch at noon
28
+ afdrogen, dry
29
+ Ik droog af, I dry off
30
+ tanden poetsen, toothbrushing
31
+ Ik douche 's ochtends en 's avonds,
32
+ Ik kleed me aan, Я одеваюсь
33
+ al, already
34
+ vaak, often
35
+ meestal, usually
36
+ nooit, never
37
+ zelden, seldom редко
38
+ altijd, always
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  gradio>=6.1.0
2
  openai>=1.40.0
3
- google-generativeai>=0.7.0
4
  olmocr>=0.4.0
5
  torch>=2.2.0
6
  transformers>=4.42.0
 
1
  gradio>=6.1.0
2
  openai>=1.40.0
3
+ google-genai>=0.1.0
4
  olmocr>=0.4.0
5
  torch>=2.2.0
6
  transformers>=4.42.0