snake11235 commited on
Commit
081369c
·
1 Parent(s): 32b93c4

feat: add Gradio cache and Python cache directories to gitignore

Browse files

Add .gradio/ and __pycache__/ to gitignore to exclude auto-generated cache directories from version control. These directories are created during application runtime and should not be committed.

feat: migrate to google-genai SDK and add multi-model Gemini support with unified usage tracking

Refactor Gemini backend to use new google-genai SDK instead of google-generativeai. Add support for multiple Gemini models (2.5-flash, 2.5-pro, 3-pro-preview) with accurate pricing. Enhance logging helper to handle different usage metadata formats from OpenAI and Gemini providers.

- Extract _run_gemini_vision() to gemini_backend.py module
- Update to google-genai>=0.1.0 SDK with new API patterns
- Add model pricing entries for gemini-2.5-flash, gemini-2.5-pro, and gemini-3-pro-preview to MODELS_MAP

feat: add manual vocabulary CSV with Dutch-English-Russian translations

Add manual_vocab.csv containing Dutch language learning vocabulary with English and Russian translations. Includes common words, phrases, and household activities organized by date (August-September 2023).

- Add 38 vocabulary entries with directional words (links/rechts, boven/beneden)
- Add household vocabulary (de gang, de tuin, dingen/spullen)
- Add daily routine phrases (lunch, douche, tanden poetsen, afwas doen)
- Add frequency adverbs (vaak, meestal, nooit, zelden, altijd)

feat: add example image preview and remove deprecated OpenAI models from pricing map

Add Gradio Examples component with preview image for vocab.jpg example. Clean up MODELS_MAP by removing deprecated GPT-4 and GPT-4o family models, keeping only current GPT-5 and GPT-4.1 families.

- Add image_example_preview component with static/vocab.jpg preview
- Add gr.Examples with vocab.jpg example input
- Remove deprecated GPT-5 variants (chat-latest, codex, pro, search-api)
- Remove GPT-4o family models (gpt-4o, gpt-4o-mini, chatgpt-4o-latest) and legacy GPT-4/GPT-4 Turbo variants

feat: add execution logs display to UI with timestamp, model, duration, and cost tracking

Add logs output textbox to Gradio interface showing accumulated model execution history. Store each model run in global _MODEL_LOGS list with timestamp, model name, duration, and cost. Update process_document() to return logs as third output.

- Add _MODEL_LOGS global list to logging_helper.py for storing execution history
- Add get_latest_model_log() function to retrieve accumulated logs
- Store CSV-formatted log lines (timestamp, model name, duration, cost) in _MODEL_LOGS

removed vocab.jpg from index

added vocab.jpg into index

removed vocab.jpg from index

Files changed (7) hide show
  1. .gitignore +2 -0
  2. app.py +24 -27
  3. common.py +5 -31
  4. gemini_backend.py +61 -0
  5. logging_helper.py +67 -12
  6. manual_vocab.csv +38 -0
  7. requirements.txt +1 -1
.gitignore CHANGED
@@ -1,2 +1,4 @@
1
  .env
2
  *.pyc
 
 
 
1
  .env
2
  *.pyc
3
+ .gradio/
4
+ __pycache__/
app.py CHANGED
@@ -15,14 +15,10 @@ from olmocr.data.renderpdf import render_pdf_to_base64png
15
 
16
  from openai_backend import _run_openai_vision
17
  from common import MODELS_MAP, MODEL_GEMINI, MODEL_OLMOCR
18
- from logging_helper import log as _log, log_debug as _log_debug
 
19
  from olm_ocr import _run_olmocr
20
 
21
- try:
22
- import google.generativeai as genai
23
- except ImportError: # pragma: no cover
24
- genai = None # type: ignore
25
-
26
  APP_TITLE = "words2doc"
27
  APP_DESCRIPTION = "Upload a PDF or image with (handwritten) text and convert it to CSV using different LLM backends."
28
 
@@ -80,29 +76,12 @@ def _encode_image(image_path):
80
  return base64.b64encode(image_file.read()).decode("utf-8")
81
 
82
 
83
- def _run_gemini_vision(image: Image.Image, prompt: str) -> str:
84
- if genai is None:
85
- raise RuntimeError("google-generativeai package is not installed. Please install it to use Gemini backend.")
86
-
87
- api_key = os.getenv("GEMINI_API_KEY")
88
- if not api_key:
89
- raise RuntimeError("GEMINI_API_KEY environment variable is not set.")
90
-
91
- genai.configure(api_key=api_key)
92
- model_name = os.getenv("WORDS2DOC_GEMINI_MODEL", "gemini-1.5-flash")
93
- model = genai.GenerativeModel(model_name)
94
-
95
- # Gemini expects a PIL Image directly
96
- response = model.generate_content([prompt, image])
97
- return response.text or ""
98
-
99
-
100
  # -------- Main processing function -------- #
101
 
102
 
103
  def process_document(file_obj, model_choice: str, prompt: str):
104
  if file_obj is None:
105
- return "No file uploaded.", None
106
 
107
  file_path = getattr(file_obj, "name", None) or file_obj
108
  image = _image_from_any_file(file_path)
@@ -122,14 +101,17 @@ def process_document(file_obj, model_choice: str, prompt: str):
122
  if MODELS_MAP[model_choice]["backend"] == "openai":
123
  csv_text = _run_openai_vision(image, prompt, model_choice)
124
  elif MODELS_MAP[model_choice]["backend"] == "gemini":
125
- csv_text = _run_gemini_vision(image, prompt)
126
  elif MODELS_MAP[model_choice]["backend"] == "olmocr":
127
  csv_text = _run_olmocr(image, prompt)
128
  else:
129
  csv_text = f"Unknown model choice: {model_choice}"
130
 
131
  csv_file_path = _write_csv_to_temp_file(csv_text)
132
- return csv_text, csv_file_path
 
 
 
133
 
134
  # -------- Gradio UI -------- #
135
 
@@ -145,6 +127,16 @@ def build_interface() -> gr.Blocks:
145
  label="Upload PDF or image",
146
  file_types=[".pdf", ".png", ".jpg", ".jpeg", ".webp"],
147
  )
 
 
 
 
 
 
 
 
 
 
148
 
149
  model_selector = gr.Dropdown(
150
  label="LLM backend",
@@ -194,11 +186,16 @@ def build_interface() -> gr.Blocks:
194
  buttons=["copy"],
195
  )
196
  csv_file = gr.File(label="Download CSV file", interactive=False)
 
 
 
 
 
197
 
198
  run_button.click(
199
  fn=process_document,
200
  inputs=[file_input, model_selector, prompt_editor],
201
- outputs=[csv_output, csv_file],
202
  )
203
 
204
  return demo
 
15
 
16
  from openai_backend import _run_openai_vision
17
  from common import MODELS_MAP, MODEL_GEMINI, MODEL_OLMOCR
18
+ from gemini_backend import _run_gemini_vision
19
+ from logging_helper import log as _log, log_debug as _log_debug, get_latest_model_log as _get_latest_model_log
20
  from olm_ocr import _run_olmocr
21
 
 
 
 
 
 
22
  APP_TITLE = "words2doc"
23
  APP_DESCRIPTION = "Upload a PDF or image with (handwritten) text and convert it to CSV using different LLM backends."
24
 
 
76
  return base64.b64encode(image_file.read()).decode("utf-8")
77
 
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  # -------- Main processing function -------- #
80
 
81
 
82
  def process_document(file_obj, model_choice: str, prompt: str):
83
  if file_obj is None:
84
+ return "No file uploaded.", None, ""
85
 
86
  file_path = getattr(file_obj, "name", None) or file_obj
87
  image = _image_from_any_file(file_path)
 
101
  if MODELS_MAP[model_choice]["backend"] == "openai":
102
  csv_text = _run_openai_vision(image, prompt, model_choice)
103
  elif MODELS_MAP[model_choice]["backend"] == "gemini":
104
+ csv_text = _run_gemini_vision(image, prompt, model_choice)
105
  elif MODELS_MAP[model_choice]["backend"] == "olmocr":
106
  csv_text = _run_olmocr(image, prompt)
107
  else:
108
  csv_text = f"Unknown model choice: {model_choice}"
109
 
110
  csv_file_path = _write_csv_to_temp_file(csv_text)
111
+
112
+ latest_log = _get_latest_model_log() or ""
113
+
114
+ return csv_text, csv_file_path, latest_log
115
 
116
  # -------- Gradio UI -------- #
117
 
 
127
  label="Upload PDF or image",
128
  file_types=[".pdf", ".png", ".jpg", ".jpeg", ".webp"],
129
  )
130
+ image_example_preview = gr.Image(
131
+ label="Example image preview",
132
+ value="static/vocab.jpg", # adjust to real relative path
133
+ interactive=False,
134
+ )
135
+ gr.Examples(
136
+ examples=[["vocab.jpg", "vocab.jpg"]],
137
+ inputs=[file_input, image_example_preview],
138
+ label="Example image",
139
+ )
140
 
141
  model_selector = gr.Dropdown(
142
  label="LLM backend",
 
186
  buttons=["copy"],
187
  )
188
  csv_file = gr.File(label="Download CSV file", interactive=False)
189
+ with gr.Row():
190
+ logs_output = gr.Textbox(
191
+ label="Logs",
192
+ lines=4,
193
+ )
194
 
195
  run_button.click(
196
  fn=process_document,
197
  inputs=[file_input, model_selector, prompt_editor],
198
+ outputs=[csv_output, csv_file, logs_output],
199
  )
200
 
201
  return demo
common.py CHANGED
@@ -1,4 +1,4 @@
1
- MODEL_GEMINI = "Gemini 3 Pro"
2
  MODEL_OLMOCR = "olmOCR-2-7B-1025-FP8"
3
 
4
 
@@ -13,41 +13,15 @@ MODELS_MAP = {
13
  "gpt-5": {"input": 1.25, "output": 10.00, "backend": "openai"},
14
  "gpt-5-mini": {"input": 0.25, "output": 2.00, "backend": "openai"},
15
  "gpt-5-nano": {"input": 0.05, "output": 0.40, "backend": "openai"},
16
- "gpt-5.1-chat-latest": {"input": 1.25, "output": 10.00, "backend": "openai"},
17
- "gpt-5-chat-latest": {"input": 1.25, "output": 10.00, "backend": "openai"},
18
- "gpt-5.1-codex-max": {"input": 1.25, "output": 10.00, "backend": "openai"},
19
- "gpt-5.1-codex": {"input": 1.25, "output": 10.00, "backend": "openai"},
20
- "gpt-5-codex": {"input": 1.25, "output": 10.00, "backend": "openai"},
21
- "gpt-5.1-codex-mini": {"input": 0.25, "output": 2.00, "backend": "openai"},
22
- "gpt-5-pro": {"input": 15.00, "output": 120.00, "backend": "openai"},
23
- "gpt-5-search-api": {"input": 1.25, "output": 10.00, "backend": "openai"},
24
 
25
  # GPT-4.1 family
26
  "gpt-4.1": {"input": 2.00, "output": 8.00, "backend": "openai"},
27
  "gpt-4.1-mini": {"input": 0.40, "output": 1.60, "backend": "openai"},
28
  "gpt-4.1-nano": {"input": 0.10, "output": 0.40, "backend": "openai"},
29
-
30
- # GPT-4o family
31
- "gpt-4o": {"input": 2.50, "output": 10.00, "backend": "openai"},
32
- "gpt-4o-2024-05-13": {"input": 5.00, "output": 15.00, "backend": "openai"},
33
- "gpt-4o-mini": {"input": 0.15, "output": 0.60, "backend": "openai"},
34
- "chatgpt-4o-latest": {"input": 5.00, "output": 15.00, "backend": "openai"},
35
-
36
- # GPT-4 Turbo / GPT-4 legacy family (from legacy models table)
37
- "gpt-4-turbo": {"input": 10.00, "output": 30.00, "backend": "openai"},
38
- "gpt-4-turbo-2024-04-09": {"input": 10.00, "output": 30.00, "backend": "openai"},
39
- "gpt-4-0125-preview": {"input": 10.00, "output": 30.00, "backend": "openai"},
40
- "gpt-4-1106-preview": {"input": 10.00, "output": 30.00, "backend": "openai"},
41
- "gpt-4-1106-vision-preview": {"input": 10.00, "output": 30.00, "backend": "openai"},
42
- "gpt-4-0613": {"input": 30.00, "output": 60.00, "backend": "openai"},
43
- "gpt-4-0314": {"input": 30.00, "output": 60.00, "backend": "openai"},
44
- "gpt-4": {"input": 30.00, "output": 60.00, "backend": "openai"},
45
- "gpt-4-32k": {"input": 60.00, "output": 120.00, "backend": "openai"},
46
-
47
- # Default
48
- "default": {"input": 2.50, "output": 10.00, "backend": "openai"},
49
-
50
  # Other backends (mock rates)
51
- MODEL_GEMINI: {"input": 1.00, "output": 1.00, "backend": "gemini"},
 
52
  MODEL_OLMOCR: {"input": 1.35, "output": 0.30, "backend": "olmocr"},
 
 
53
  }
 
1
+ MODEL_GEMINI = "gemini-2.5-flash"
2
  MODEL_OLMOCR = "olmOCR-2-7B-1025-FP8"
3
 
4
 
 
13
  "gpt-5": {"input": 1.25, "output": 10.00, "backend": "openai"},
14
  "gpt-5-mini": {"input": 0.25, "output": 2.00, "backend": "openai"},
15
  "gpt-5-nano": {"input": 0.05, "output": 0.40, "backend": "openai"},
 
 
 
 
 
 
 
 
16
 
17
  # GPT-4.1 family
18
  "gpt-4.1": {"input": 2.00, "output": 8.00, "backend": "openai"},
19
  "gpt-4.1-mini": {"input": 0.40, "output": 1.60, "backend": "openai"},
20
  "gpt-4.1-nano": {"input": 0.10, "output": 0.40, "backend": "openai"},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  # Other backends (mock rates)
22
+ MODEL_GEMINI: {"input": 0.30, "output": 2.50, "backend": "gemini"},
23
+ "gemini-3-pro-preview": {"input": 2.00, "output": 12.00, "backend": "gemini"},
24
  MODEL_OLMOCR: {"input": 1.35, "output": 0.30, "backend": "olmocr"},
25
+ "gemini-2.5-pro": {"input": 1.25, "output": 10.00, "backend": "gemini"},
26
+ "default": {"input": 2.50, "output": 10.00, "backend": "openai"},
27
  }
gemini_backend.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ from typing import TYPE_CHECKING
4
+ from PIL import Image
5
+ from logging_helper import log as _log, log_debug as _log_debug, _log_model_response
6
+ from common import MODELS_MAP
7
+
8
+ try:
9
+ # New google-genai client library
10
+ from google import genai
11
+ except ImportError: # pragma: no cover
12
+ genai = None # type: ignore
13
+
14
+ if TYPE_CHECKING: # pragma: no cover
15
+ # Kept for type-checkers; Image is also imported at runtime above
16
+ from PIL import Image as _ImageType
17
+
18
+
19
+ def _run_gemini_vision(image: Image.Image, prompt: str, model_choice: str) -> str:
20
+ if genai is None:
21
+ raise RuntimeError("google-genai package is not installed. Please install it to use Gemini backend.")
22
+
23
+ api_key = os.getenv("GEMINI_API_KEY")
24
+ if not api_key:
25
+ raise RuntimeError("GEMINI_API_KEY environment variable is not set.")
26
+
27
+ model_name = model_choice
28
+
29
+ # Instantiate the google-genai client and call the model
30
+ client = genai.Client(api_key=api_key)
31
+
32
+ _log_debug(f"Using Gemini model: {model_name}")
33
+ _log_debug(f"Input image size: {image.size}")
34
+
35
+ start_time = time.perf_counter()
36
+
37
+ # google-genai accepts mixed text and image content
38
+ response = client.models.generate_content(
39
+ model=model_name,
40
+ contents=[prompt, image],
41
+ )
42
+
43
+ duration = time.perf_counter() - start_time
44
+ print(f"Response: {response}")
45
+ print(f"Response text: {getattr(response, 'text', 'No text attribute')}")
46
+
47
+ content = response.text or ""
48
+
49
+ usage = getattr(response, "usage_metadata", None)
50
+ if usage is None:
51
+ usage = getattr(response, "usage", None)
52
+
53
+ _log_model_response(
54
+ model_name=model_name,
55
+ content=content,
56
+ duration=duration,
57
+ usage=usage,
58
+ pricing=MODELS_MAP,
59
+ )
60
+
61
+ return content
logging_helper.py CHANGED
@@ -1,5 +1,9 @@
1
  import os
2
- from typing import Any, Mapping, Optional
 
 
 
 
3
 
4
 
5
  def log(message: str) -> None:
@@ -24,28 +28,79 @@ def _log_model_response(
24
 
25
  Returns the calculated cost if pricing and usage are provided, otherwise None.
26
  """
 
 
27
  cost: Optional[float] = None
28
 
29
  if usage is not None and pricing is not None:
30
- pricing_row = pricing.get(model_name, pricing[default_pricing_key])
31
- input_cost = (usage.prompt_tokens / 1_000_000) * pricing_row["input"]
32
- output_cost = (usage.completion_tokens / 1_000_000) * pricing_row["output"]
33
- cost = input_cost + output_cost
34
 
35
- log(f"Model: {model_name}")
36
- log(
37
- "Token usage: Input={usage.prompt_tokens}, "
38
- "Output={usage.completion_tokens}, Total={usage.total_tokens}".format(usage=usage)
39
- )
40
- log(f"Estimated cost: ${cost:.6f}")
41
- log(f"Execution time: {duration:.3f} seconds")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  else:
43
  log(f"Model: {model_name}")
44
  log(f"Execution time: {duration:.3f} seconds")
45
 
46
  log("Model response received")
 
 
 
 
 
 
 
47
  log_debug(f"Response length: {len(content)} characters")
48
  log_debug(f"Result: {content}")
49
  log_debug("End of result")
50
 
51
  return cost
 
 
 
 
 
 
 
 
1
  import os
2
+ from typing import Any, List, Mapping, Optional
3
+ from datetime import datetime
4
+
5
+
6
+ _MODEL_LOGS: List[str] = []
7
 
8
 
9
  def log(message: str) -> None:
 
28
 
29
  Returns the calculated cost if pricing and usage are provided, otherwise None.
30
  """
31
+ global _MODEL_LOGS
32
+
33
  cost: Optional[float] = None
34
 
35
  if usage is not None and pricing is not None:
36
+ # Normalize different provider usage formats into a common shape.
37
+ prompt_tokens: Optional[float] = None
38
+ completion_tokens: Optional[float] = None
39
+ total_tokens: Optional[float] = None
40
 
41
+ # Attribute-style access (e.g. OpenAI usage)
42
+ if hasattr(usage, "prompt_tokens") and hasattr(usage, "completion_tokens"):
43
+ prompt_tokens = float(getattr(usage, "prompt_tokens"))
44
+ completion_tokens = float(getattr(usage, "completion_tokens"))
45
+ total_tokens = float(getattr(usage, "total_tokens", prompt_tokens + completion_tokens))
46
+
47
+ # Gemini GenerateContentResponseUsageMetadata-style
48
+ elif hasattr(usage, "prompt_token_count") and hasattr(usage, "candidates_token_count"):
49
+ prompt_tokens = float(getattr(usage, "prompt_token_count"))
50
+ completion_tokens = float(getattr(usage, "candidates_token_count"))
51
+ total_tokens = float(getattr(usage, "total_token_count", prompt_tokens + completion_tokens))
52
+
53
+ # dict-style access fallback
54
+ elif isinstance(usage, Mapping):
55
+ if {"prompt_tokens", "completion_tokens"}.issubset(usage.keys()):
56
+ prompt_tokens = float(usage["prompt_tokens"])
57
+ completion_tokens = float(usage["completion_tokens"])
58
+ total_tokens = float(usage.get("total_tokens", prompt_tokens + completion_tokens))
59
+ elif {"prompt_token_count", "candidates_token_count"}.issubset(usage.keys()):
60
+ prompt_tokens = float(usage["prompt_token_count"])
61
+ completion_tokens = float(usage["candidates_token_count"])
62
+ total_tokens = float(usage.get("total_token_count", prompt_tokens + completion_tokens))
63
+
64
+ if prompt_tokens is not None and completion_tokens is not None and total_tokens is not None:
65
+ pricing_row = pricing.get(model_name, pricing[default_pricing_key])
66
+ input_cost = (prompt_tokens / 1_000_000) * pricing_row["input"]
67
+ output_cost = (completion_tokens / 1_000_000) * pricing_row["output"]
68
+ cost = input_cost + output_cost
69
+
70
+ log(f"Model: {model_name}")
71
+ log(
72
+ "Token usage: Input={prompt}, Output={completion}, Total={total}".format(
73
+ prompt=int(prompt_tokens), completion=int(completion_tokens), total=int(total_tokens)
74
+ )
75
+ )
76
+ log(f"Estimated cost: ${cost:.6f}")
77
+ log(f"Execution time: {duration:.3f} seconds")
78
+ else:
79
+ # Usage provided but in an unknown format – still log basic info.
80
+ log(f"Model: {model_name}")
81
+ log(f"Execution time: {duration:.3f} seconds")
82
+ log_debug(f"Unrecognized usage format: {usage!r}")
83
  else:
84
  log(f"Model: {model_name}")
85
  log(f"Execution time: {duration:.3f} seconds")
86
 
87
  log("Model response received")
88
+
89
+ # Store latest model log in a simple CSV-like line: timestamp, model name, duration, cost
90
+ timestamp = datetime.now().isoformat(timespec="seconds")
91
+ duration_s = float(duration)
92
+ cost_value = float(cost) if cost is not None else 0.0
93
+ line = f"{timestamp}, {model_name}, {duration_s:.3f} seconds, ${cost_value:.6f}"
94
+ _MODEL_LOGS.append(line)
95
  log_debug(f"Response length: {len(content)} characters")
96
  log_debug(f"Result: {content}")
97
  log_debug("End of result")
98
 
99
  return cost
100
+
101
+
102
+ def get_latest_model_log() -> Optional[str]:
103
+ """Return full accumulated model logs as a single string, or None if empty."""
104
+ if not _MODEL_LOGS:
105
+ return None
106
+ return "\n".join(_MODEL_LOGS)
manual_vocab.csv ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 22.08.2023,
2
+ links, left
3
+ rechts, right
4
+ boven, above
5
+ beneden, downstairs
6
+ allerlei, all kinds of stuff
7
+ de gang, коридор
8
+ dingen/spullen, stuff
9
+ de tuin,
10
+ gewoon, simple
11
+ Д/з урок p71,
12
+ 5.09.2023,
13
+ op zaterdag ben ik naar .. gekomen,
14
+ wandelen, to walk
15
+ verkocht, sold
16
+ gekocht, bought
17
+ tijd manier plaats,
18
+ waarvoor, for what
19
+ bezoeken, to visit
20
+ dingen,
21
+ bijvoorbeeld, for example
22
+ er is een verschil, there is a difference
23
+ tuin, garden
24
+ Д/з 3 72 диктант под диктовку,
25
+ 5. 75.
26
+ de afwas doen, do the dishes
27
+ Ik lunch tussen de middag, I have a lunch at noon
28
+ afdrogen, dry
29
+ Ik droog af, I dry off
30
+ tanden poetsen, toothbrushing
31
+ Ik douche 's ochtends en 's avonds,
32
+ Ik kleed me aan, Я одеваюсь
33
+ al, already
34
+ vaak, often
35
+ meestal, usually
36
+ nooit, never
37
+ zelden, seldom редко
38
+ altijd, always
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  gradio>=6.1.0
2
  openai>=1.40.0
3
- google-generativeai>=0.7.0
4
  olmocr>=0.4.0
5
  torch>=2.2.0
6
  transformers>=4.42.0
 
1
  gradio>=6.1.0
2
  openai>=1.40.0
3
+ google-genai>=0.1.0
4
  olmocr>=0.4.0
5
  torch>=2.2.0
6
  transformers>=4.42.0