#!/usr/bin/env python3
"""
Script to summarize a transcript using a GGUF model (default: unsloth/Qwen3-0.6B-GGUF)
via llama-cpp-python, with SYCL acceleration when a GPU is available.
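
Example invocation (the file name summarize.py is hypothetical; any GGUF repo
on the Hugging Face Hub in repo_id:quant form should work):

    python summarize.py -i ./transcripts/short.txt \
        -m unsloth/Qwen3-0.6B-GGUF:Q4_0 -l zh-TW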
"""

import os
import argparse
import re
from typing import Tuple
from llama_cpp import Llama
from opencc import OpenCC

def load_model(repo_id, filename, cpu_only=False):
    """Load a GGUF model from the Hugging Face Hub."""

    # Initialize the model with SYCL support (or CPU only if requested)
    llm = Llama.from_pretrained(
        repo_id=repo_id,
        filename=filename,
        n_gpu_layers=0 if cpu_only else -1,  # 0 = CPU only, -1 = offload all layers to GPU
        seed=1337,
        n_ctx=32768,       # Context window size
        verbose=True,      # Print llama.cpp load/runtime diagnostics
        flash_attn=True,   # llama.cpp requires flash attention when the V cache is quantized
        type_k=2,          # Quantize KV-cache keys   (2 = GGML_TYPE_Q4_0)
        type_v=2,          # Quantize KV-cache values (2 = GGML_TYPE_Q4_0)
    )

    return llm

def read_transcript(file_path):
    """Read the transcript file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    return content

def parse_thinking_blocks(content: str) -> Tuple[str, str]:
    """
    Parse thinking blocks from model output.
    Supports both <think> and <thinking> tags.

    Args:
        content: Full model response containing thinking blocks and summary

    Returns:
        Tuple of (thinking_content, summary_content)
        - thinking_content: All text between <think>/<thinking> tags (or empty string)
        - summary_content: All text outside thinking blocks (or full content if no tags)
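
    Example:
        >>> parse_thinking_blocks("<think>plan the summary</think>Final summary.")
        ('plan the summary', 'Final summary.')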
    """
    # Match both <think> and <thinking> tags
    pattern = r'<think(?:ing)?>(.*?)</think(?:ing)?>'
    matches = re.findall(pattern, content, re.DOTALL)

    if not matches:
        # No thinking blocks found - return entire content as summary
        return ("", content)

    # Extract all thinking blocks
    thinking = '\n\n'.join(match.strip() for match in matches)

    # Remove thinking blocks from content to get summary
    summary = re.sub(pattern, '', content, flags=re.DOTALL).strip()

    return (thinking, summary)

def stream_summarize_transcript(llm, transcript, output_language="en"):
    """
    Perform live streaming summary by getting real-time token output from the model.

    Args:
        llm: The loaded language model
        transcript: The full transcript to summarize
        output_language: Target language for summary ("en" or "zh-TW")
    """
    cc = OpenCC('s2twp')  # Simplified Chinese to Traditional Chinese (Taiwan standard with phrase conversion)
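    # s2twp also localizes vocabulary, e.g. cc.convert("软件") -> "軟體"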

    # Use the model's chat format based on its template and language
    if output_language == "zh-TW":
        # "You are a helpful assistant responsible for summarizing transcripts."
        system_msg = "你是一個樂於助人的助手,負責總結轉錄內容。"
        # "Please summarize the following content:"
        user_msg = f"請總結以下內容:\n\n{transcript}"
    else:
        system_msg = "You are a helpful assistant that summarizes transcripts."
        user_msg = f"Please summarize the following content:\n\n{transcript}"
    
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg}
    ]

    # Generate the summary using streaming completion
    lang_display = "zh-TW" if output_language == "zh-TW" else "English"
    print(f"\nStreaming {lang_display} summary:")
    print("="*50)

    full_response = ""

    # Sampling settings follow the Qwen3 recommended thinking-mode defaults
    # (temperature 0.6, top_p 0.95, top_k 20, min_p 0); adjust per model card.
    stream = llm.create_chat_completion(
        messages=messages,
        max_tokens=1024,
        temperature=0.6,
        min_p=0.0,
        top_p=0.95,
        top_k=20,
        # Llama-3-style stop markers plus Qwen's <|im_end|>; the model's own
        # EOS token from the GGUF metadata also terminates generation.
        stop=["<|end_of_text|>", "<|eot_id|>", "<|eom_id|>", "<|im_end|>"],
        stream=True
    )

    for chunk in stream:
        if 'choices' in chunk and len(chunk['choices']) > 0:
            delta = chunk['choices'][0].get('delta', {})
            content = delta.get('content', '')
            if content:
                if output_language == "zh-TW":
                    converted_content = cc.convert(content)
                    print(converted_content, end='', flush=True)
                    full_response += converted_content
                else:
                    print(content, end='', flush=True)
                    full_response += content

    print("\n" + "="*50)

    # Reset the model state to ensure clean state for next call
    llm.reset()

    return full_response.strip()


def main():
    parser = argparse.ArgumentParser(description="Summarize a transcript (English or zh-TW output) using a GGUF model.")
    parser.add_argument("-i", "--input", type=str,
                        default="./transcripts/short.txt",
                        help="Path to the input transcript file (default: ./transcripts/short.txt)")
    parser.add_argument("-m", "--model", type=str,
                        default="unsloth/Qwen3-0.6B-GGUF:Q4_0",
                        help="Hugging Face model in repo_id:quant format (e.g., unsloth/Qwen3-1.7B-GGUF:Q2_K_L)")
    parser.add_argument("-c", "--cpu", action="store_true", help="Force CPU-only inference")
    parser.add_argument("-l", "--language", type=str, choices=["en", "zh-TW"], default="en",
                        help="Output language (default: en)")
    args = parser.parse_args()

    # Split the model argument into repo_id and quantization suffix
    if ":" in args.model:
        repo_id, quant = args.model.rsplit(":", 1)
        filename = f"*{quant}.gguf"
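        # from_pretrained matches this glob against the GGUF files in the repo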
    else:
        print(f"Error: Invalid model format '{args.model}'. Expected format: repo_id:quant")
        return

    print(f"Loading model: {repo_id} ({filename}) with {'CPU only' if args.cpu else 'SYCL acceleration'}...")

    # Load the model
    llm = load_model(repo_id, filename, cpu_only=args.cpu)

    # Read the transcript
    transcript_path = args.input
    if not os.path.exists(transcript_path):
        print(f"Error: Input file '{transcript_path}' not found.")
        return

    transcript = read_transcript(transcript_path)

    print("\nOriginal Transcript (Preview):")
    print((transcript[:500] + "...") if len(transcript) > 500 else transcript)

    # Summarize with streaming
    summary = stream_summarize_transcript(llm, transcript, output_language=args.language)

    # Save summaries to files
    # Parse thinking blocks and separate content
    thinking_content, summary_content = parse_thinking_blocks(summary)

    # Write thinking content if present
    if thinking_content:
        with open("thinking.txt", 'w', encoding='utf-8') as f:
            f.write(thinking_content)
        print(f"\n[Thinking content saved to thinking.txt ({len(thinking_content)} chars)]")

    # Write summary content
    with open("summary.txt", 'w', encoding='utf-8') as f:
        f.write(summary_content)
    print(f"[Summary saved to summary.txt ({len(summary_content)} chars)]")

    # Clean up
    del llm

if __name__ == "__main__":
    main()