#!/usr/bin/env python3
"""
Summarize a transcript using a GGUF model (e.g. ERNIE-4.5-21B-A3B-PT-GGUF)
served by llama-cpp-python, optionally with SYCL acceleration.
"""

import argparse
import os
import re
from typing import Tuple

from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from opencc import OpenCC


def load_model(repo_id, filename, cpu_only=False):
    """Load a GGUF model from the Hugging Face Hub.

    Args:
        repo_id: Hub repository id, e.g. "unsloth/Qwen3-0.6B-GGUF".
        filename: GGUF filename or glob pattern, e.g. "*Q4_0.gguf".
        cpu_only: If True, keep all layers on the CPU instead of offloading.

    Returns:
        A ready-to-use ``Llama`` instance.
    """
    llm = Llama.from_pretrained(
        repo_id=repo_id,
        filename=filename,
        n_gpu_layers=0 if cpu_only else -1,  # 0 = CPU only, -1 = offload all layers
        seed=1337,
        n_ctx=32768,  # context window size
        verbose=True,
        # Quantized KV cache (2 corresponds to GGML_TYPE_Q4_0).
        # BUG FIX: the original passed ``v_type``/``k_type``, which are not
        # llama-cpp-python parameters (unknown kwargs are silently ignored);
        # the real names are ``type_k``/``type_v``.
        type_k=2,
        type_v=2,
    )
    return llm


def read_transcript(file_path):
    """Return the full UTF-8 text content of the transcript at *file_path*."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()


def parse_thinking_blocks(content: str) -> Tuple[str, str]:
    """
    Split model output into its hidden "thinking" text and the visible summary.

    Supports both <think>...</think> and <thinking>...</thinking> tags.

    Args:
        content: Full model response, possibly containing thinking blocks.

    Returns:
        Tuple of (thinking_content, summary_content):
        - thinking_content: all text found inside thinking tags, blocks joined
          by a blank line ("" when no tags are present).
        - summary_content: all text outside thinking blocks (the unmodified
          input when no tags are present).
    """
    # BUG FIX: the previous pattern was just r'(.*?)' — the tag literals had
    # been lost, so it only ever matched empty strings and never extracted
    # any thinking content. Match <think>/<thinking> blocks explicitly.
    pattern = r'<think(?:ing)?>(.*?)</think(?:ing)?>'
    matches = re.findall(pattern, content, re.DOTALL)
    if not matches:
        # No thinking blocks found — the entire content is the summary.
        return ("", content)

    thinking = '\n\n'.join(match.strip() for match in matches)
    summary = re.sub(pattern, '', content, flags=re.DOTALL).strip()
    return (thinking, summary)


def stream_summarize_transcript(llm, transcript, output_language="en"):
    """
    Stream a summary of *transcript* from the model, echoing tokens live.

    Args:
        llm: The loaded language model.
        transcript: The full transcript to summarize.
        output_language: Target language for the summary ("en" or "zh-TW").

    Returns:
        The accumulated summary text, stripped of surrounding whitespace
        (converted to Traditional Chinese when output_language is "zh-TW").
    """
    # Simplified -> Traditional Chinese (Taiwan standard with phrase conversion)
    cc = OpenCC('s2twp')

    # Build the chat prompt in the requested output language.
    if output_language == "zh-TW":
        system_msg = "你是一個有助的助手,負責總結轉錄內容。"
        user_msg = f"請總結以下內容:\n\n{transcript}"
    else:
        system_msg = "You are a helpful assistant that summarizes transcripts."
        user_msg = f"Please summarize the following content:\n\n{transcript}"

    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg}
    ]

    lang_display = "zh-TW" if output_language == "zh-TW" else "English"
    print(f"\nStreaming {lang_display} summary:")
    print("=" * 50)

    full_response = ""
    stream = llm.create_chat_completion(
        messages=messages,
        max_tokens=1024,
        temperature=0.6,
        min_p=0.0,
        top_p=0.95,
        top_k=20,
        stop=["<|end_of_text|>", "<|eot_id|>", "<|eom_id|>"],
        stream=True
    )

    for chunk in stream:
        if 'choices' in chunk and len(chunk['choices']) > 0:
            delta = chunk['choices'][0].get('delta', {})
            content = delta.get('content', '')
            if content:
                if output_language == "zh-TW":
                    # Convert each streamed piece before echoing/accumulating
                    # so the printed text and the returned text agree.
                    converted_content = cc.convert(content)
                    print(converted_content, end='', flush=True)
                    full_response += converted_content
                else:
                    print(content, end='', flush=True)
                    full_response += content

    print("\n" + "=" * 50)
    # Reset the model state to ensure a clean state for the next call.
    llm.reset()
    return full_response.strip()


def main():
    """CLI entry point: load the model, stream a summary, save the outputs."""
    parser = argparse.ArgumentParser(
        description="Summarize transcript in zh-TW using a GGUF model.")
    parser.add_argument("-i", "--input", type=str,
                        default="./transcripts/short.txt",
                        help="Path to the input transcript file "
                             "(default: ./transcripts/short.txt)")
    parser.add_argument("-m", "--model", type=str,
                        default="unsloth/Qwen3-0.6B-GGUF:Q4_0",
                        help="HuggingFace model in format repo_id:quant "
                             "(e.g. unsloth/Qwen3-1.7B-GGUF:Q2_K_L)")
    parser.add_argument("-c", "--cpu", action="store_true",
                        help="Force CPU only inference")
    parser.add_argument("-l", "--language", type=str,
                        choices=["en", "zh-TW"], default="en",
                        help="Output language (default: en)")
    args = parser.parse_args()

    # Parse the "repo_id:quant" model spec into a repo id and a GGUF glob.
    if ":" in args.model:
        repo_id, quant = args.model.rsplit(":", 1)
        filename = f"*{quant}.gguf"
    else:
        print(f"Error: Invalid model format '{args.model}'. Expected format: repo_id:quant")
        return

    # ROBUSTNESS: validate the input path up front, before the (potentially
    # slow) model download/load, so a typo fails fast.
    transcript_path = args.input
    if not os.path.exists(transcript_path):
        print(f"Error: Input file '{transcript_path}' not found.")
        return

    # BUG FIX: the original printed a literal "(unknown)" placeholder here
    # instead of the resolved GGUF filename pattern.
    print(f"Loading model: {repo_id} ({filename}) with "
          f"{'CPU only' if args.cpu else 'SYCL acceleration'}...")
    llm = load_model(repo_id, filename, cpu_only=args.cpu)

    transcript = read_transcript(transcript_path)
    print("\nOriginal Transcript (Preview):")
    print(transcript[:500] + "..." if len(transcript) > 500 else transcript)

    # Summarize with live token streaming.
    summary = stream_summarize_transcript(
        llm, transcript, output_language=args.language)

    # Separate any <think>/<thinking> content from the visible summary.
    thinking_content, summary_content = parse_thinking_blocks(summary)

    # Write thinking content only when the model actually produced some.
    if thinking_content:
        with open("thinking.txt", 'w', encoding='utf-8') as f:
            f.write(thinking_content)
        print(f"\n[Thinking content saved to thinking.txt ({len(thinking_content)} chars)]")

    with open("summary.txt", 'w', encoding='utf-8') as f:
        f.write(summary_content)
    print(f"[Summary saved to summary.txt ({len(summary_content)} chars)]")

    # Release the model explicitly.
    del llm


if __name__ == "__main__":
    main()