#!/usr/bin/env python3
"""
Summarize a transcript using a GGUF model (e.g. ERNIE-4.5-21B-A3B-PT-GGUF)
served by llama-cpp-python, optionally with SYCL acceleration.
"""

import argparse
import os
import re
from typing import Tuple

from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from opencc import OpenCC


def load_model(repo_id, filename, cpu_only=False):
    """Load a GGUF model from the Hugging Face Hub.

    Args:
        repo_id: Hub repository id, e.g. "unsloth/Qwen3-0.6B-GGUF".
        filename: GGUF filename or glob pattern, e.g. "*Q4_0.gguf".
        cpu_only: If True, keep all layers on the CPU instead of offloading.

    Returns:
        A ready-to-use ``Llama`` instance.
    """
    llm = Llama.from_pretrained(
        repo_id=repo_id,
        filename=filename,
        n_gpu_layers=0 if cpu_only else -1,  # 0 = CPU only, -1 = offload all layers
        seed=1337,
        n_ctx=32768,  # context window size
        verbose=True,
        # Quantized KV cache (2 corresponds to GGML_TYPE_Q4_0).
        # BUG FIX: the original passed ``v_type``/``k_type``, which are not
        # llama-cpp-python parameters (unknown kwargs are silently ignored);
        # the real names are ``type_k``/``type_v``.
        type_k=2,
        type_v=2,
    )
    return llm


def read_transcript(file_path):
    """Return the full UTF-8 text content of the transcript at *file_path*."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()


def parse_thinking_blocks(content: str) -> Tuple[str, str]:
    """
    Split model output into its hidden "thinking" text and the visible summary.

    Supports both <think>...</think> and <thinking>...</thinking> tags.

    Args:
        content: Full model response, possibly containing thinking blocks.

    Returns:
        Tuple of (thinking_content, summary_content):
        - thinking_content: all text found inside thinking tags, blocks joined
          by a blank line ("" when no tags are present).
        - summary_content: all text outside thinking blocks (the unmodified
          input when no tags are present).
    """
    # BUG FIX: the previous pattern was just r'(.*?)' — the tag literals had
    # been lost, so it only ever matched empty strings and never extracted
    # any thinking content. Match <think>/<thinking> blocks explicitly.
    pattern = r'<think(?:ing)?>(.*?)</think(?:ing)?>'
    matches = re.findall(pattern, content, re.DOTALL)
    if not matches:
        # No thinking blocks found — the entire content is the summary.
        return ("", content)

    thinking = '\n\n'.join(match.strip() for match in matches)
    summary = re.sub(pattern, '', content, flags=re.DOTALL).strip()
    return (thinking, summary)


def stream_summarize_transcript(llm, transcript, output_language="en"):
    """
    Stream a summary of *transcript* from the model, echoing tokens live.

    Args:
        llm: The loaded language model.
        transcript: The full transcript to summarize.
        output_language: Target language for the summary ("en" or "zh-TW").

    Returns:
        The accumulated summary text, stripped of surrounding whitespace
        (converted to Traditional Chinese when output_language is "zh-TW").
    """
    # Simplified -> Traditional Chinese (Taiwan standard with phrase conversion)
    cc = OpenCC('s2twp')

    # Build the chat prompt in the requested output language.
    if output_language == "zh-TW":
        system_msg = "你是一個有助的助手,負責總結轉錄內容。"
        user_msg = f"請總結以下內容:\n\n{transcript}"
    else:
        system_msg = "You are a helpful assistant that summarizes transcripts."
        user_msg = f"Please summarize the following content:\n\n{transcript}"

    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg}
    ]

    lang_display = "zh-TW" if output_language == "zh-TW" else "English"
    print(f"\nStreaming {lang_display} summary:")
    print("=" * 50)

    full_response = ""
    stream = llm.create_chat_completion(
        messages=messages,
        max_tokens=1024,
        temperature=0.6,
        min_p=0.0,
        top_p=0.95,
        top_k=20,
        stop=["<|end_of_text|>", "<|eot_id|>", "<|eom_id|>"],
        stream=True
    )

    for chunk in stream:
        if 'choices' in chunk and len(chunk['choices']) > 0:
            delta = chunk['choices'][0].get('delta', {})
            content = delta.get('content', '')
            if content:
                if output_language == "zh-TW":
                    # Convert each streamed piece before echoing/accumulating
                    # so the printed text and the returned text agree.
                    converted_content = cc.convert(content)
                    print(converted_content, end='', flush=True)
                    full_response += converted_content
                else:
                    print(content, end='', flush=True)
                    full_response += content

    print("\n" + "=" * 50)
    # Reset the model state to ensure a clean state for the next call.
    llm.reset()
    return full_response.strip()


def main():
    """CLI entry point: load the model, stream a summary, save the outputs."""
    parser = argparse.ArgumentParser(
        description="Summarize transcript in zh-TW using a GGUF model.")
    parser.add_argument("-i", "--input", type=str,
                        default="./transcripts/short.txt",
                        help="Path to the input transcript file "
                             "(default: ./transcripts/short.txt)")
    parser.add_argument("-m", "--model", type=str,
                        default="unsloth/Qwen3-0.6B-GGUF:Q4_0",
                        help="HuggingFace model in format repo_id:quant "
                             "(e.g. unsloth/Qwen3-1.7B-GGUF:Q2_K_L)")
    parser.add_argument("-c", "--cpu", action="store_true",
                        help="Force CPU only inference")
    parser.add_argument("-l", "--language", type=str,
                        choices=["en", "zh-TW"], default="en",
                        help="Output language (default: en)")
    args = parser.parse_args()

    # Parse the "repo_id:quant" model spec into a repo id and a GGUF glob.
    if ":" in args.model:
        repo_id, quant = args.model.rsplit(":", 1)
        filename = f"*{quant}.gguf"
    else:
        print(f"Error: Invalid model format '{args.model}'. Expected format: repo_id:quant")
        return

    # ROBUSTNESS: validate the input path up front, before the (potentially
    # slow) model download/load, so a typo fails fast.
    transcript_path = args.input
    if not os.path.exists(transcript_path):
        print(f"Error: Input file '{transcript_path}' not found.")
        return

    # BUG FIX: the original printed a literal "(unknown)" placeholder here
    # instead of the resolved GGUF filename pattern.
    print(f"Loading model: {repo_id} ({filename}) with "
          f"{'CPU only' if args.cpu else 'SYCL acceleration'}...")
    llm = load_model(repo_id, filename, cpu_only=args.cpu)

    transcript = read_transcript(transcript_path)
    print("\nOriginal Transcript (Preview):")
    print(transcript[:500] + "..." if len(transcript) > 500 else transcript)

    # Summarize with live token streaming.
    summary = stream_summarize_transcript(
        llm, transcript, output_language=args.language)

    # Separate any <think>/<thinking> content from the visible summary.
    thinking_content, summary_content = parse_thinking_blocks(summary)

    # Write thinking content only when the model actually produced some.
    if thinking_content:
        with open("thinking.txt", 'w', encoding='utf-8') as f:
            f.write(thinking_content)
        print(f"\n[Thinking content saved to thinking.txt ({len(thinking_content)} chars)]")

    with open("summary.txt", 'w', encoding='utf-8') as f:
        f.write(summary_content)
    print(f"[Summary saved to summary.txt ({len(summary_content)} chars)]")

    # Release the model explicitly.
    del llm


if __name__ == "__main__":
    main()