#!/usr/bin/env python3
"""
Summarize a transcript with a GGUF chat model (default: unsloth/Qwen3-0.6B-GGUF)
via llama-cpp-python, using SYCL acceleration or CPU-only inference.
"""
import os
import argparse
import re
from typing import Tuple
from llama_cpp import Llama
from opencc import OpenCC

def load_model(repo_id, filename, cpu_only=False):
    """Load a GGUF model from the Hugging Face Hub."""
    # Initialize the model with SYCL support (or CPU only if requested)
    llm = Llama.from_pretrained(
        repo_id=repo_id,
        filename=filename,
        n_gpu_layers=0 if cpu_only else -1,  # 0 = CPU only, -1 = offload all layers to GPU
        seed=1337,
        n_ctx=32768,  # Context window size
        verbose=True,  # Full llama.cpp logging (set False for quieter output)
        type_k=2,  # Quantize the K cache to Q4_0 (GGML type 2) to save memory
        type_v=2,  # Quantize the V cache to Q4_0 (GGML type 2) to save memory
    )
    return llm
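
# Example call (illustrative values matching this script's defaults; any GGUF
# chat model on the Hub should work):
#   llm = load_model("unsloth/Qwen3-0.6B-GGUF", "*Q4_0.gguf", cpu_only=True)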

def read_transcript(file_path):
    """Read the transcript file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    return content

def parse_thinking_blocks(content: str) -> Tuple[str, str]:
    """
    Parse thinking blocks from model output.

    Supports both <think> and <thinking> tags.

    Args:
        content: Full model response containing thinking blocks and summary

    Returns:
        Tuple of (thinking_content, summary_content)
        - thinking_content: All text between <think>/<thinking> tags (or empty string)
        - summary_content: All text outside thinking blocks (or full content if no tags)
    """
    # Match both <think> and <thinking> tags
    pattern = r'<think(?:ing)?>(.*?)</think(?:ing)?>'
    matches = re.findall(pattern, content, re.DOTALL)
    if not matches:
        # No thinking blocks found - return entire content as summary
        return ("", content)
    # Extract all thinking blocks
    thinking = '\n\n'.join(match.strip() for match in matches)
    # Remove thinking blocks from content to get summary
    summary = re.sub(pattern, '', content, flags=re.DOTALL).strip()
    return (thinking, summary)
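
# Example (hypothetical model output):
#   parse_thinking_blocks("<think>outline key points</think>Final summary.")
#   -> ("outline key points", "Final summary.")
#   parse_thinking_blocks("No tags here.")
#   -> ("", "No tags here.")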

def stream_summarize_transcript(llm, transcript, output_language="en"):
    """
    Stream a summary, printing tokens live as the model generates them.

    Args:
        llm: The loaded language model
        transcript: The full transcript to summarize
        output_language: Target language for summary ("en" or "zh-TW")
    """
    cc = OpenCC('s2twp')  # Simplified -> Traditional Chinese (Taiwan standard, with phrase conversion)
    # Build chat messages in the target language; llama-cpp applies the model's chat template
    if output_language == "zh-TW":
        # "You are a helpful assistant that summarizes transcripts." / "Please summarize the following content:"
        system_msg = "你是一個有助的助手,負責總結轉錄內容。"
        user_msg = f"請總結以下內容:\n\n{transcript}"
    else:
        system_msg = "You are a helpful assistant that summarizes transcripts."
        user_msg = f"Please summarize the following content:\n\n{transcript}"
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg}
    ]
    # Generate the summary using a streaming chat completion
    lang_display = "zh-TW" if output_language == "zh-TW" else "English"
    print(f"\nStreaming {lang_display} summary:")
    print("="*50)
    full_response = ""
    stream = llm.create_chat_completion(
        messages=messages,
        max_tokens=1024,
        temperature=0.6,
        min_p=0.0,
        top_p=0.95,
        top_k=20,
        # Llama-3-style terminators; models with other chat templates (e.g. Qwen)
        # stop on their own EOS token, so these extras are harmless
        stop=["<|end_of_text|>", "<|eot_id|>", "<|eom_id|>"],
        stream=True
    )
    for chunk in stream:
        if 'choices' in chunk and len(chunk['choices']) > 0:
            delta = chunk['choices'][0].get('delta', {})
            content = delta.get('content', '')
            if content:
                if output_language == "zh-TW":
                    converted_content = cc.convert(content)
                    print(converted_content, end='', flush=True)
                    full_response += converted_content
                else:
                    print(content, end='', flush=True)
                    full_response += content
    print("\n" + "="*50)
    # Reset the model state so the next call starts clean
    llm.reset()
    return full_response.strip()
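
# For reference, each streamed chunk from llama-cpp-python roughly follows the
# OpenAI delta format (fields beyond "delta" omitted; exact shape may vary by version):
#   {"choices": [{"delta": {"content": "<token text>"}, "index": 0, ...}], ...}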

def main():
    parser = argparse.ArgumentParser(description="Summarize a transcript using a GGUF model (English or zh-TW output).")
    parser.add_argument("-i", "--input", type=str,
                        default="./transcripts/short.txt",
                        help="Path to the input transcript file (default: ./transcripts/short.txt)")
    parser.add_argument("-m", "--model", type=str,
                        default="unsloth/Qwen3-0.6B-GGUF:Q4_0",
                        help="HuggingFace model in the format repo_id:quant (e.g., unsloth/Qwen3-1.7B-GGUF:Q2_K_L)")
    parser.add_argument("-c", "--cpu", action="store_true", help="Force CPU-only inference")
    parser.add_argument("-l", "--language", type=str, choices=["en", "zh-TW"], default="en",
                        help="Output language (default: en)")
    args = parser.parse_args()
    # Split the model argument into repo id and quantization suffix
    if ":" in args.model:
        repo_id, quant = args.model.rsplit(":", 1)
        filename = f"*{quant}.gguf"  # Glob pattern matched against the repo's GGUF files
    else:
        print(f"Error: Invalid model format '{args.model}'. Expected format: repo_id:quant")
        return
| print(f"Loading model: {repo_id} ({filename}) with {'CPU only' if args.cpu else 'SYCL acceleration'}...") | |
| # Load the model | |
| llm = load_model(repo_id, filename, cpu_only=args.cpu) | |
| # Read the transcript | |
| transcript_path = args.input | |
| if not os.path.exists(transcript_path): | |
| print(f"Error: Input file '{transcript_path}' not found.") | |
| return | |
| transcript = read_transcript(transcript_path) | |
| print("\nOriginal Transcript (Preview):") | |
| print(transcript[:500] + "..." if len(transcript) > 500 else transcript) | |
| # Summarize with streaming | |
| summary = stream_summarize_transcript(llm, transcript, output_language=args.language) | |
    # Parse thinking blocks and save each piece to its own file
    thinking_content, summary_content = parse_thinking_blocks(summary)
    # Write thinking content if present
    if thinking_content:
        with open("thinking.txt", 'w', encoding='utf-8') as f:
            f.write(thinking_content)
        print(f"\n[Thinking content saved to thinking.txt ({len(thinking_content)} chars)]")
    # Write summary content
    with open("summary.txt", 'w', encoding='utf-8') as f:
        f.write(summary_content)
    print(f"[Summary saved to summary.txt ({len(summary_content)} chars)]")
    # Clean up
    del llm

if __name__ == "__main__":
    main()
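
# Example invocations (the script filename is illustrative):
#   python summarize.py                                      # defaults: ./transcripts/short.txt, English output
#   python summarize.py -i talk.txt -l zh-TW                 # Traditional Chinese summary
#   python summarize.py -m unsloth/Qwen3-1.7B-GGUF:Q2_K_L --cpu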