#!/usr/bin/env python3
"""
Script to summarize transcript using ERNIE-4.5-21B-A3B-PT-GGUF model with SYCL acceleration.
"""
import os
import argparse
import re
from typing import Tuple

from llama_cpp import Llama
from opencc import OpenCC


def load_model(repo_id, filename, cpu_only=False):
    """Load the model from the Hugging Face Hub."""
    # Initialize the model with SYCL offload (or CPU only if requested)
    llm = Llama.from_pretrained(
        repo_id=repo_id,
        filename=filename,
        n_gpu_layers=0 if cpu_only else -1,  # 0 = CPU only, -1 = offload all layers
        seed=1337,
        n_ctx=32768,   # Context window size
        verbose=True,  # Full llama.cpp logging; set False for quieter output
        # Quantize the KV cache to Q4_0 (ggml type id 2). llama-cpp-python spells
        # these type_k/type_v; note a quantized V cache generally also requires
        # flash_attn=True in recent llama.cpp builds.
        type_k=2,
        type_v=2,
    )
    return llm
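

# Quick smoke test (a sketch using the script's default repo/quant; Llama.n_ctx()
# is the llama-cpp-python accessor for the configured context size):
#   llm = load_model("unsloth/Qwen3-0.6B-GGUF", "*Q4_0.gguf", cpu_only=True)
#   print(llm.n_ctx())  # expected: 32768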
def read_transcript(file_path):
"""Read the transcript file."""
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
return content
def parse_thinking_blocks(content: str) -> Tuple[str, str]:
"""
Parse thinking blocks from model output.
Supports both <think> and <thinking> tags.
Args:
content: Full model response containing thinking blocks and summary
Returns:
Tuple of (thinking_content, summary_content)
- thinking_content: All text between <think>/<thinking> tags (or empty string)
- summary_content: All text outside thinking blocks (or full content if no tags)
"""
# Match both <think> and <thinking> tags
pattern = r'<think(?:ing)?>(.*?)</think(?:ing)?>'
matches = re.findall(pattern, content, re.DOTALL)
if not matches:
# No thinking blocks found - return entire content as summary
return ("", content)
# Extract all thinking blocks
thinking = '\n\n'.join(match.strip() for match in matches)
# Remove thinking blocks from content to get summary
summary = re.sub(pattern, '', content, flags=re.DOTALL).strip()
return (thinking, summary)
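
# Example behavior of parse_thinking_blocks (illustrative inputs):
#   parse_thinking_blocks("<think>plan the summary</think>Final text.")
#     -> ("plan the summary", "Final text.")
#   parse_thinking_blocks("No tags at all.")
#     -> ("", "No tags at all.")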
def stream_summarize_transcript(llm, transcript, output_language="en"):
"""
    Stream a summary from the model, printing tokens in real time as they arrive.
Args:
llm: The loaded language model
transcript: The full transcript to summarize
output_language: Target language for summary ("en" or "zh-TW")
"""
cc = OpenCC('s2twp') # Simplified Chinese to Traditional Chinese (Taiwan standard with phrase conversion)
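    # e.g. cc.convert("软件") -> "軟體": the "p" in s2twp applies Taiwan-standard
    # vocabulary on top of the character conversion (plain s2t would give "軟件").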
# Use the model's chat format based on its template and language
    if output_language == "zh-TW":
        # "You are a helpful assistant that summarizes transcripts." /
        # "Please summarize the following content:"
        system_msg = "你是一個樂於助人的助手，負責總結轉錄內容。"
        user_msg = f"請總結以下內容：\n\n{transcript}"
else:
system_msg = "You are a helpful assistant that summarizes transcripts."
user_msg = f"Please summarize the following content:\n\n{transcript}"
messages = [
{"role": "system", "content": system_msg},
{"role": "user", "content": user_msg}
]
# Generate the summary using streaming completion
lang_display = "zh-TW" if output_language == "zh-TW" else "English"
print(f"\nStreaming {lang_display} summary:")
print("="*50)
full_response = ""
stream = llm.create_chat_completion(
messages=messages,
max_tokens=1024,
temperature=0.6,
min_p=0.0,
top_p=0.95,
top_k=20,
        stop=["<|end_of_text|>", "<|eot_id|>", "<|eom_id|>"],  # Llama-3-style terminators; other chat templates stop on their own EOS token
stream=True
)
for chunk in stream:
if 'choices' in chunk and len(chunk['choices']) > 0:
delta = chunk['choices'][0].get('delta', {})
content = delta.get('content', '')
if content:
if output_language == "zh-TW":
converted_content = cc.convert(content)
print(converted_content, end='', flush=True)
full_response += converted_content
else:
print(content, end='', flush=True)
full_response += content
print("\n" + "="*50)
    # Reset the model so the next call starts from a clean state
llm.reset()
return full_response.strip()
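
# Typical call (llm and transcript as produced in main() below):
#   summary = stream_summarize_transcript(llm, transcript, output_language="zh-TW")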
def main():
    parser = argparse.ArgumentParser(description="Summarize a transcript with a GGUF model, in English or zh-TW.")
parser.add_argument("-i", "--input", type=str,
default="./transcripts/short.txt",
help="Path to the input transcript file (default: ./transcripts/short.txt)")
parser.add_argument("-m", "--model", type=str,
default="unsloth/Qwen3-0.6B-GGUF:Q4_0",
help="HuggingFace model in format repo_id:quant (e.g., unsloth/Qwen3-1.7B-GGUF:Q2_K_L)")
parser.add_argument("-c", "--cpu", action="store_true", help="Force CPU only inference")
parser.add_argument("-l", "--language", type=str, choices=["en", "zh-TW"], default="en",
help="Output language (default: en)")
args = parser.parse_args()
    # Split the model argument into repo_id and quantization suffix
if ":" in args.model:
repo_id, quant = args.model.rsplit(":", 1)
filename = f"*{quant}.gguf"
else:
print(f"Error: Invalid model format '{args.model}'. Expected format: repo_id:quant")
return
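    # e.g. "unsloth/Qwen3-0.6B-GGUF:Q4_0" -> repo_id "unsloth/Qwen3-0.6B-GGUF" and
    # filename glob "*Q4_0.gguf", which Llama.from_pretrained matches against the
    # repo's file list.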
print(f"Loading model: {repo_id} ({filename}) with {'CPU only' if args.cpu else 'SYCL acceleration'}...")
# Load the model
llm = load_model(repo_id, filename, cpu_only=args.cpu)
# Read the transcript
transcript_path = args.input
if not os.path.exists(transcript_path):
print(f"Error: Input file '{transcript_path}' not found.")
return
transcript = read_transcript(transcript_path)
print("\nOriginal Transcript (Preview):")
    print((transcript[:500] + "...") if len(transcript) > 500 else transcript)
# Summarize with streaming
summary = stream_summarize_transcript(llm, transcript, output_language=args.language)
# Save summaries to files
# Parse thinking blocks and separate content
thinking_content, summary_content = parse_thinking_blocks(summary)
# Write thinking content if present
if thinking_content:
with open("thinking.txt", 'w', encoding='utf-8') as f:
f.write(thinking_content)
print(f"\n[Thinking content saved to thinking.txt ({len(thinking_content)} chars)]")
# Write summary content
with open("summary.txt", 'w', encoding='utf-8') as f:
f.write(summary_content)
print(f"[Summary saved to summary.txt ({len(summary_content)} chars)]")
# Clean up
del llm
if __name__ == "__main__":
main()
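
# Example invocations (paths and models are the script's own defaults):
#   python summarize_transcript.py -i ./transcripts/short.txt -l zh-TW
#   python summarize_transcript.py -m unsloth/Qwen3-1.7B-GGUF:Q2_K_L --cpu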