#!/usr/bin/env python3
"""
Script to summarize transcript using ERNIE-4.5-21B-A3B-PT-GGUF model with SYCL acceleration.
"""
import os
import argparse
import re
from typing import Tuple

from llama_cpp import Llama
from opencc import OpenCC


def load_model(repo_id, filename, cpu_only=False):
    """Load the model from the Hugging Face Hub."""
    # Initialize the model with SYCL offload (or CPU only if requested)
    llm = Llama.from_pretrained(
        repo_id=repo_id,
        filename=filename,
        n_gpu_layers=0 if cpu_only else -1,  # 0 = CPU only, -1 = offload all layers
        seed=1337,
        n_ctx=32768,   # Context window size
        verbose=True,  # Full llama.cpp logging; set False for quieter output
        # Quantize the KV cache to Q4_0 (ggml type id 2). llama-cpp-python spells
        # these type_k/type_v; note a quantized V cache generally also requires
        # flash_attn=True in recent llama.cpp builds.
        type_k=2,
        type_v=2,
    )
    return llm
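

# Quick smoke test (a sketch using the script's default repo/quant; Llama.n_ctx()
# is the llama-cpp-python accessor for the configured context size):
#   llm = load_model("unsloth/Qwen3-0.6B-GGUF", "*Q4_0.gguf", cpu_only=True)
#   print(llm.n_ctx())  # expected: 32768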
def read_transcript(file_path):
"""Read the transcript file."""
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
return content
def parse_thinking_blocks(content: str) -> Tuple[str, str]:
"""
Parse thinking blocks from model output.
Supports both <think> and <thinking> tags.
Args:
content: Full model response containing thinking blocks and summary
Returns:
Tuple of (thinking_content, summary_content)
- thinking_content: All text between <think>/<thinking> tags (or empty string)
- summary_content: All text outside thinking blocks (or full content if no tags)
"""
# Match both <think> and <thinking> tags
pattern = r'<think(?:ing)?>(.*?)</think(?:ing)?>'
matches = re.findall(pattern, content, re.DOTALL)
if not matches:
# No thinking blocks found - return entire content as summary
return ("", content)
# Extract all thinking blocks
thinking = '\n\n'.join(match.strip() for match in matches)
# Remove thinking blocks from content to get summary
summary = re.sub(pattern, '', content, flags=re.DOTALL).strip()
return (thinking, summary)
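
# Example behavior of parse_thinking_blocks (illustrative inputs):
#   parse_thinking_blocks("<think>plan the summary</think>Final text.")
#     -> ("plan the summary", "Final text.")
#   parse_thinking_blocks("No tags at all.")
#     -> ("", "No tags at all.")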
def stream_summarize_transcript(llm, transcript, output_language="en"):
"""
    Stream a summary from the model, printing tokens in real time as they arrive.
Args:
llm: The loaded language model
transcript: The full transcript to summarize
output_language: Target language for summary ("en" or "zh-TW")
"""
cc = OpenCC('s2twp') # Simplified Chinese to Traditional Chinese (Taiwan standard with phrase conversion)
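    # e.g. cc.convert("软件") -> "軟體": the "p" in s2twp applies Taiwan-standard
    # vocabulary on top of the character conversion (plain s2t would give "軟件").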
# Use the model's chat format based on its template and language
    if output_language == "zh-TW":
        # "You are a helpful assistant that summarizes transcripts." /
        # "Please summarize the following content:"
        system_msg = "你是一個樂於助人的助手，負責總結轉錄內容。"
        user_msg = f"請總結以下內容：\n\n{transcript}"
else:
system_msg = "You are a helpful assistant that summarizes transcripts."
user_msg = f"Please summarize the following content:\n\n{transcript}"
messages = [
{"role": "system", "content": system_msg},
{"role": "user", "content": user_msg}
]
# Generate the summary using streaming completion
lang_display = "zh-TW" if output_language == "zh-TW" else "English"
print(f"\nStreaming {lang_display} summary:")
print("="*50)
full_response = ""
stream = llm.create_chat_completion(
messages=messages,
max_tokens=1024,
temperature=0.6,
min_p=0.0,
top_p=0.95,
top_k=20,
        stop=["<|end_of_text|>", "<|eot_id|>", "<|eom_id|>"],  # Llama-3-style terminators; other chat templates stop on their own EOS token
stream=True
)
for chunk in stream:
if 'choices' in chunk and len(chunk['choices']) > 0:
delta = chunk['choices'][0].get('delta', {})
content = delta.get('content', '')
if content:
if output_language == "zh-TW":
converted_content = cc.convert(content)
print(converted_content, end='', flush=True)
full_response += converted_content
else:
print(content, end='', flush=True)
full_response += content
print("\n" + "="*50)
    # Reset the model so the next call starts from a clean state
llm.reset()
return full_response.strip()
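
# Typical call (llm and transcript as produced in main() below):
#   summary = stream_summarize_transcript(llm, transcript, output_language="zh-TW")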
def main():
    parser = argparse.ArgumentParser(description="Summarize a transcript with a GGUF model, in English or zh-TW.")
parser.add_argument("-i", "--input", type=str,
default="./transcripts/short.txt",
help="Path to the input transcript file (default: ./transcripts/short.txt)")
parser.add_argument("-m", "--model", type=str,
default="unsloth/Qwen3-0.6B-GGUF:Q4_0",
help="HuggingFace model in format repo_id:quant (e.g., unsloth/Qwen3-1.7B-GGUF:Q2_K_L)")
parser.add_argument("-c", "--cpu", action="store_true", help="Force CPU only inference")
parser.add_argument("-l", "--language", type=str, choices=["en", "zh-TW"], default="en",
help="Output language (default: en)")
args = parser.parse_args()
    # Split the model argument into repo_id and quantization suffix
if ":" in args.model:
repo_id, quant = args.model.rsplit(":", 1)
filename = f"*{quant}.gguf"
else:
print(f"Error: Invalid model format '{args.model}'. Expected format: repo_id:quant")
return
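    # e.g. "unsloth/Qwen3-0.6B-GGUF:Q4_0" -> repo_id "unsloth/Qwen3-0.6B-GGUF" and
    # filename glob "*Q4_0.gguf", which Llama.from_pretrained matches against the
    # repo's file list.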
print(f"Loading model: {repo_id} ({filename}) with {'CPU only' if args.cpu else 'SYCL acceleration'}...")
# Load the model
llm = load_model(repo_id, filename, cpu_only=args.cpu)
# Read the transcript
transcript_path = args.input
if not os.path.exists(transcript_path):
print(f"Error: Input file '{transcript_path}' not found.")
return
transcript = read_transcript(transcript_path)
print("\nOriginal Transcript (Preview):")
    print((transcript[:500] + "...") if len(transcript) > 500 else transcript)
# Summarize with streaming
summary = stream_summarize_transcript(llm, transcript, output_language=args.language)
# Save summaries to files
# Parse thinking blocks and separate content
thinking_content, summary_content = parse_thinking_blocks(summary)
# Write thinking content if present
if thinking_content:
with open("thinking.txt", 'w', encoding='utf-8') as f:
f.write(thinking_content)
print(f"\n[Thinking content saved to thinking.txt ({len(thinking_content)} chars)]")
# Write summary content
with open("summary.txt", 'w', encoding='utf-8') as f:
f.write(summary_content)
print(f"[Summary saved to summary.txt ({len(summary_content)} chars)]")
# Clean up
del llm
if __name__ == "__main__":
main()
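
# Example invocations (paths and models are the script's own defaults):
#   python summarize_transcript.py -i ./transcripts/short.txt -l zh-TW
#   python summarize_transcript.py -m unsloth/Qwen3-1.7B-GGUF:Q2_K_L --cpu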