#!/usr/bin/env python3
"""
Script to summarize transcript using ERNIE-4.5-21B-A3B-PT-GGUF model with SYCL acceleration.
"""
import os
import argparse
import re
from typing import Tuple

from llama_cpp import Llama
from opencc import OpenCC


def load_model(repo_id, filename, cpu_only=False):
"""Load the model from Hugging Face Hub."""
    # Initialize the model with SYCL support (or CPU only if requested)
    llm = Llama.from_pretrained(
        repo_id=repo_id,
        filename=filename,
        n_gpu_layers=0 if cpu_only else -1,  # 0 for CPU, -1 for all layers on GPU
        seed=1337,
        n_ctx=32768,  # Context window size in tokens
        verbose=True,  # Full llama.cpp logging; set to False for cleaner output
        type_k=2,  # Quantize KV-cache keys to Q4_0 (GGML type 2) to save memory
        type_v=2,  # Quantize KV-cache values likewise (may require flash_attn=True)
    )
return llm
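

# Minimal usage sketch (illustrative; assumes this repo/quant exist on the Hub
# and that `filename` is matched as a glob against the repo's files):
#
#   llm = load_model("unsloth/Qwen3-0.6B-GGUF", "*Q4_0.gguf", cpu_only=True)

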
def read_transcript(file_path):
"""Read the transcript file."""
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
    return content


def parse_thinking_blocks(content: str) -> Tuple[str, str]:
"""
Parse thinking blocks from model output.
Supports both <think> and <thinking> tags.
Args:
content: Full model response containing thinking blocks and summary
Returns:
Tuple of (thinking_content, summary_content)
- thinking_content: All text between <think>/<thinking> tags (or empty string)
- summary_content: All text outside thinking blocks (or full content if no tags)
"""
# Match both <think> and <thinking> tags
pattern = r'<think(?:ing)?>(.*?)</think(?:ing)?>'
matches = re.findall(pattern, content, re.DOTALL)
if not matches:
# No thinking blocks found - return entire content as summary
return ("", content)
# Extract all thinking blocks
thinking = '\n\n'.join(match.strip() for match in matches)
# Remove thinking blocks from content to get summary
summary = re.sub(pattern, '', content, flags=re.DOTALL).strip()
return (thinking, summary)
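

# Illustrative example of the parser above: given the model output
#   "<think>step-by-step reasoning</think>\nFinal summary."
# parse_thinking_blocks returns
#   ("step-by-step reasoning", "Final summary.")

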
def stream_summarize_transcript(llm, transcript, output_language="en"):
"""
Perform live streaming summary by getting real-time token output from the model.
Args:
llm: The loaded language model
transcript: The full transcript to summarize
output_language: Target language for summary ("en" or "zh-TW")
"""
cc = OpenCC('s2twp') # Simplified Chinese to Traditional Chinese (Taiwan standard with phrase conversion)
# Use the model's chat format based on its template and language
    if output_language == "zh-TW":
        system_msg = "你是一個有幫助的助手,負責總結轉錄內容。"
        user_msg = f"請總結以下內容:\n\n{transcript}"
    else:
        system_msg = "You are a helpful assistant that summarizes transcripts."
        user_msg = f"Please summarize the following content:\n\n{transcript}"
messages = [
{"role": "system", "content": system_msg},
{"role": "user", "content": user_msg}
]
# Generate the summary using streaming completion
    lang_display = "Traditional Chinese (zh-TW)" if output_language == "zh-TW" else "English"
print(f"\nStreaming {lang_display} summary:")
print("="*50)
full_response = ""
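    # The sampling values below (temperature 0.6, top_p 0.95, top_k 20, min_p 0)
    # follow Qwen3's recommended thinking-mode settings; adjust for other models.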
stream = llm.create_chat_completion(
messages=messages,
max_tokens=1024,
temperature=0.6,
min_p=0.0,
top_p=0.95,
top_k=20,
stop=["<|end_of_text|>", "<|eot_id|>", "<|eom_id|>"],
stream=True
)
for chunk in stream:
if 'choices' in chunk and len(chunk['choices']) > 0:
delta = chunk['choices'][0].get('delta', {})
content = delta.get('content', '')
if content:
if output_language == "zh-TW":
converted_content = cc.convert(content)
print(converted_content, end='', flush=True)
full_response += converted_content
else:
print(content, end='', flush=True)
full_response += content
print("\n" + "="*50)
# Reset the model state to ensure clean state for next call
llm.reset()
    return full_response.strip()


def main():
    parser = argparse.ArgumentParser(description="Summarize a transcript using a GGUF model (English or zh-TW output).")
parser.add_argument("-i", "--input", type=str,
default="./transcripts/short.txt",
help="Path to the input transcript file (default: ./transcripts/short.txt)")
parser.add_argument("-m", "--model", type=str,
default="unsloth/Qwen3-0.6B-GGUF:Q4_0",
help="HuggingFace model in format repo_id:quant (e.g., unsloth/Qwen3-1.7B-GGUF:Q2_K_L)")
parser.add_argument("-c", "--cpu", action="store_true", help="Force CPU only inference")
parser.add_argument("-l", "--language", type=str, choices=["en", "zh-TW"], default="en",
help="Output language (default: en)")
args = parser.parse_args()
# Parse model argument if provided
if ":" in args.model:
repo_id, quant = args.model.rsplit(":", 1)
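        # Llama.from_pretrained treats `filename` as a glob over the repo's
        # files, so "*{quant}.gguf" selects the matching quantization.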
filename = f"*{quant}.gguf"
else:
print(f"Error: Invalid model format '{args.model}'. Expected format: repo_id:quant")
return
print(f"Loading model: {repo_id} ({filename}) with {'CPU only' if args.cpu else 'SYCL acceleration'}...")
# Load the model
llm = load_model(repo_id, filename, cpu_only=args.cpu)
# Read the transcript
transcript_path = args.input
if not os.path.exists(transcript_path):
print(f"Error: Input file '{transcript_path}' not found.")
return
transcript = read_transcript(transcript_path)
print("\nOriginal Transcript (Preview):")
    print((transcript[:500] + "...") if len(transcript) > 500 else transcript)
# Summarize with streaming
summary = stream_summarize_transcript(llm, transcript, output_language=args.language)
# Save summaries to files
# Parse thinking blocks and separate content
thinking_content, summary_content = parse_thinking_blocks(summary)
# Write thinking content if present
if thinking_content:
with open("thinking.txt", 'w', encoding='utf-8') as f:
f.write(thinking_content)
print(f"\n[Thinking content saved to thinking.txt ({len(thinking_content)} chars)]")
# Write summary content
with open("summary.txt", 'w', encoding='utf-8') as f:
f.write(summary_content)
print(f"[Summary saved to summary.txt ({len(summary_content)} chars)]")
# Clean up
    del llm


if __name__ == "__main__":
main()
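
# Example invocations (script name and paths are illustrative):
#   python summarize.py -i ./transcripts/short.txt
#   python summarize.py -m unsloth/Qwen3-1.7B-GGUF:Q2_K_L -l zh-TW -c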