AksharPatel committed
Commit 5781f0e · 1 Parent(s): 70b1bfb
Files changed (2):
  1. app.py +236 -122
  2. requirements.txt +5 -2
app.py CHANGED
@@ -2,116 +2,153 @@ import gradio as gr
 import torch
 import torchaudio
 import os
+import sys
+import subprocess
 from pathlib import Path
 from huggingface_hub import snapshot_download
 import tempfile
 import warnings
 warnings.filterwarnings('ignore')

-# Try to import GLM-TTS components - if they fail, we'll handle gracefully
+# Setup GLM-TTS environment
+def setup_glm_tts():
+    """Download and setup GLM-TTS repository"""
+    glm_tts_dir = Path("./GLM-TTS")
+
+    if not glm_tts_dir.exists():
+        print("📥 Cloning GLM-TTS repository...")
+        subprocess.run(
+            ["git", "clone", "https://github.com/zai-org/GLM-TTS.git"],
+            check=True
+        )
+        print("✅ GLM-TTS repository cloned")
+
+    # Add to Python path
+    if str(glm_tts_dir) not in sys.path:
+        sys.path.insert(0, str(glm_tts_dir))
+
+    return glm_tts_dir
+
+# Setup on import
+print("🔧 Setting up GLM-TTS environment...")
+GLM_TTS_DIR = setup_glm_tts()
+
+# Now import GLM-TTS components
 try:
-    # These imports assume the GLM-TTS code structure
     from cosyvoice.cli.frontend import CosyVoiceFrontEnd
     from cosyvoice.utils.file_utils import load_wav
-    import sys
-
-    # Add current directory to path for imports
-    sys.path.insert(0, str(Path(__file__).parent))
-
-    IMPORTS_AVAILABLE = True
-except ImportError as e:
-    print(f"Warning: Could not import all GLM-TTS components: {e}")
-    IMPORTS_AVAILABLE = False
+    from llm.glmtts import GLMTTSModel
+    from flow.flow import FlowMatchingModel
+    from utils.hift_util import load_hift
+    from utils.vocos_util import load_vocos
+    IMPORTS_OK = True
+    print("✅ GLM-TTS components imported successfully")
+except Exception as e:
+    print(f"❌ Failed to import GLM-TTS components: {e}")
+    IMPORTS_OK = False

-class GLMTTSWrapper:
-    """Simplified wrapper for GLM-TTS inference"""
+class GLMTTSInference:
+    """GLM-TTS Inference Wrapper"""

     def __init__(self):
+        if not IMPORTS_OK:
+            raise RuntimeError("GLM-TTS components not available")
+
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         print(f"🎯 Using device: {self.device}")

+        if not torch.cuda.is_available():
+            print("⚠️ WARNING: Running on CPU. Inference will be very slow!")
+
         # Model directory
         self.model_dir = Path("./ckpt")

         # Download models if not present
         if not self.model_dir.exists():
-            print("📥 Downloading GLM-TTS models from HuggingFace (this may take a few minutes)...")
-            try:
-                snapshot_download(
-                    repo_id="zai-org/GLM-TTS",
-                    local_dir=str(self.model_dir),
-                    local_dir_use_symlinks=False,
-                    resume_download=True
-                )
-                print("✅ Models downloaded successfully!")
-            except Exception as e:
-                print(f"❌ Error downloading models: {e}")
-                raise
+            print("📥 Downloading GLM-TTS models from HuggingFace...")
+            snapshot_download(
+                repo_id="zai-org/GLM-TTS",
+                local_dir=str(self.model_dir),
+                local_dir_use_symlinks=False,
+                resume_download=True
+            )
+            print("✅ Models downloaded successfully!")

-        # Initialize models
+        # Load models
         self.load_models()

     def load_models(self):
-        """Load the GLM-TTS models"""
-        print("🔄 Loading GLM-TTS models...")
-
+        """Load all GLM-TTS models"""
         try:
-            # Import here to avoid issues if not available
-            from transformers import AutoTokenizer, AutoModelForCausalLM
-
-            # Load LLM tokenizer and model
-            llm_path = self.model_dir / "llm"
-            print(f"Loading LLM from {llm_path}")
+            print("🔄 Loading GLM-TTS models...")

-            self.tokenizer = AutoTokenizer.from_pretrained(
-                str(llm_path),
-                trust_remote_code=True
+            # Load frontend
+            print("Loading frontend...")
+            frontend_dir = self.model_dir / "frontend"
+            self.frontend = CosyVoiceFrontEnd(
+                speech_tokenizer_model_dir=str(self.model_dir / "speech_tokenizer"),
+                campplus_model_dir=str(frontend_dir / "campplus.onnx"),
+                speech_tokenizer_config_path=str(self.model_dir / "speech_tokenizer" / "config.json"),
             )

-            self.llm_model = AutoModelForCausalLM.from_pretrained(
-                str(llm_path),
-                trust_remote_code=True,
+            # Load LLM
+            print("Loading LLM model...")
+            llm_dir = self.model_dir / "llm"
+            self.llm_model = GLMTTSModel.from_pretrained(
+                str(llm_dir),
                 torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                device_map="auto" if torch.cuda.is_available() else None
             )
+            self.llm_model = self.llm_model.to(self.device)
+            self.llm_model.eval()

-            if not torch.cuda.is_available():
-                self.llm_model = self.llm_model.to(self.device)
+            # Load Flow model
+            print("Loading Flow model...")
+            flow_path = self.model_dir / "flow" / "flow.pt"
+            self.flow_model = torch.jit.load(str(flow_path), map_location=self.device)
+            self.flow_model.eval()

-            self.llm_model.eval()
+            # Load vocoder
+            print("Loading vocoder...")
+            hift_path = self.model_dir / "hift" / "hift.pt"
+            if hift_path.exists():
+                self.vocoder = load_hift(str(hift_path), self.device)
+            else:
+                vocos_path = self.model_dir / "vocos2d" / "generator_jit.ckpt"
+                self.vocoder = load_vocos(str(vocos_path), self.device)

-            print("✅ Models loaded successfully!")
+            print("✅ All models loaded successfully!")
             self.models_loaded = True

         except Exception as e:
             print(f"❌ Error loading models: {e}")
-            print("Note: GLM-TTS requires ~8GB VRAM and may not work on CPU-only spaces")
+            import traceback
+            traceback.print_exc()
             self.models_loaded = False
             raise

     def process_reference_audio(self, audio_path):
         """Process reference audio for voice cloning"""
         try:
-            audio, sr = torchaudio.load(audio_path)
+            # Use frontend to process audio
+            prompt_speech_16k = load_wav(audio_path, 16000)

-            # Resample to 22050 Hz if needed
-            if sr != 22050:
-                resampler = torchaudio.transforms.Resample(sr, 22050)
-                audio = resampler(audio)
+            # Extract features
+            tts_speech_token = self.frontend.extract_speech_token(prompt_speech_16k)
+            embedding = self.frontend.extract_spk_embedding(prompt_speech_16k)

-            # Convert to mono
-            if audio.shape[0] > 1:
-                audio = torch.mean(audio, dim=0, keepdim=True)
-
-            return audio, 22050
+            return {
+                'speech_token': tts_speech_token,
+                'embedding': embedding
+            }
         except Exception as e:
             print(f"Error processing reference audio: {e}")
-            return None, None
+            return None

     def synthesize(
         self,
         text: str,
         ref_audio_path: str = None,
+        ref_text: str = "",
         speed: float = 1.0
     ):
         """
@@ -119,58 +156,118 @@ class GLMTTSWrapper:

         Args:
             text: Text to synthesize
-            ref_audio_path: Optional reference audio for voice cloning
+            ref_audio_path: Reference audio for voice cloning (optional)
+            ref_text: Transcript of reference audio (optional)
             speed: Speech speed multiplier

         Returns:
             tuple: (audio_file_path, status_message)
         """
         if not self.models_loaded:
-            return None, "❌ Models not loaded. This space requires GPU resources."
+            return None, "❌ Models not loaded properly"

         try:
-            print(f"🎙️ Synthesizing: {text[:50]}...")
+            print(f"🎙️ Synthesizing: '{text[:100]}...'")

             # Process reference audio if provided
-            if ref_audio_path:
-                ref_audio, ref_sr = self.process_reference_audio(ref_audio_path)
-                if ref_audio is None:
+            prompt_data = None
+            if ref_audio_path and os.path.exists(ref_audio_path):
+                print("Processing reference audio...")
+                prompt_data = self.process_reference_audio(ref_audio_path)
+                if prompt_data is None:
                     return None, "❌ Failed to process reference audio"
-                print("✓ Reference audio processed")

-            # Generate speech
-            # Note: This is a simplified version. Full implementation would use
-            # the complete GLM-TTS pipeline with LLM -> Flow -> Vocoder
+            # Prepare input
+            print("Preparing text input...")
+            text_input = self.frontend.text_normalize(text, split=True)
+
+            # Generate with LLM
+            print("Generating speech tokens...")
+            with torch.no_grad():
+                # Create input for LLM
+                if prompt_data:
+                    # Zero-shot with reference
+                    model_input = self.frontend.frontend_zero_shot(
+                        text_input,
+                        prompt_data['speech_token'],
+                        prompt_data['embedding']
+                    )
+                else:
+                    # Basic TTS without reference
+                    model_input = self.frontend.frontend_sft(text_input)
+
+                # Move to device
+                for key in model_input:
+                    if isinstance(model_input[key], torch.Tensor):
+                        model_input[key] = model_input[key].to(self.device)
+
+                # Generate speech tokens
+                speech_token = self.llm_model.generate(
+                    **model_input,
+                    max_new_tokens=2000,
+                    do_sample=True,
+                    temperature=0.8,
+                    top_k=20,
+                    top_p=0.95,
+                )
+
+                # Convert tokens to mel-spectrogram using Flow
+                print("Converting to mel-spectrogram...")
+                mel = self.flow_model(speech_token)
+
+                # Convert mel to audio using vocoder
+                print("Generating audio waveform...")
+                audio = self.vocoder(mel)
+
+                # Convert to numpy and save
+                audio_np = audio.squeeze().cpu().numpy()
+
+                # Apply speed adjustment if needed (librosa's rate > 1 speeds up)
+                if speed != 1.0:
+                    import librosa
+                    audio_np = librosa.effects.time_stretch(audio_np, rate=speed)

-            # For now, return a placeholder message
-            return None, "⚠️ Full inference pipeline requires GPU resources and complete GLM-TTS setup. Please run locally for best results."
+            # Save to temporary file
+            output_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
+            torchaudio.save(
+                output_path,
+                torch.from_numpy(audio_np).unsqueeze(0),
+                22050
+            )
+
+            print(f"✅ Audio saved to {output_path}")
+            return output_path, "✅ Success! Audio generated successfully."

         except Exception as e:
-            return None, f"❌ Error during synthesis: {str(e)}"
+            import traceback
+            error_msg = f"❌ Error during synthesis: {str(e)}\n{traceback.format_exc()}"
+            print(error_msg)
+            return None, error_msg

-# Initialize model (will be set to None if loading fails)
+# Initialize model
+print("🚀 Initializing GLM-TTS...")
 tts_model = None
-model_status = "Loading..."
+model_status = "⏳ Loading..."

 try:
-    print("🚀 Initializing GLM-TTS...")
-    tts_model = GLMTTSWrapper()
-    model_status = "✅ Ready"
+    tts_model = GLMTTSInference()
+    model_status = "✅ Ready! (Note: CPU inference is slow)"
 except Exception as e:
-    print(f"Failed to initialize GLM-TTS: {e}")
-    model_status = f"❌ Failed: {str(e)}"
-    tts_model = None
+    import traceback
+    model_status = f"❌ Failed to load: {str(e)}"
+    print(f"Failed to initialize: {e}")
+    traceback.print_exc()

 def generate_speech(text, ref_audio, speed):
     """Gradio interface function"""

-    if tts_model is None:
-        return None, f"❌ Model not available: {model_status}\n\n💡 Tip: GLM-TTS requires GPU resources (8GB+ VRAM) to run."
+    if tts_model is None or not tts_model.models_loaded:
+        return None, f"❌ Model not available.\n\n{model_status}\n\n💡 This may require GPU resources or additional setup."

     if not text or len(text.strip()) == 0:
         return None, "⚠️ Please enter text to synthesize"

-    # Call synthesis
+    # Synthesize
     audio_path, message = tts_model.synthesize(
         text=text,
         ref_audio_path=ref_audio,
@@ -182,45 +279,47 @@ def generate_speech(text, ref_audio, speed):

 # Create Gradio Interface
 with gr.Blocks(
     title="GLM-TTS Voice Cloning",
-    theme=gr.themes.Soft()
+    theme=gr.themes.Soft(),
+    css="""
+    .gradio-container {max-width: 1200px !important}
+    .status-box {font-family: monospace; font-size: 12px;}
+    """
 ) as demo:

     gr.Markdown("""
-    # 🎙️ GLM-TTS: Zero-Shot Voice Cloning & TTS
+    # 🎙️ GLM-TTS: Zero-Shot Voice Cloning & Text-to-Speech

     **State-of-the-art voice cloning** with just 3-10 seconds of audio!

     ### ⚡ Features:
-    - 🎯 **Zero-shot cloning** - No training required
+    - 🎯 **Zero-shot cloning** - Clone any voice without training
     - 🌏 **Bilingual** - Chinese & English support
-    - 🎭 **Emotion control** - Natural & expressive
-    - ⚡ **High quality** - CER: 0.89 (best among open-source)
-
-    ### 📝 How to Use:
-    1. **Basic TTS**: Enter text → Click Generate
-    2. **Voice Cloning**: Upload 3-10s audio sample → Enter text → Generate
+    - 🎭 **Emotion control** - Natural & expressive speech
+    - ⚡ **High quality** - Best-in-class among open-source models
     """)

     gr.Markdown(f"""
-    ### 🔧 Model Status: {model_status}
+    <div style="padding: 10px; background-color: #f0f0f0; border-radius: 5px; margin: 10px 0;">
+    <strong>🔧 Model Status:</strong> {model_status}
+    </div>
     """)

     with gr.Row():
         with gr.Column(scale=1):
             text_input = gr.Textbox(
                 label="📝 Text to Synthesize",
-                placeholder="Enter text here...\n\nExample: Hello! This is a demonstration of GLM-TTS voice cloning technology.",
+                placeholder="Enter text here (Chinese or English)...\n\nExample: Hello! This is a demonstration of GLM-TTS voice cloning.",
                 lines=6,
                 value="Hello! This is GLM-TTS, a powerful text-to-speech system with zero-shot voice cloning capabilities."
             )

-            with gr.Accordion("🎵 Voice Cloning (Optional)", open=False):
+            with gr.Accordion("🎵 Voice Cloning (Optional)", open=True):
                 ref_audio_input = gr.Audio(
-                    label="Reference Audio (3-10 seconds)",
+                    label="Reference Audio (3-10 seconds recommended)",
                     type="filepath",
                     sources=["upload", "microphone"]
                 )
-                gr.Markdown("*Upload audio of the voice you want to clone*")
+                gr.Markdown("*Upload audio of the voice you want to clone. Leave empty for default voice.*")

             with gr.Accordion("⚙️ Advanced Settings", open=False):
                 speed_slider = gr.Slider(
@@ -228,10 +327,18 @@ with gr.Blocks(
                     minimum=0.5,
                     maximum=2.0,
                     value=1.0,
-                    step=0.1
+                    step=0.1,
+                    info="Adjust speaking speed (1.0 = normal)"
                 )

             generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
+
+            gr.Markdown("""
+            ### ⚠️ Note:
+            - **CPU inference is very slow** (~5-10 minutes per generation)
+            - For faster results, use GPU-enabled spaces
+            - First generation may take longer as models initialize
+            """)

         with gr.Column(scale=1):
             audio_output = gr.Audio(
@@ -239,19 +346,21 @@
                 type="filepath"
             )
             status_output = gr.Textbox(
-                label="📊 Status",
-                lines=4,
-                interactive=False
+                label="📊 Status / Logs",
+                lines=8,
+                interactive=False,
+                elem_classes=["status-box"]
             )

     # Examples
     gr.Markdown("### 📚 Example Texts")
     gr.Examples(
         examples=[
-            ["Hello! Welcome to GLM-TTS, the state-of-the-art voice cloning system.", None, 1.0],
-            ["欢迎使用GLM-TTS语音合成系统，这是一个先进的零样本语音克隆技术。", None, 1.0],
-            ["Artificial intelligence is transforming how we interact with technology.", None, 1.0],
-            ["人工智能正在改变我们与科技互动的方式。", None, 1.0],
+            ["Hello! Welcome to GLM-TTS voice cloning system.", None, 1.0],
+            ["欢迎使用GLM-TTS语音合成系统!", None, 1.0],
+            ["Artificial intelligence is transforming our world.", None, 1.0],
+            ["人工智能正在改变世界，语音合成技术也在不断进步。", None, 1.0],
+            ["This is a test of zero-shot voice cloning technology.", None, 1.0],
         ],
         inputs=[text_input, ref_audio_input, speed_slider],
         outputs=[audio_output, status_output],
@@ -261,27 +370,30 @@

     gr.Markdown("""
     ---
-    ### 💡 Tips:
-    - **Best quality**: Use clear audio with minimal noise
-    - **Optimal length**: 3-10 seconds of reference audio
-    - **Languages**: Full Chinese support, good English support
-    - **Mixed text**: Chinese-English mixed text supported
-
-    ### ⚠️ Requirements:
-    - **GPU**: ~8GB VRAM recommended for inference
-    - **CPU**: Possible but very slow
+    ### 💡 Tips for Best Results:
+    - **Clear audio**: Use high-quality audio with minimal background noise
+    - **Optimal length**: 3-10 seconds of reference audio works best
+    - **Languages**: Excellent Chinese support, good English support
+    - **Mixed text**: Supports Chinese-English mixed sentences
+    - **Speed control**: Adjust from 0.5x (slow) to 2.0x (fast)

     ### 🔗 Resources:
-    - [GitHub](https://github.com/zai-org/GLM-TTS) | [Model Card](https://huggingface.co/zai-org/GLM-TTS)
-    - [Paper](https://github.com/zai-org/GLM-TTS#citation) | [Demo Site](https://audio.z.ai)
+    - [GitHub Repository](https://github.com/zai-org/GLM-TTS)
+    - [Model Card on HuggingFace](https://huggingface.co/zai-org/GLM-TTS)
+    - [Official Demo](https://audio.z.ai)
+
+    ### 📊 Performance:
+    - **Character Error Rate**: 0.89 (best among open-source)
+    - **Speaker Similarity**: 76.4
+    - **Architecture**: LLM + Flow Matching + Vocoder
+    - **Model Size**: ~8.9 GB

     ### 📄 Citation:
     ```bibtex
     @misc{glmtts2025,
-      title={GLM-TTS: Controllable & Emotion-Expressive Zero-shot TTS with Multi-Reward Reinforcement Learning},
-      author={CogAudio Group Members},
-      year={2025},
-      publisher={Zhipu AI Inc}
+      title={GLM-TTS: Controllable & Emotion-Expressive Zero-shot TTS},
+      author={Zhipu AI CogAudio Group},
+      year={2025}
     }
     ```
     """)
@@ -290,14 +402,16 @@ with gr.Blocks(
     generate_btn.click(
         fn=generate_speech,
         inputs=[text_input, ref_audio_input, speed_slider],
-        outputs=[audio_output, status_output]
+        outputs=[audio_output, status_output],
+        api_name="generate"
     )

 # Launch
 if __name__ == "__main__":
-    demo.queue(max_size=10)
+    demo.queue(max_size=20)
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
-        share=False
+        share=False,
+        show_error=True
     )
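The click handler is now exposed with `api_name="generate"`, so the Space can also be driven programmatically. Below is a minimal client sketch, not part of this commit: it assumes a locally running instance at `http://localhost:7860`, a placeholder `reference.wav`, and a recent `gradio_client` release (where file inputs are wrapped with `handle_file`).

```python
# Hypothetical smoke test for the endpoint exposed via api_name="generate".
# The URL and reference audio path below are placeholders.
from gradio_client import Client, handle_file

client = Client("http://localhost:7860")

# Positional arguments follow the click() inputs:
# [text_input, ref_audio_input, speed_slider].
audio_path, status = client.predict(
    "Hello! This is GLM-TTS.",     # text to synthesize
    handle_file("reference.wav"),  # reference audio; pass None to skip cloning
    1.0,                           # speed multiplier
    api_name="/generate",
)
print(status)
print("Generated audio at:", audio_path)
```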
requirements.txt CHANGED
@@ -16,7 +16,6 @@ omegaconf>=2.3.0
 WeTextProcessing
 soxr
 matplotlib>=3.7.0
-encodec
 tensorboard
 tensorboardX
 kaldiio
@@ -27,4 +26,8 @@ inflect
 eng_to_ipa
 unidecode
 g2p_en
-regex
+regex
+safetensors
+accelerate
+sentencepiece
+protobuf
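The four added packages are the ones typically pulled in when loading transformer-style checkpoints (`safetensors` for weight files, `accelerate` for device placement, `sentencepiece` and `protobuf` for tokenizers); that rationale is an inference from the app.py changes, not stated in the commit. A quick sanity check that the updated environment resolves:

```python
# Verify that the dependencies added in this commit import cleanly.
# Note: the protobuf package installs as the `google.protobuf` module.
import importlib

for module in ["regex", "safetensors", "accelerate", "sentencepiece", "google.protobuf"]:
    importlib.import_module(module)
    print(f"OK: {module}")
```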