Spaces:

dromus
/

glm-tts

Running

App Files Files Community

AksharPatel committed 10 days ago

Commit

5781f0e

1 Parent(s): 70b1bfb

init3

Browse files

Files changed (2) hide show

app.py +236 -122
requirements.txt +5 -2

app.py CHANGED Viewed

@@ -2,116 +2,153 @@ import gradio as gr
 import torch
 import torchaudio
 import os
 from pathlib import Path
 from huggingface_hub import snapshot_download
 import tempfile
 import warnings
 warnings.filterwarnings('ignore')
-# Try to import GLM-TTS components - if they fail, we'll handle gracefully
 try:
-    # These imports assume the GLM-TTS code structure
     from cosyvoice.cli.frontend import CosyVoiceFrontEnd
     from cosyvoice.utils.file_utils import load_wav
-    import sys
-    # Add current directory to path for imports
-    sys.path.insert(0, str(Path(__file__).parent))
-    IMPORTS_AVAILABLE = True
-except ImportError as e:
-    print(f"Warning: Could not import all GLM-TTS components: {e}")
-    IMPORTS_AVAILABLE = False
-class GLMTTSWrapper:
-    """Simplified wrapper for GLM-TTS inference"""
     def __init__(self):
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         print(f"🎯 Using device: {self.device}")
         # Model directory
         self.model_dir = Path("./ckpt")
         # Download models if not present
         if not self.model_dir.exists():
-            print("📥 Downloading GLM-TTS models from HuggingFace (this may take a few minutes)...")
-            try:
-                snapshot_download(
-                    repo_id="zai-org/GLM-TTS",
-                    local_dir=str(self.model_dir),
-                    local_dir_use_symlinks=False,
-                    resume_download=True
-                )
-                print("✅ Models downloaded successfully!")
-            except Exception as e:
-                print(f"❌ Error downloading models: {e}")
-                raise
-        # Initialize models
         self.load_models()
     def load_models(self):
-        """Load the GLM-TTS models"""
-        print("🔄 Loading GLM-TTS models...")
         try:
-            # Import here to avoid issues if not available
-            from transformers import AutoTokenizer, AutoModelForCausalLM
-            # Load LLM tokenizer and model
-            llm_path = self.model_dir / "llm"
-            print(f"Loading LLM from {llm_path}")
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                str(llm_path),
-                trust_remote_code=True
             )
-            self.llm_model = AutoModelForCausalLM.from_pretrained(
-                str(llm_path),
-                trust_remote_code=True,
                 torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                device_map="auto" if torch.cuda.is_available() else None
             )
-            if not torch.cuda.is_available():
-                self.llm_model = self.llm_model.to(self.device)
-            self.llm_model.eval()
-            print("✅ Models loaded successfully!")
             self.models_loaded = True
         except Exception as e:
             print(f"❌ Error loading models: {e}")
-            print("Note: GLM-TTS requires ~8GB VRAM and may not work on CPU-only spaces")
             self.models_loaded = False
             raise
     def process_reference_audio(self, audio_path):
         """Process reference audio for voice cloning"""
         try:
-            audio, sr = torchaudio.load(audio_path)
-            # Resample to 22050 Hz if needed
-            if sr != 22050:
-                resampler = torchaudio.transforms.Resample(sr, 22050)
-                audio = resampler(audio)
-            # Convert to mono
-            if audio.shape[0] > 1:
-                audio = torch.mean(audio, dim=0, keepdim=True)
-            return audio, 22050
         except Exception as e:
             print(f"Error processing reference audio: {e}")
-            return None, None
     def synthesize(
         self,
         text: str,
         ref_audio_path: str = None,
         speed: float = 1.0
     ):
         """
@@ -119,58 +156,118 @@ class GLMTTSWrapper:
         Args:
             text: Text to synthesize
-            ref_audio_path: Optional reference audio for voice cloning
             speed: Speech speed multiplier
         Returns:
             tuple: (audio_file_path, status_message)
         """
         if not self.models_loaded:
-            return None, "❌ Models not loaded. This space requires GPU resources."
         try:
-            print(f"🎙️ Synthesizing: {text[:50]}...")
             # Process reference audio if provided
-            if ref_audio_path:
-                ref_audio, ref_sr = self.process_reference_audio(ref_audio_path)
-                if ref_audio is None:
                     return None, "❌ Failed to process reference audio"
-                print("✓ Reference audio processed")
-            # Generate speech
-            # Note: This is a simplified version. Full implementation would use
-            # the complete GLM-TTS pipeline with LLM -> Flow -> Vocoder
-            # For now, return a placeholder message
-            return None, "⚠️ Full inference pipeline requires GPU resources and complete GLM-TTS setup. Please run locally for best results."
         except Exception as e:
-            return None, f"❌ Error during synthesis: {str(e)}"
-# Initialize model (will be set to None if loading fails)
 tts_model = None
-model_status = "Loading..."
 try:
-    print("🚀 Initializing GLM-TTS...")
-    tts_model = GLMTTSWrapper()
-    model_status = "✅ Ready"
 except Exception as e:
-    print(f"Failed to initialize GLM-TTS: {e}")
-    model_status = f"❌ Failed: {str(e)}"
-    tts_model = None
 def generate_speech(text, ref_audio, speed):
     """Gradio interface function"""
-    if tts_model is None:
-        return None, f"❌ Model not available: {model_status}\n\n💡 Tip: GLM-TTS requires GPU resources (8GB+ VRAM) to run."
     if not text or len(text.strip()) == 0:
         return None, "⚠️ Please enter text to synthesize"
-    # Call synthesis
     audio_path, message = tts_model.synthesize(
         text=text,
         ref_audio_path=ref_audio,
@@ -182,45 +279,47 @@ def generate_speech(text, ref_audio, speed):
 # Create Gradio Interface
 with gr.Blocks(
     title="GLM-TTS Voice Cloning",
-    theme=gr.themes.Soft()
 ) as demo:
     gr.Markdown("""
-    # 🎙️ GLM-TTS: Zero-Shot Voice Cloning & TTS
     **State-of-the-art voice cloning** with just 3-10 seconds of audio!
     ### ⚡ Features:
-    - 🎯 **Zero-shot cloning** - No training required
     - 🌏 **Bilingual** - Chinese & English support
-    - 🎭 **Emotion control** - Natural & expressive
-    - ⚡ **High quality** - CER: 0.89 (best among open-source)
-    ### 📝 How to Use:
-    1. **Basic TTS**: Enter text → Click Generate
-    2. **Voice Cloning**: Upload 3-10s audio sample → Enter text → Generate
     """)
     gr.Markdown(f"""
-    ### 🔧 Model Status: {model_status}
     """)
     with gr.Row():
         with gr.Column(scale=1):
             text_input = gr.Textbox(
                 label="📝 Text to Synthesize",
-                placeholder="Enter text here...\n\nExample: Hello! This is a demonstration of GLM-TTS voice cloning technology.",
                 lines=6,
                 value="Hello! This is GLM-TTS, a powerful text-to-speech system with zero-shot voice cloning capabilities."
             )
-            with gr.Accordion("🎵 Voice Cloning (Optional)", open=False):
                 ref_audio_input = gr.Audio(
-                    label="Reference Audio (3-10 seconds)",
                     type="filepath",
                     sources=["upload", "microphone"]
                 )
-                gr.Markdown("*Upload audio of the voice you want to clone*")
             with gr.Accordion("⚙️ Advanced Settings", open=False):
                 speed_slider = gr.Slider(
@@ -228,10 +327,18 @@ with gr.Blocks(
                     minimum=0.5,
                     maximum=2.0,
                     value=1.0,
-                    step=0.1
                 )
             generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
         with gr.Column(scale=1):
             audio_output = gr.Audio(
@@ -239,19 +346,21 @@ with gr.Blocks(
                 type="filepath"
             )
             status_output = gr.Textbox(
-                label="📊 Status",
-                lines=4,
-                interactive=False
             )
     # Examples
     gr.Markdown("### 📚 Example Texts")
     gr.Examples(
         examples=[
-            ["Hello! Welcome to GLM-TTS, the state-of-the-art voice cloning system.", None, 1.0],
-            ["欢迎使用GLM-TTS语音合成系统，这是一个先进的零样本语音克隆技术。", None, 1.0],
-            ["Artificial intelligence is transforming how we interact with technology.", None, 1.0],
-            ["人工智能正在改变我们与科技互动的方式。", None, 1.0],
         ],
         inputs=[text_input, ref_audio_input, speed_slider],
         outputs=[audio_output, status_output],
@@ -261,27 +370,30 @@ with gr.Blocks(
     gr.Markdown("""
     ---
-    ### 💡 Tips:
-    - **Best quality**: Use clear audio with minimal noise
-    - **Optimal length**: 3-10 seconds of reference audio
-    - **Languages**: Full Chinese support, good English support
-    - **Mixed text**: Chinese-English mixed text supported
-    ### ⚠️ Requirements:
-    - **GPU**: ~8GB VRAM recommended for inference
-    - **CPU**: Possible but very slow
     ### 🔗 Resources:
-    - [GitHub](https://github.com/zai-org/GLM-TTS) | [Model Card](https://huggingface.co/zai-org/GLM-TTS)
-    - [Paper](https://github.com/zai-org/GLM-TTS#citation) | [Demo Site](https://audio.z.ai)
     ### 📄 Citation:
     ```bibtex
     @misc{glmtts2025,
-      title={GLM-TTS: Controllable & Emotion-Expressive Zero-shot TTS with Multi-Reward Reinforcement Learning},
-      author={CogAudio Group Members},
-      year={2025},
-      publisher={Zhipu AI Inc}
     }
     ```
     """)
@@ -290,14 +402,16 @@ with gr.Blocks(
     generate_btn.click(
         fn=generate_speech,
         inputs=[text_input, ref_audio_input, speed_slider],
-        outputs=[audio_output, status_output]
     )
 # Launch
 if __name__ == "__main__":
-    demo.queue(max_size=10)
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
-        share=False
     )

 import torch
 import torchaudio
 import os
+import sys
+import subprocess
 from pathlib import Path
 from huggingface_hub import snapshot_download
 import tempfile
 import warnings
 warnings.filterwarnings('ignore')
+# Setup GLM-TTS environment
def setup_glm_tts():
    """Make the GLM-TTS source tree importable, cloning it on first use.

    The repository is cloned into ``./GLM-TTS`` (relative to the current
    working directory) when that directory does not exist yet, and its
    absolute path is placed at the front of ``sys.path``.

    Returns:
        Path: Absolute path of the GLM-TTS checkout.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` exits with a
            non-zero status.
    """
    # Resolve once so the sys.path entry keeps working even if the process
    # changes its working directory after startup.
    glm_tts_dir = Path("./GLM-TTS").resolve()
    if not glm_tts_dir.exists():
        print("📥 Cloning GLM-TTS repository...")
        # Pass the destination explicitly so the clone lands exactly where
        # the existence check above looks for it.
        subprocess.run(
            [
                "git",
                "clone",
                "https://github.com/zai-org/GLM-TTS.git",
                str(glm_tts_dir),
            ],
            check=True
        )
        print("✅ GLM-TTS repository cloned")
    # Add to Python path (only once, so repeated calls do not stack entries)
    checkout = str(glm_tts_dir)
    if checkout not in sys.path:
        sys.path.insert(0, checkout)
    return glm_tts_dir
+# Setup on import
+print("🔧 Setting up GLM-TTS environment...")
+GLM_TTS_DIR = setup_glm_tts()
+# Now import GLM-TTS components
 try:
     from cosyvoice.cli.frontend import CosyVoiceFrontEnd
     from cosyvoice.utils.file_utils import load_wav
+    from llm.glmtts import GLMTTSModel
+    from flow.flow import FlowMatchingModel
+    from utils.hift_util import load_hift
+    from utils.vocos_util import load_vocos
+    IMPORTS_OK = True
+    print("✅ GLM-TTS components imported successfully")
+except Exception as e:
+    print(f"❌ Failed to import GLM-TTS components: {e}")
+    IMPORTS_OK = False
+class GLMTTSInference:
+    """GLM-TTS Inference Wrapper"""
     def __init__(self):
         """Select a device, fetch the checkpoints if absent, and load the models.

         Raises:
             RuntimeError: If the GLM-TTS components could not be imported.
             Exception: Anything raised while downloading or loading the
                 models propagates to the caller.
         """
         if not IMPORTS_OK:
             raise RuntimeError("GLM-TTS components not available")

         has_cuda = torch.cuda.is_available()
         self.device = torch.device('cuda' if has_cuda else 'cpu')
         print(f"🎯 Using device: {self.device}")
         if not has_cuda:
             print("⚠️ WARNING: Running on CPU. Inference will be very slow!")

         # Checkpoints are kept in ./ckpt; the download is skipped whenever
         # that directory already exists.
         self.model_dir = Path("./ckpt")
         checkpoints_present = self.model_dir.exists()
         if not checkpoints_present:
             print("📥 Downloading GLM-TTS models from HuggingFace...")
             download_options = {
                 "repo_id": "zai-org/GLM-TTS",
                 "local_dir": str(self.model_dir),
                 "local_dir_use_symlinks": False,
                 "resume_download": True,
             }
             snapshot_download(**download_options)
             print("✅ Models downloaded successfully!")

         # Load models
         self.load_models()
     def load_models(self):
         """Load the frontend, LLM, flow model and vocoder from ``self.model_dir``.

         Sets ``self.models_loaded`` to True on success. On any failure the
         error and traceback are printed, ``self.models_loaded`` is set to
         False, and the exception is re-raised.
         """
         try:
             print("🔄 Loading GLM-TTS models...")
             ckpt_root = self.model_dir

             # Frontend
             print("Loading frontend...")
             tokenizer_dir = ckpt_root / "speech_tokenizer"
             campplus_file = ckpt_root / "frontend" / "campplus.onnx"
             self.frontend = CosyVoiceFrontEnd(
                 speech_tokenizer_model_dir=str(tokenizer_dir),
                 campplus_model_dir=str(campplus_file),
                 speech_tokenizer_config_path=str(tokenizer_dir / "config.json"),
             )

             # LLM: half precision when CUDA is available, full precision otherwise
             print("Loading LLM model...")
             if torch.cuda.is_available():
                 weights_dtype = torch.float16
             else:
                 weights_dtype = torch.float32
             self.llm_model = GLMTTSModel.from_pretrained(
                 str(ckpt_root / "llm"),
                 torch_dtype=weights_dtype,
             )
             self.llm_model = self.llm_model.to(self.device)
             self.llm_model.eval()

             # Flow model (TorchScript archive)
             print("Loading Flow model...")
             flow_file = ckpt_root / "flow" / "flow.pt"
             self.flow_model = torch.jit.load(str(flow_file), map_location=self.device)
             self.flow_model.eval()

             # Vocoder: use the HiFT checkpoint when present, otherwise Vocos
             print("Loading vocoder...")
             hift_file = ckpt_root / "hift" / "hift.pt"
             if not hift_file.exists():
                 vocos_file = ckpt_root / "vocos2d" / "generator_jit.ckpt"
                 self.vocoder = load_vocos(str(vocos_file), self.device)
             else:
                 self.vocoder = load_hift(str(hift_file), self.device)

             print("✅ All models loaded successfully!")
             self.models_loaded = True
         except Exception as load_error:
             import traceback
             print(f"❌ Error loading models: {load_error}")
             traceback.print_exc()
             self.models_loaded = False
             raise
     def process_reference_audio(self, audio_path):
         """Process reference audio for voice cloning"""
         try:
+            # Use frontend to process audio
+            prompt_speech_16k = load_wav(audio_path, 16000)
+            # Extract features
+            tts_speech_token = self.frontend.extract_speech_token(prompt_speech_16k)
+            embedding = self.frontend.extract_spk_embedding(prompt_speech_16k)
+            return {
+                'speech_token': tts_speech_token,
+                'embedding': embedding
+            }
         except Exception as e:
             print(f"Error processing reference audio: {e}")
+            return None
     def synthesize(
         self,
         text: str,
         ref_audio_path: str = None,
+        ref_text: str = "",
         speed: float = 1.0
     ):
         """
         Args:
             text: Text to synthesize
+            ref_audio_path: Reference audio for voice cloning (optional)
+            ref_text: Transcript of reference audio (optional)
             speed: Speech speed multiplier
         Returns:
             tuple: (audio_file_path, status_message)
         """
         if not self.models_loaded:
+            return None, "❌ Models not loaded properly"
         try:
+            print(f"🎙️ Synthesizing: '{text[:100]}...'")
             # Process reference audio if provided
+            prompt_data = None
+            if ref_audio_path and os.path.exists(ref_audio_path):
+                print("Processing reference audio...")
+                prompt_data = self.process_reference_audio(ref_audio_path)
+                if prompt_data is None:
                     return None, "❌ Failed to process reference audio"
+            # Prepare input
+            print("Preparing text input...")
+            text_input = self.frontend.text_normalize(text, split=True)
+            # Generate with LLM
+            print("Generating speech tokens...")
+            with torch.no_grad():
+                # Create input for LLM
+                if prompt_data:
+                    # Zero-shot with reference
+                    model_input = self.frontend.frontend_zero_shot(
+                        text_input,
+                        prompt_data['speech_token'],
+                        prompt_data['embedding']
+                    )
+                else:
+                    # Basic TTS without reference
+                    model_input = self.frontend.frontend_sft(text_input)
+                # Move to device
+                for key in model_input:
+                    if isinstance(model_input[key], torch.Tensor):
+                        model_input[key] = model_input[key].to(self.device)
+                # Generate speech tokens
+                speech_token = self.llm_model.generate(
+                    **model_input,
+                    max_new_tokens=2000,
+                    do_sample=True,
+                    temperature=0.8,
+                    top_k=20,
+                    top_p=0.95,
+                )
+                # Convert tokens to mel-spectrogram using Flow
+                print("Converting to mel-spectrogram...")
+                mel = self.flow_model(speech_token)
+                # Convert mel to audio using vocoder
+                print("Generating audio waveform...")
+                audio = self.vocoder(mel)
+            # Convert to numpy and save
+            audio_np = audio.squeeze().cpu().numpy()
+            # Apply speed adjustment if needed
+            if speed != 1.0:
+                import librosa
+                audio_np = librosa.effects.time_stretch(audio_np, rate=1.0/speed)
+            # Save to temporary file
+            output_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
+            torchaudio.save(
+                output_path,
+                torch.from_numpy(audio_np).unsqueeze(0),
+                22050
+            )
+            print(f"✅ Audio saved to {output_path}")
+            return output_path, "✅ Success! Audio generated successfully."
         except Exception as e:
+            import traceback
+            error_msg = f"❌ Error during synthesis: {str(e)}\n{traceback.format_exc()}"
+            print(error_msg)
+            return None, error_msg
# Build the shared model instance at import time; a failure is recorded in
# model_status instead of aborting the app.
print("🚀 Initializing GLM-TTS...")
tts_model, model_status = None, "⏳ Loading..."
try:
    tts_model = GLMTTSInference()
except Exception as init_error:
    import traceback
    model_status = f"❌ Failed to load: {str(init_error)}"
    print(f"Failed to initialize: {init_error}")
    traceback.print_exc()
else:
    model_status = "✅ Ready! (Note: CPU inference is slow)"
 def generate_speech(text, ref_audio, speed):
     """Gradio interface function"""
+    if tts_model is None or not tts_model.models_loaded:
+        return None, f"❌ Model not available.\n\n{model_status}\n\n💡 This may require GPU resources or additional setup."
     if not text or len(text.strip()) == 0:
         return None, "⚠️ Please enter text to synthesize"
+    # Synthesize
     audio_path, message = tts_model.synthesize(
         text=text,
         ref_audio_path=ref_audio,
 # Create Gradio Interface
 with gr.Blocks(
     title="GLM-TTS Voice Cloning",
+    theme=gr.themes.Soft(),
+    css="""
+    .gradio-container {max-width: 1200px !important}
+    .status-box {font-family: monospace; font-size: 12px;}
+    """
 ) as demo:
     gr.Markdown("""
+    # 🎙️ GLM-TTS: Zero-Shot Voice Cloning & Text-to-Speech
     **State-of-the-art voice cloning** with just 3-10 seconds of audio!
     ### ⚡ Features:
+    - 🎯 **Zero-shot cloning** - Clone any voice without training
     - 🌏 **Bilingual** - Chinese & English support
+    - 🎭 **Emotion control** - Natural & expressive speech
+    - ⚡ **High quality** - Best-in-class among open-source models
     """)
     gr.Markdown(f"""
+    <div style="padding: 10px; background-color: #f0f0f0; border-radius: 5px; margin: 10px 0;">
+    <strong>🔧 Model Status:</strong> {model_status}
+    </div>
     """)
     with gr.Row():
         with gr.Column(scale=1):
             text_input = gr.Textbox(
                 label="📝 Text to Synthesize",
+                placeholder="Enter text here (Chinese or English)...\n\nExample: Hello! This is a demonstration of GLM-TTS voice cloning.",
                 lines=6,
                 value="Hello! This is GLM-TTS, a powerful text-to-speech system with zero-shot voice cloning capabilities."
             )
+            with gr.Accordion("🎵 Voice Cloning (Optional)", open=True):
                 ref_audio_input = gr.Audio(
+                    label="Reference Audio (3-10 seconds recommended)",
                     type="filepath",
                     sources=["upload", "microphone"]
                 )
+                gr.Markdown("*Upload audio of the voice you want to clone. Leave empty for default voice.*")
             with gr.Accordion("⚙️ Advanced Settings", open=False):
                 speed_slider = gr.Slider(
                     minimum=0.5,
                     maximum=2.0,
                     value=1.0,
+                    step=0.1,
+                    info="Adjust speaking speed (1.0 = normal)"
                 )
             generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
+            gr.Markdown("""
+            ### ⚠️ Note:
+            - **CPU inference is very slow** (~5-10 minutes per generation)
+            - For faster results, use GPU-enabled spaces
+            - First generation may take longer as models initialize
+            """)
         with gr.Column(scale=1):
             audio_output = gr.Audio(
                 type="filepath"
             )
             status_output = gr.Textbox(
+                label="📊 Status / Logs",
+                lines=8,
+                interactive=False,
+                elem_classes=["status-box"]
             )
     # Examples
     gr.Markdown("### 📚 Example Texts")
     gr.Examples(
         examples=[
+            ["Hello! Welcome to GLM-TTS voice cloning system.", None, 1.0],
+            ["欢迎使用GLM-TTS语音合成系统！", None, 1.0],
+            ["Artificial intelligence is transforming our world.", None, 1.0],
+            ["人工智能正在改变世界，语音合成技术也在不断进步。", None, 1.0],
+            ["This is a test of zero-shot voice cloning technology.", None, 1.0],
         ],
         inputs=[text_input, ref_audio_input, speed_slider],
         outputs=[audio_output, status_output],
     gr.Markdown("""
     ---
+    ### 💡 Tips for Best Results:
+    - **Clear audio**: Use high-quality audio with minimal background noise
+    - **Optimal length**: 3-10 seconds of reference audio works best
+    - **Languages**: Excellent Chinese support, good English support
+    - **Mixed text**: Supports Chinese-English mixed sentences
+    - **Speed control**: Adjust from 0.5x (slow) to 2.0x (fast)
     ### 🔗 Resources:
+    - [GitHub Repository](https://github.com/zai-org/GLM-TTS)
+    - [Model Card on HuggingFace](https://huggingface.co/zai-org/GLM-TTS)
+    - [Official Demo](https://audio.z.ai)
+    ### 📊 Performance:
+    - **Character Error Rate**: 0.89 (best among open-source)
+    - **Speaker Similarity**: 76.4
+    - **Architecture**: LLM + Flow Matching + Vocoder
+    - **Model Size**: ~8.9 GB
     ### 📄 Citation:
     ```bibtex
     @misc{glmtts2025,
+      title={GLM-TTS: Controllable & Emotion-Expressive Zero-shot TTS},
+      author={Zhipu AI CogAudio Group},
+      year={2025}
     }
     ```
     """)
     generate_btn.click(
         fn=generate_speech,
         inputs=[text_input, ref_audio_input, speed_slider],
+        outputs=[audio_output, status_output],
+        api_name="generate"
     )
 # Launch
 if __name__ == "__main__":
     # Enable the request queue (at most 20 waiting jobs), then start the
     # server with the options below.
     demo.queue(max_size=20)
     launch_options = {
         "server_name": "0.0.0.0",
         "server_port": 7860,
         "share": False,
         "show_error": True,
     }
     demo.launch(**launch_options)

requirements.txt CHANGED Viewed

@@ -16,7 +16,6 @@ omegaconf>=2.3.0
 WeTextProcessing
 soxr
 matplotlib>=3.7.0
-encodec
 tensorboard
 tensorboardX
 kaldiio
@@ -27,4 +26,8 @@ inflect
 eng_to_ipa
 unidecode
 g2p_en
-regex

 WeTextProcessing
 soxr
 matplotlib>=3.7.0
 tensorboard
 tensorboardX
 kaldiio
 eng_to_ipa
 unidecode
 g2p_en
+regex
+safetensors
+accelerate
+sentencepiece
+protobuf