import gradio as gr
import torch
import numpy as np
import tempfile
import os
from pathlib import Path
from sam_audio import SAMAudio, SAMAudioProcessor
from torchcodec.decoders import VideoDecoder
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# Global variables to store model and processor
model = None
processor = None
device = None
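
# The model is loaded lazily: either via the "Load Models" button in the UI
# or automatically on the first separation request.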

# Custom theme for professional UI
custom_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="indigo",
    neutral_hue="slate",
    font=gr.themes.GoogleFont("Inter"),
    text_size="lg",
    spacing_size="lg",
    radius_size="md",
).set(
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
    block_title_text_weight="600",
)

def load_models():
    """Load the SAM-Audio model and processor."""
    global model, processor, device
    if model is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Loading SAM-Audio model on {device}...")
        model = SAMAudio.from_pretrained("facebook/sam-audio-large").to(device).eval()
        processor = SAMAudioProcessor.from_pretrained("facebook/sam-audio-large")
        print("Models loaded successfully!")
    return "Models loaded and ready for audio separation!"

def create_mask_from_video(video_path, prompt_text):
    """
    Create a mask for the prompted object (simplified placeholder for the demo).
    A real implementation would run the actual SAM3 model on the video frames.
    """
    try:
        # Load the video to get its dimensions.
        # torchcodec's VideoDecoder returns frames as (num_frames, channels, height, width).
        decoder = VideoDecoder(video_path)
        frames = decoder[:]
        height, width = frames.shape[2], frames.shape[3]

        # Placeholder mask (would be replaced with actual SAM3 output):
        # mark the left half of every frame as the target object.
        mask = np.zeros((len(decoder), 1, height, width), dtype=bool)
        mask[:, :, :, : width // 2] = True
        return mask
    except Exception as e:
        print(f"Error creating mask: {e}")
        # Fall back to an empty (all-False) mask of the right shape.
        decoder = VideoDecoder(video_path)
        frames = decoder[:]
        height, width = frames.shape[2], frames.shape[3]
        return np.zeros((len(decoder), 1, height, width), dtype=bool)
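
# --- Optional helper (an addition for clarity; not used by the demo and not part
# of the sam_audio API): a minimal sketch of how a single (height, width) boolean
# mask produced by any segmentation model could be expanded to the
# (num_frames, 1, height, width) layout that create_mask_from_video returns.
def expand_frame_mask(frame_mask: np.ndarray, num_frames: int) -> np.ndarray:
    """Repeat one 2D boolean mask across all video frames."""
    frame_mask = np.asarray(frame_mask, dtype=bool)
    return np.broadcast_to(
        frame_mask[None, None, :, :], (num_frames, 1, *frame_mask.shape)
    ).copy()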

def separate_audio_with_visual_prompting(video_file, prompt_text, progress=gr.Progress()):
    """
    Separate audio using visual prompting with SAM3 masks.
    """
    global model, processor, device

    # Ensure models are loaded
    if model is None:
        load_models()

    try:
        # Use the path directly, or save an uploaded file to a temp location
        if isinstance(video_file, str):
            video_path = video_file
        else:
            with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
                temp_file.write(video_file)
                video_path = temp_file.name

        progress(0.1, "Creating visual mask...")
        # Create mask using SAM3 (simplified for demo)
        mask = create_mask_from_video(video_path, prompt_text)
        mask_tensor = torch.from_numpy(mask)

        progress(0.3, "Loading video frames...")
        decoder = VideoDecoder(video_path)
        frames = decoder[:]

        progress(0.5, "Processing with SAM-Audio model...")
        # Process with visual prompting (empty description for visual-only prompting)
        inputs = processor(
            audios=[video_path],
            descriptions=[""],
            masked_videos=processor.mask_videos([frames], [mask_tensor]),
        ).to(device)

        progress(0.7, "Separating audio...")
        with torch.inference_mode():
            result = model.separate(inputs)

        progress(0.9, "Processing results...")
        # Convert results to numpy arrays for playback
        target_audio = result.target[0].cpu().numpy()
        residual_audio = result.residual[0].cpu().numpy()

        # Clean up temp file
        if not isinstance(video_file, str):
            os.unlink(video_path)

        progress(1.0, "Audio separation complete!")
        # gr.Audio(type="numpy") expects a (sample_rate, ndarray) tuple
        return (48000, target_audio), (48000, residual_audio), "Success: Audio separation completed!"
    except Exception as e:
        error_msg = f"Error during audio separation: {str(e)}"
        print(error_msg)
        return None, None, error_msg
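
# Note: both separation functions return (target_audio, residual_audio, status),
# matching the order of the `outputs` lists wired to the buttons in the UI below.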

def simple_audio_separation(video_file, progress=gr.Progress()):
    """
    Simple audio separation without visual prompting.
    """
    global model, processor, device

    # Ensure models are loaded
    if model is None:
        load_models()

    try:
        # Use the path directly, or save an uploaded file to a temp location
        if isinstance(video_file, str):
            video_path = video_file
        else:
            with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
                temp_file.write(video_file)
                video_path = temp_file.name

        progress(0.3, "Processing with SAM-Audio model...")
        # Process without visual prompting
        inputs = processor(
            audios=[video_path],
            descriptions=["Separate the main audio source"],
        ).to(device)

        progress(0.6, "Separating audio...")
        with torch.inference_mode():
            result = model.separate(inputs)

        progress(0.9, "Processing results...")
        # Convert results to numpy arrays for playback
        target_audio = result.target[0].cpu().numpy()
        residual_audio = result.residual[0].cpu().numpy()

        # Clean up temp file
        if not isinstance(video_file, str):
            os.unlink(video_path)

        progress(1.0, "Audio separation complete!")
        return (48000, target_audio), (48000, residual_audio), "Success: Audio separation completed!"
    except Exception as e:
        error_msg = f"Error during audio separation: {str(e)}"
        print(error_msg)
        return None, None, error_msg

# Create the Gradio interface
with gr.Blocks(title="SAM Audio Large - Audio Separation", theme=custom_theme) as demo:
    gr.Markdown("""
    # 🎵 SAM Audio Large - Audio Separation

    This demo showcases the SAM Audio Large model for audio separation with visual prompting capabilities.
    Upload a video and separate audio sources using text prompts to identify visual objects.

    **Built with anycoder** - [Visit our Space](https://huggingface.co/spaces/akhaliq/anycoder)
    """)

    # Load the models via a button (they are also loaded lazily on first use)
    load_btn = gr.Button("Load Models", variant="primary")
    model_status = gr.Textbox(label="Model Status", interactive=False)
    load_btn.click(fn=load_models, outputs=model_status)

    with gr.Tabs():
        # Tab 1: Visual Prompting
        with gr.Tab("Visual Prompting"):
            gr.Markdown("""
            ## 🎥 Visual Prompting for Audio Separation

            Use text prompts to identify visual objects in the video and separate their associated audio.
            """)
            with gr.Row():
                with gr.Column():
                    video_input = gr.Video(label="Upload Video", sources=["upload"])
                    prompt_input = gr.Textbox(
                        label="Visual Prompt",
                        placeholder="e.g., 'The person on the left', 'The guitar player', 'The car engine'",
                        lines=2,
                    )
                    separate_btn = gr.Button("Separate Audio with Visual Prompt", variant="primary")
                    status_output = gr.Textbox(label="Status", interactive=False)
                with gr.Column():
                    target_audio_output = gr.Audio(label="Target Audio (Separated)", type="numpy")
                    residual_audio_output = gr.Audio(label="Residual Audio", type="numpy")
            separate_btn.click(
                fn=separate_audio_with_visual_prompting,
                inputs=[video_input, prompt_input],
                outputs=[target_audio_output, residual_audio_output, status_output],
                api_visibility="public",
            )

        # Tab 2: Simple Audio Separation
        with gr.Tab("Simple Audio Separation"):
            gr.Markdown("""
            ## 🎵 Simple Audio Separation

            Basic audio separation without visual prompting.
            """)
            with gr.Row():
                with gr.Column():
                    simple_video_input = gr.Video(label="Upload Video", sources=["upload"])
                    simple_separate_btn = gr.Button("Separate Audio", variant="primary")
                    simple_status_output = gr.Textbox(label="Status", interactive=False)
                with gr.Column():
                    simple_target_audio_output = gr.Audio(label="Target Audio (Separated)", type="numpy")
                    simple_residual_audio_output = gr.Audio(label="Residual Audio", type="numpy")
            simple_separate_btn.click(
                fn=simple_audio_separation,
                inputs=[simple_video_input],
                outputs=[simple_target_audio_output, simple_residual_audio_output, simple_status_output],
                api_visibility="public",
            )

        # Tab 3: About
        with gr.Tab("About"):
            gr.Markdown("""
            ## About SAM Audio Large

            **SAM Audio Large** is a state-of-the-art audio separation model that can isolate specific audio sources from complex mixtures.

            ### Features:
            - **Visual Prompting**: Use text descriptions to identify visual objects and separate their associated audio
            - **High Quality**: Produces clean audio separations with minimal artifacts
            - **Flexible**: Works with various audio sources and video formats

            ### How to Use:
            1. **Visual Prompting**: Upload a video and provide a text prompt describing the visual object you want to isolate
            2. **Simple Separation**: Upload a video for basic audio separation without visual guidance
            3. The model will process the video and return separated audio tracks

            ### Technical Details:
            - Model: `facebook/sam-audio-large`
            - Sampling Rate: 48 kHz
            - Processing: GPU-accelerated for fast inference

            ### Limitations:
            - Visual prompting requires SAM3 for mask generation (simplified in this demo)
            - Processing time depends on video length and complexity
            - Best results with clear visual-audio associations

            **Built with anycoder** - [Visit our Space](https://huggingface.co/spaces/akhaliq/anycoder)
            """)

    # Add examples
    gr.Examples(
        examples=[
            ["https://gradio-builds.s3.amazonaws.com/assets/sample_video.mp4", "The person speaking"],
            ["https://gradio-builds.s3.amazonaws.com/assets/music_video.mp4", "The guitar player"],
        ],
        inputs=[video_input, prompt_input],
        outputs=[target_audio_output, residual_audio_output, status_output],
        label="Example Videos",
        examples_per_page=4,
    )

# Launch the app with Gradio 6 syntax
# (the custom theme is already applied on the Blocks constructor above)
demo.launch(
    footer_links=[
        {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
        {"label": "GitHub", "url": "https://github.com/facebookresearch/sam-audio"},
        {"label": "Model Card", "url": "https://huggingface.co/facebook/sam-audio-large"},
    ],
    show_error=True,
    debug=False,
)