import gradio as gr
import torch
import numpy as np
import tempfile
import os
from pathlib import Path
from sam_audio import SAMAudio, SAMAudioProcessor
from torchcodec.decoders import VideoDecoder
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# Global variables to store the model and processor
model = None
processor = None
device = None

# Custom theme for a professional UI
custom_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="indigo",
    neutral_hue="slate",
    font=gr.themes.GoogleFont("Inter"),
    text_size="lg",
    spacing_size="lg",
    radius_size="md",
).set(
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
    block_title_text_weight="600",
)


def load_models():
    """Load the SAM-Audio model and processor."""
    global model, processor, device
    if model is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Loading SAM-Audio model on {device}...")
        model = SAMAudio.from_pretrained("facebook/sam-audio-large").to(device).eval()
        processor = SAMAudioProcessor.from_pretrained("facebook/sam-audio-large")
        print("Models loaded successfully!")
    return "Models loaded and ready for audio separation!"


def create_mask_from_video(video_path, prompt_text):
    """
    Create a mask for the prompted object (simplified version for this demo).

    In a real implementation, the mask would come from an actual SAM3 model
    run on the video with `prompt_text`.
    """
    try:
        # Load the video only to get its dimensions
        decoder = VideoDecoder(video_path)
        frames = decoder[:]  # (num_frames, channels, height, width)
        height, width = frames.shape[2], frames.shape[3]

        # Create a simple mock mask (this would be replaced with actual SAM3 output).
        # For the demo, the mask covers the left half of every frame.
        mask = np.zeros((len(decoder), 1, height, width), dtype=bool)
        mask[:, :, :, : width // 2] = True  # Left-half mask
        return mask
    except Exception as e:
        print(f"Error creating mask: {e}")
        # Fall back to an empty mask if anything goes wrong
        decoder = VideoDecoder(video_path)
        frames = decoder[:]
        height, width = frames.shape[2], frames.shape[3]
        return np.zeros((len(decoder), 1, height, width), dtype=bool)


def separate_audio_with_visual_prompting(video_file, prompt_text, progress=gr.Progress()):
    """Separate audio using visual prompting with SAM3-style masks."""
    global model, processor, device

    # Ensure models are loaded
    if model is None:
        load_models()

    try:
        # Resolve the input to a path on disk
        if isinstance(video_file, str):
            video_path = video_file
        else:
            # Save the uploaded bytes to a temporary file
            with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
                temp_file.write(video_file)
                video_path = temp_file.name

        progress(0.1, "Creating visual mask...")
        # Create the mask (simplified SAM3 stand-in for this demo)
        mask = create_mask_from_video(video_path, prompt_text)
        mask_tensor = torch.from_numpy(mask)

        progress(0.3, "Loading video frames...")
        decoder = VideoDecoder(video_path)
        frames = decoder[:]

        progress(0.5, "Processing with SAM-Audio model...")
        # Build model inputs with visual prompting
        inputs = processor(
            audios=[video_path],
            descriptions=[""],  # Empty description for visual-only prompting
            masked_videos=processor.mask_videos([frames], [mask_tensor]),
        ).to(device)

        progress(0.7, "Separating audio...")
        with torch.inference_mode():
            result = model.separate(inputs)

        progress(0.9, "Processing results...")
        # Convert results to numpy arrays for playback
        target_audio = result.target[0].cpu().numpy()
        residual_audio = result.residual[0].cpu().numpy()
        # Clean up the temporary file
        if not isinstance(video_file, str):
            os.unlink(video_path)

        progress(1.0, "Audio separation complete!")
        # Return values in the order of the click handler's outputs:
        # target audio, residual audio, status message
        return (
            (48000, target_audio),
            (48000, residual_audio),
            "Success: Audio separation completed!",
        )
    except Exception as e:
        error_msg = f"Error during audio separation: {str(e)}"
        print(error_msg)
        return None, None, error_msg


def simple_audio_separation(video_file, progress=gr.Progress()):
    """Simple audio separation without visual prompting."""
    global model, processor, device

    # Ensure models are loaded
    if model is None:
        load_models()

    try:
        # Resolve the input to a path on disk
        if isinstance(video_file, str):
            video_path = video_file
        else:
            # Save the uploaded bytes to a temporary file
            with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
                temp_file.write(video_file)
                video_path = temp_file.name

        progress(0.3, "Processing with SAM-Audio model...")
        # Build model inputs without visual prompting
        inputs = processor(
            audios=[video_path],
            descriptions=["Separate the main audio source"],
        ).to(device)

        progress(0.6, "Separating audio...")
        with torch.inference_mode():
            result = model.separate(inputs)

        progress(0.9, "Processing results...")
        # Convert results to numpy arrays for playback
        target_audio = result.target[0].cpu().numpy()
        residual_audio = result.residual[0].cpu().numpy()

        # Clean up the temporary file
        if not isinstance(video_file, str):
            os.unlink(video_path)

        progress(1.0, "Audio separation complete!")
        # Return values in the order of the click handler's outputs
        return (
            (48000, target_audio),
            (48000, residual_audio),
            "Success: Audio separation completed!",
        )
    except Exception as e:
        error_msg = f"Error during audio separation: {str(e)}"
        print(error_msg)
        return None, None, error_msg


# Create the Gradio interface
with gr.Blocks(title="SAM Audio Large - Audio Separation", theme=custom_theme) as demo:
    gr.Markdown("""
    # 🎵 SAM Audio Large - Audio Separation

    This demo showcases the SAM Audio Large model for audio separation with visual prompting capabilities.
    Upload a video and separate audio sources using text prompts that identify visual objects.

    **Built with anycoder** - [Visit our Space](https://huggingface.co/spaces/akhaliq/anycoder)
    """)

    # Load models on demand
    load_btn = gr.Button("Load Models", variant="primary")
    model_status = gr.Textbox(label="Model Status", interactive=False)
    load_btn.click(fn=load_models, outputs=model_status)

    with gr.Tabs():
        # Tab 1: Visual Prompting
        with gr.Tab("Visual Prompting"):
            gr.Markdown("""
            ## 🎥 Visual Prompting for Audio Separation

            Use text prompts to identify visual objects in the video and separate their associated audio.
""") with gr.Row(): with gr.Column(): video_input = gr.Video(label="Upload Video", sources=["upload"]) prompt_input = gr.Textbox( label="Visual Prompt", placeholder="e.g., 'The person on the left', 'The guitar player', 'The car engine'", lines=2 ) separate_btn = gr.Button("Separate Audio with Visual Prompt", variant="primary") status_output = gr.Textbox(label="Status", interactive=False) with gr.Column(): target_audio_output = gr.Audio(label="Target Audio (Separated)", type="numpy") residual_audio_output = gr.Audio(label="Residual Audio", type="numpy") separate_btn.click( fn=separate_audio_with_visual_prompting, inputs=[video_input, prompt_input], outputs=[target_audio_output, residual_audio_output, status_output], api_visibility="public" ) # Tab 2: Simple Audio Separation with gr.Tab("Simple Audio Separation"): gr.Markdown(""" ## 🎵 Simple Audio Separation Basic audio separation without visual prompting. """) with gr.Row(): with gr.Column(): simple_video_input = gr.Video(label="Upload Video", sources=["upload"]) simple_separate_btn = gr.Button("Separate Audio", variant="primary") simple_status_output = gr.Textbox(label="Status", interactive=False) with gr.Column(): simple_target_audio_output = gr.Audio(label="Target Audio (Separated)", type="numpy") simple_residual_audio_output = gr.Audio(label="Residual Audio", type="numpy") simple_separate_btn.click( fn=simple_audio_separation, inputs=[simple_video_input], outputs=[simple_target_audio_output, simple_residual_audio_output, simple_status_output], api_visibility="public" ) # Tab 3: About with gr.Tab("About"): gr.Markdown(""" ## 📋 About SAM Audio Large **SAM Audio Large** is a state-of-the-art audio separation model that can isolate specific audio sources from complex mixtures. ### Features: - **Visual Prompting**: Use text descriptions to identify visual objects and separate their associated audio - **High Quality**: Produces clean audio separations with minimal artifacts - **Flexible**: Works with various audio sources and video formats ### How to Use: 1. **Visual Prompting**: Upload a video and provide a text prompt describing the visual object you want to isolate 2. **Simple Separation**: Upload a video for basic audio separation without visual guidance 3. The model will process the video and return separated audio tracks ### Technical Details: - Model: `facebook/sam-audio-large` - Sampling Rate: 48kHz - Processing: GPU-accelerated for fast inference ### Limitations: - Visual prompting requires SAM3 for mask generation (simplified in this demo) - Processing time depends on video length and complexity - Best results with clear visual-audio associations **Built with anycoder** - [Visit our Space](https://huggingface.co/spaces/akhaliq/anycoder) """) # Add examples gr.Examples( examples=[ ["https://gradio-builds.s3.amazonaws.com/assets/sample_video.mp4", "The person speaking"], ["https://gradio-builds.s3.amazonaws.com/assets/music_video.mp4", "The guitar player"], ], inputs=[video_input, prompt_input], outputs=[target_audio_output, residual_audio_output, status_output], label="Example Videos", examples_per_page=4 ) # Launch the app with Gradio 6 syntax demo.launch( theme=custom_theme, footer_links=[ {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"}, {"label": "GitHub", "url": "https://github.com/facebookresearch/sam-audio"}, {"label": "Model Card", "url": "https://huggingface.co/facebook/sam-audio-large"} ], show_error=True, debug=False )