throgletworld committed on
Commit b8adbd4 · verified · 1 Parent(s): 47a6dd6

Upload 3 files

Files changed (3)
  1. app.py +31 -23
  2. packages.txt +2 -0
  3. requirements.txt +1 -0
app.py CHANGED
@@ -17,6 +17,7 @@ import tempfile
 import os
 import json
 import soundfile as sf
+import librosa
 from datetime import datetime
 from transformers import WavLMModel
 import torch.nn as nn
@@ -112,30 +113,31 @@ def load_models():
 # ============================================================================
 
 def preprocess_audio(audio_path):
-    """Convert audio to 16kHz mono using soundfile to avoid torchcodec."""
-    # Read audio file with soundfile
-    waveform_np, sr = sf.read(audio_path, dtype='float32')
-
-    # Convert numpy array to torch tensor
-    waveform = torch.from_numpy(waveform_np).float()
-
-    # Add channel dimension if it's mono
-    if waveform.dim() == 1:
-        waveform = waveform.unsqueeze(0)
-    # Transpose if it's (samples, channels)
-    elif waveform.shape[1] < waveform.shape[0]:
-        waveform = waveform.T
+    """Convert audio to 16kHz mono using soundfile or librosa."""
+    try:
+        # Try loading with soundfile first (faster)
+        waveform_np, sr = sf.read(audio_path, dtype='float32')
+
+        # Handle multi-channel (soundfile returns (samples, channels))
+        if len(waveform_np.shape) > 1:
+            waveform_np = waveform_np.mean(axis=1)
+
+    except Exception as e:
+        print(f"Soundfile load failed, trying librosa: {e}")
+        # Fallback to librosa (handles mp3/m4a better via ffmpeg)
+        # librosa loads as mono by default, and we can force sr=16000 here
+        waveform_np, sr = librosa.load(audio_path, sr=16000, mono=True)
 
-    # Convert to mono
-    if waveform.shape[0] > 1:
-        waveform = waveform.mean(dim=0, keepdim=True)
+    # Convert to tensor
+    waveform = torch.from_numpy(waveform_np).float()
 
-    # Resample to 16kHz
+    # Resample if needed (only if soundfile was used and sr != 16000)
+    # If librosa was used, it's already 16000
     if sr != 16000:
         resampler = torchaudio.transforms.Resample(sr, 16000)
-        waveform = resampler(waveform)
+        waveform = resampler(waveform.unsqueeze(0)).squeeze(0)
 
-    return waveform.squeeze(0), 16000
+    return waveform, 16000
 
 def chunk_audio(waveform, sr, chunk_sec=3.0):
     """Split audio into chunks"""
@@ -194,12 +196,15 @@ def analyze_audio(audio_file, threshold=0.5):
     load_models()
 
     if audio_file is None:
-        return "Please upload an audio file", "", "", ""
+        return "⚠️ Please upload an audio file", "", "", ""
 
     try:
+        print(f"Starting analysis of: {audio_file}")
+
         # Preprocess
         waveform, sr = preprocess_audio(audio_file)
         duration = len(waveform) / sr
+        print(f"Audio preprocessed: {duration:.1f}s, {sr}Hz")
 
         # Chunk and analyze with WavLM
         chunks = chunk_audio(waveform, sr)
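The hunk above hands the preprocessed waveform to `chunk_audio`, whose body is not part of this diff. As a rough sketch only, a fixed-window chunker matching the signature `chunk_audio(waveform, sr, chunk_sec=3.0)` could look like the following; the repository's actual implementation may differ (overlap, padding, minimum-length filtering):

```python
def chunk_audio(waveform, sr, chunk_sec=3.0):
    """Split a 1-D waveform tensor into consecutive chunk_sec-second windows."""
    chunk_len = int(sr * chunk_sec)  # 48000 samples at 16 kHz
    # Assumption: non-overlapping windows, trailing partial chunk kept
    return [waveform[i:i + chunk_len] for i in range(0, len(waveform), chunk_len)]
```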
@@ -288,13 +293,16 @@ def analyze_audio(audio_file, threshold=0.5):
         return summary, annotated_text, timeline_text, definitions
 
     except Exception as e:
-        return f"Error: {str(e)}", "", "", ""
+        import traceback
+        error_trace = traceback.format_exc()
+        print(f"Error in analyze_audio: {error_trace}")
+        return f"❌ Error: {str(e)}\n\n```\n{error_trace}\n```", "", "", ""
 
 # ============================================================================
 # GRADIO INTERFACE
 # ============================================================================
 
-with gr.Blocks(title="🎙️ Stutter Analysis", theme=gr.themes.Soft()) as demo:
+with gr.Blocks(title="🎙️ Stutter Analysis") as demo:
     gr.Markdown("""
     # 🎙️ Speech Fluency Analysis System
 
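The new except-branch logs the full traceback and also returns it to the UI inside a fenced block. If a visible failure is preferred over a formatted return value, Gradio's built-in `gr.Error` is an alternative worth noting; a minimal sketch of that pattern (the function name is illustrative):

```python
import traceback

import gradio as gr

def analyze_audio_strict(audio_file, threshold=0.5):
    try:
        ...  # preprocessing and analysis as in app.py
    except Exception:
        # Keep the full traceback in the server logs
        print(f"Error in analyze_audio: {traceback.format_exc()}")
        # gr.Error surfaces the message as an error toast in the Gradio UI
        raise gr.Error("Analysis failed; check the Space logs for the traceback")
```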
@@ -351,4 +359,4 @@ with gr.Blocks(title="🎙️ Stutter Analysis", theme=gr.themes.Soft()) as demo
 load_models()
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch(theme=gr.themes.Soft())
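On the theme change: Gradio documents `theme` as a `gr.Blocks()` constructor argument, and `launch()` is not documented to accept one in Gradio 4.x, so the pre-change placement may be the more reliable of the two. A sketch of that form, trimmed to the relevant lines:

```python
import gradio as gr

# theme passed to the Blocks constructor, the documented location in Gradio 4.x
with gr.Blocks(title="🎙️ Stutter Analysis", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎙️ Speech Fluency Analysis System")

if __name__ == "__main__":
    demo.launch()
```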
packages.txt ADDED
@@ -0,0 +1,2 @@
+ffmpeg
+libsndfile1
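`ffmpeg` gives librosa (via audioread) a decoder for compressed formats such as mp3 and m4a, and `libsndfile1` is the shared library behind soundfile. A minimal startup check that fails fast if the apt packages did not install; the helper name is illustrative:

```python
import shutil

def check_audio_deps():
    """Verify the system-level audio dependencies declared in packages.txt."""
    if shutil.which("ffmpeg") is None:
        raise RuntimeError("ffmpeg not on PATH; mp3/m4a fallback loading will fail")
    import soundfile  # import raises OSError if libsndfile1 is missing
    print("Audio dependencies OK, libsndfile", soundfile.__libsndfile_version__)

check_audio_deps()
```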
requirements.txt CHANGED
@@ -8,3 +8,4 @@ gradio>=4.0.0
 openai-whisper>=20231117
 numpy>=1.24.0
 soundfile>=0.12.0
+librosa>=0.10.0