throgletworld committed on
Commit b8adbd4 · verified · 1 Parent(s): 47a6dd6

Upload 3 files

Files changed (3)
  1. app.py +31 -23
  2. packages.txt +2 -0
  3. requirements.txt +1 -0
app.py CHANGED
@@ -17,6 +17,7 @@ import tempfile
 import os
 import json
 import soundfile as sf
+import librosa
 from datetime import datetime
 from transformers import WavLMModel
 import torch.nn as nn
@@ -112,30 +113,31 @@ def load_models():
 # ============================================================================
 
 def preprocess_audio(audio_path):
-    """Convert audio to 16kHz mono using soundfile to avoid torchcodec."""
-    # Read audio file with soundfile
-    waveform_np, sr = sf.read(audio_path, dtype='float32')
-
-    # Convert numpy array to torch tensor
-    waveform = torch.from_numpy(waveform_np).float()
-
-    # Add channel dimension if it's mono
-    if waveform.dim() == 1:
-        waveform = waveform.unsqueeze(0)
-    # Transpose if it's (samples, channels)
-    elif waveform.shape[1] < waveform.shape[0]:
-        waveform = waveform.T
+    """Convert audio to 16kHz mono using soundfile or librosa."""
+    try:
+        # Try loading with soundfile first (faster)
+        waveform_np, sr = sf.read(audio_path, dtype='float32')
+
+        # Handle multi-channel (soundfile returns (samples, channels))
+        if len(waveform_np.shape) > 1:
+            waveform_np = waveform_np.mean(axis=1)
+
+    except Exception as e:
+        print(f"Soundfile load failed, trying librosa: {e}")
+        # Fallback to librosa (handles mp3/m4a better via ffmpeg)
+        # librosa loads as mono by default, and we can force sr=16000 here
+        waveform_np, sr = librosa.load(audio_path, sr=16000, mono=True)
 
-    # Convert to mono
-    if waveform.shape[0] > 1:
-        waveform = waveform.mean(dim=0, keepdim=True)
+    # Convert to tensor
+    waveform = torch.from_numpy(waveform_np).float()
 
-    # Resample to 16kHz
+    # Resample if needed (only if soundfile was used and sr != 16000)
+    # If librosa was used, it's already 16000
     if sr != 16000:
         resampler = torchaudio.transforms.Resample(sr, 16000)
-        waveform = resampler(waveform)
+        waveform = resampler(waveform.unsqueeze(0)).squeeze(0)
 
-    return waveform.squeeze(0), 16000
+    return waveform, 16000
 
 def chunk_audio(waveform, sr, chunk_sec=3.0):
     """Split audio into chunks"""
@@ -194,12 +196,15 @@ def analyze_audio(audio_file, threshold=0.5):
     load_models()
 
     if audio_file is None:
-        return "Please upload an audio file", "", "", ""
+        return "⚠️ Please upload an audio file", "", "", ""
 
     try:
+        print(f"Starting analysis of: {audio_file}")
+
         # Preprocess
         waveform, sr = preprocess_audio(audio_file)
         duration = len(waveform) / sr
+        print(f"Audio preprocessed: {duration:.1f}s, {sr}Hz")
 
         # Chunk and analyze with WavLM
         chunks = chunk_audio(waveform, sr)
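The hunk above hands the preprocessed waveform to `chunk_audio`, whose body is not part of this diff. As a rough sketch only, a fixed-window chunker matching the signature `chunk_audio(waveform, sr, chunk_sec=3.0)` could look like the following; the repository's actual implementation may differ (overlap, padding, minimum-length filtering):

```python
def chunk_audio(waveform, sr, chunk_sec=3.0):
    """Split a 1-D waveform tensor into consecutive chunk_sec-second windows."""
    chunk_len = int(sr * chunk_sec)  # 48000 samples at 16 kHz
    # Assumption: non-overlapping windows, trailing partial chunk kept
    return [waveform[i:i + chunk_len] for i in range(0, len(waveform), chunk_len)]
```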
@@ -288,13 +293,16 @@ def analyze_audio(audio_file, threshold=0.5):
         return summary, annotated_text, timeline_text, definitions
 
     except Exception as e:
-        return f"Error: {str(e)}", "", "", ""
+        import traceback
+        error_trace = traceback.format_exc()
+        print(f"Error in analyze_audio: {error_trace}")
+        return f"❌ Error: {str(e)}\n\n```\n{error_trace}\n```", "", "", ""
 
 # ============================================================================
 # GRADIO INTERFACE
 # ============================================================================
 
-with gr.Blocks(title="🎙️ Stutter Analysis", theme=gr.themes.Soft()) as demo:
+with gr.Blocks(title="🎙️ Stutter Analysis") as demo:
     gr.Markdown("""
     # 🎙️ Speech Fluency Analysis System
 
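The new except-branch logs the full traceback and also returns it to the UI inside a fenced block. If a visible failure is preferred over a formatted return value, Gradio's built-in `gr.Error` is an alternative worth noting; a minimal sketch of that pattern (the function name is illustrative):

```python
import traceback

import gradio as gr

def analyze_audio_strict(audio_file, threshold=0.5):
    try:
        ...  # preprocessing and analysis as in app.py
    except Exception:
        # Keep the full traceback in the server logs
        print(f"Error in analyze_audio: {traceback.format_exc()}")
        # gr.Error surfaces the message as an error toast in the Gradio UI
        raise gr.Error("Analysis failed; check the Space logs for the traceback")
```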
@@ -351,4 +359,4 @@ with gr.Blocks(title="🎙️ Stutter Analysis", theme=gr.themes.Soft()) as demo
 load_models()
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch(theme=gr.themes.Soft())
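On the theme change: Gradio documents `theme` as a `gr.Blocks()` constructor argument, and `launch()` is not documented to accept one in Gradio 4.x, so the pre-change placement may be the more reliable of the two. A sketch of that form, trimmed to the relevant lines:

```python
import gradio as gr

# theme passed to the Blocks constructor, the documented location in Gradio 4.x
with gr.Blocks(title="🎙️ Stutter Analysis", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎙️ Speech Fluency Analysis System")

if __name__ == "__main__":
    demo.launch()
```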
packages.txt ADDED
@@ -0,0 +1,2 @@
+ffmpeg
+libsndfile1
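`ffmpeg` gives librosa (via audioread) a decoder for compressed formats such as mp3 and m4a, and `libsndfile1` is the shared library behind soundfile. A minimal startup check that fails fast if the apt packages did not install; the helper name is illustrative:

```python
import shutil

def check_audio_deps():
    """Verify the system-level audio dependencies declared in packages.txt."""
    if shutil.which("ffmpeg") is None:
        raise RuntimeError("ffmpeg not on PATH; mp3/m4a fallback loading will fail")
    import soundfile  # import raises OSError if libsndfile1 is missing
    print("Audio dependencies OK, libsndfile", soundfile.__libsndfile_version__)

check_audio_deps()
```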
requirements.txt CHANGED
@@ -8,3 +8,4 @@ gradio>=4.0.0
 openai-whisper>=20231117
 numpy>=1.24.0
 soundfile>=0.12.0
+librosa>=0.10.0