Upload 3 files

app.py            +31 -23
packages.txt       +2  -0
requirements.txt   +1  -0
app.py
CHANGED
@@ -17,6 +17,7 @@ import tempfile
 import os
 import json
 import soundfile as sf
+import librosa
 from datetime import datetime
 from transformers import WavLMModel
 import torch.nn as nn
@@ -112,30 +113,31 @@ def load_models():
 # ============================================================================
 
 def preprocess_audio(audio_path):
-    """Convert audio to 16kHz mono using soundfile
-    [...]
+    """Convert audio to 16kHz mono using soundfile or librosa."""
+    try:
+        # Try loading with soundfile first (faster)
+        waveform_np, sr = sf.read(audio_path, dtype='float32')
+
+        # Handle multi-channel (soundfile returns (samples, channels))
+        if len(waveform_np.shape) > 1:
+            waveform_np = waveform_np.mean(axis=1)
+
+    except Exception as e:
+        print(f"Soundfile load failed, trying librosa: {e}")
+        # Fallback to librosa (handles mp3/m4a better via ffmpeg)
+        # librosa loads as mono by default, and we can force sr=16000 here
+        waveform_np, sr = librosa.load(audio_path, sr=16000, mono=True)
 
-    # Convert to
-    [...]
-    waveform = waveform.mean(dim=0, keepdim=True)
+    # Convert to tensor
+    waveform = torch.from_numpy(waveform_np).float()
 
-    # Resample
+    # Resample if needed (only if soundfile was used and sr != 16000)
+    # If librosa was used, it's already 16000
     if sr != 16000:
         resampler = torchaudio.transforms.Resample(sr, 16000)
-        waveform = resampler(waveform)
+        waveform = resampler(waveform.unsqueeze(0)).squeeze(0)
 
-    return waveform
+    return waveform, 16000
 
 def chunk_audio(waveform, sr, chunk_sec=3.0):
     """Split audio into chunks"""
@@ -194,12 +196,15 @@ def analyze_audio(audio_file, threshold=0.5):
     load_models()
 
     if audio_file is None:
-        return "Please upload an audio file", "", "", ""
+        return "⚠️ Please upload an audio file", "", "", ""
 
     try:
+        print(f"Starting analysis of: {audio_file}")
+
         # Preprocess
         waveform, sr = preprocess_audio(audio_file)
         duration = len(waveform) / sr
+        print(f"Audio preprocessed: {duration:.1f}s, {sr}Hz")
 
         # Chunk and analyze with WavLM
         chunks = chunk_audio(waveform, sr)
@@ -288,13 +293,16 @@ def analyze_audio(audio_file, threshold=0.5):
         return summary, annotated_text, timeline_text, definitions
 
     except Exception as e:
-        [...]
+        import traceback
+        error_trace = traceback.format_exc()
+        print(f"Error in analyze_audio: {error_trace}")
+        return f"❌ Error: {str(e)}\n\n```\n{error_trace}\n```", "", "", ""
 
 # ============================================================================
 # GRADIO INTERFACE
 # ============================================================================
 
-with gr.Blocks(title="🎙️ Stutter Analysis", theme=gr.themes.Soft()) as demo:
+with gr.Blocks(title="🎙️ Stutter Analysis") as demo:
     gr.Markdown("""
     # 🎙️ Speech Fluency Analysis System
 
@@ -351,4 +359,4 @@ with gr.Blocks(title="🎙️ Stutter Analysis", theme=gr.themes.Soft()) as demo
 load_models()
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch(theme=gr.themes.Soft())
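A quick local sanity check of the new loader (a sketch, not part of this commit): synthesize a short stereo 44.1 kHz WAV, run it through `preprocess_audio` as defined in the diff above, and confirm the mono/16 kHz contract. This assumes `app.py` is importable from the working directory; importing it also executes the module-level `load_models()` call, so the first run is slow.

```python
# Hypothetical round-trip test of preprocess_audio from the diff above.
import os
import tempfile

import numpy as np
import soundfile as sf

from app import preprocess_audio  # note: import also triggers load_models()

# Synthesize 2 s of stereo audio at 44.1 kHz
sr_in = 44100
t = np.linspace(0.0, 2.0, 2 * sr_in, endpoint=False)
stereo = np.stack(
    [np.sin(2 * np.pi * 440 * t), np.sin(2 * np.pi * 220 * t)], axis=1
).astype(np.float32)

tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
tmp.close()
sf.write(tmp.name, stereo, sr_in)

waveform, sr = preprocess_audio(tmp.name)
os.remove(tmp.name)

assert sr == 16000          # both loader paths now return 16 kHz
assert waveform.dim() == 1  # channels were averaged down to a 1-D mono tensor
assert abs(len(waveform) / sr - 2.0) < 0.01  # duration survives resampling
print(f"OK: {len(waveform)} samples at {sr} Hz")
```

Both exit paths of the rewritten function return the tuple `(waveform, 16000)`, so the unchanged `waveform, sr = preprocess_audio(audio_file)` line in `analyze_audio` now unpacks cleanly; the old version returned a bare tensor, which is presumably the crash this commit fixes. One caveat worth flagging: the theme moved from `gr.Blocks(...)` into `demo.launch(...)`, but in Gradio 4.x the theme is a `gr.Blocks` constructor argument and `launch()` does not document a `theme` parameter, so the Soft theme may silently stop applying and might need to move back.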
packages.txt
ADDED
@@ -0,0 +1,2 @@
+ffmpeg
+libsndfile1
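On Hugging Face Spaces, `packages.txt` lists Debian packages that are apt-installed at build time: `ffmpeg` gives librosa's audioread fallback a decoder for compressed formats like mp3 and m4a, and `libsndfile1` is the native library behind the `soundfile` package. A small startup assertion along these lines (hypothetical, not in the commit) makes a missing system dependency fail loudly at boot instead of surfacing later as a cryptic load error:

```python
# Hypothetical startup check that both system packages are usable from Python.
import shutil

import soundfile as sf

assert shutil.which("ffmpeg") is not None, "ffmpeg missing; mp3/m4a fallback will fail"
assert "WAV" in sf.available_formats()  # libsndfile1 loaded and functional
```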
requirements.txt
CHANGED
@@ -8,3 +8,4 @@ gradio>=4.0.0
 openai-whisper>=20231117
 numpy>=1.24.0
 soundfile>=0.12.0
+librosa>=0.10.0
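Pinning `librosa>=0.10.0` matches the call style used in the diff: since 0.10, `librosa.load` takes `sr` and `mono` as keyword-only arguments, and passing `sr=16000` makes librosa resample during decoding, so the torchaudio resampler is skipped on the fallback path. A minimal exercise of that path (hypothetical; `speech.mp3` is a placeholder file, and it needs the ffmpeg package from `packages.txt`):

```python
# Hypothetical check of the librosa fallback on a compressed file.
import librosa

y, sr = librosa.load("speech.mp3", sr=16000, mono=True)  # decoded via audioread/ffmpeg
print(f"{len(y) / sr:.1f}s at {sr} Hz, dtype={y.dtype}")  # float32, mono
```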