Spaces:
Sleeping
Sleeping
Commit ·
475892a
1
Parent(s): 3f868f0
Implement automatic visual-to-audio fallback system
Browse files
- Keep visual analysis enabled by default (enable_visual=True)
- Automatically fall back to audio-only when visual analysis times out
- Provide reliable highlights generation even with resource constraints
- Reduce timeout to 60s per segment for faster fallback
- Maintain best user experience with graceful degradation
- audio_enhanced_highlights_final.py +16 -3
- highlights_api.py +8 -4
audio_enhanced_highlights_final.py
CHANGED
|
@@ -43,12 +43,19 @@ logger = logging.getLogger(__name__)
|
|
| 43 |
class AudioVisualAnalyzer:
|
| 44 |
"""Comprehensive analyzer combining visual and audio analysis"""
|
| 45 |
|
| 46 |
-
def __init__(self, whisper_model_size="base", timeout_seconds=30):
|
| 47 |
"""Initialize with SmolVLM2 and Whisper models"""
|
| 48 |
print("🔧 Initializing Audio-Visual Analyzer...")
|
| 49 |
|
|
|
|
|
|
|
| 50 |
# Initialize SmolVLM2 for visual analysis
|
| 51 |
-
self.vlm_handler = SmolVLM2Handler()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
self.timeout_seconds = timeout_seconds
|
| 53 |
|
| 54 |
# Initialize Whisper for audio analysis
|
|
@@ -122,6 +129,11 @@ class AudioVisualAnalyzer:
|
|
| 122 |
|
| 123 |
def analyze_visual_content(self, frame_path: str) -> Dict:
|
| 124 |
"""Analyze visual content using SmolVLM2 with robust error handling"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
max_retries = 2
|
| 126 |
retry_count = 0
|
| 127 |
|
|
@@ -152,7 +164,8 @@ class AudioVisualAnalyzer:
|
|
| 152 |
logger.warning(f"⏰ Visual analysis timed out after {self.timeout_seconds}s (attempt {retry_count + 1})")
|
| 153 |
retry_count += 1
|
| 154 |
if retry_count >= max_retries:
|
| 155 |
-
|
|
|
|
| 156 |
continue
|
| 157 |
|
| 158 |
if exception_result[0]:
|
|
|
|
| 43 |
class AudioVisualAnalyzer:
|
| 44 |
"""Comprehensive analyzer combining visual and audio analysis"""
|
| 45 |
|
| 46 |
+
def __init__(self, whisper_model_size="base", timeout_seconds=30, enable_visual=True):
|
| 47 |
"""Initialize with SmolVLM2 and Whisper models"""
|
| 48 |
print("🔧 Initializing Audio-Visual Analyzer...")
|
| 49 |
|
| 50 |
+
self.enable_visual = enable_visual
|
| 51 |
+
|
| 52 |
# Initialize SmolVLM2 for visual analysis
|
| 53 |
+
if self.enable_visual:
|
| 54 |
+
print("🔥 Loading SmolVLM2...")
|
| 55 |
+
self.vlm_handler = SmolVLM2Handler()
|
| 56 |
+
else:
|
| 57 |
+
print("🔇 Visual analysis disabled - audio-only mode")
|
| 58 |
+
self.vlm_handler = None
|
| 59 |
self.timeout_seconds = timeout_seconds
|
| 60 |
|
| 61 |
# Initialize Whisper for audio analysis
|
|
|
|
| 129 |
|
| 130 |
def analyze_visual_content(self, frame_path: str) -> Dict:
|
| 131 |
"""Analyze visual content using SmolVLM2 with robust error handling"""
|
| 132 |
+
# If visual analysis is disabled, return audio-focused fallback
|
| 133 |
+
if not self.enable_visual or self.vlm_handler is None:
|
| 134 |
+
logger.info("📹 Visual analysis disabled, using audio-only mode")
|
| 135 |
+
return {"description": "Audio-only analysis mode - visual analysis disabled", "score": 7.0}
|
| 136 |
+
|
| 137 |
max_retries = 2
|
| 138 |
retry_count = 0
|
| 139 |
|
|
|
|
| 164 |
logger.warning(f"⏰ Visual analysis timed out after {self.timeout_seconds}s (attempt {retry_count + 1})")
|
| 165 |
retry_count += 1
|
| 166 |
if retry_count >= max_retries:
|
| 167 |
+
logger.info("🔇 Switching to audio-only mode due to visual timeout")
|
| 168 |
+
return {"description": "Visual analysis timed out - using audio-only mode", "score": 7.0}
|
| 169 |
continue
|
| 170 |
|
| 171 |
if exception_result[0]:
|
highlights_api.py
CHANGED
|
@@ -115,7 +115,8 @@ async def upload_video(
|
|
| 115 |
min_score: float = 3.0,
|
| 116 |
max_highlights: int = 3,
|
| 117 |
whisper_model: str = "base",
|
| 118 |
-
timeout: int =
|
|
|
|
| 119 |
):
|
| 120 |
"""
|
| 121 |
Upload a video and start processing highlights
|
|
@@ -160,7 +161,8 @@ async def upload_video(
|
|
| 160 |
min_score,
|
| 161 |
max_highlights,
|
| 162 |
whisper_model,
|
| 163 |
-
timeout
|
|
|
|
| 164 |
)
|
| 165 |
|
| 166 |
return AnalysisResponse(
|
|
@@ -224,7 +226,8 @@ async def process_video_highlights(
|
|
| 224 |
min_score: float,
|
| 225 |
max_highlights: int,
|
| 226 |
whisper_model: str,
|
| 227 |
-
timeout: int
|
|
|
|
| 228 |
):
|
| 229 |
"""
|
| 230 |
Background task to process video highlights
|
|
@@ -237,7 +240,8 @@ async def process_video_highlights(
|
|
| 237 |
# Initialize analyzer
|
| 238 |
analyzer = AudioVisualAnalyzer(
|
| 239 |
whisper_model_size=whisper_model,
|
| 240 |
-
timeout_seconds=timeout
|
|
|
|
| 241 |
)
|
| 242 |
|
| 243 |
active_jobs[job_id]["progress"] = 20
|
|
|
|
| 115 |
min_score: float = 3.0,
|
| 116 |
max_highlights: int = 3,
|
| 117 |
whisper_model: str = "base",
|
| 118 |
+
timeout: int = 60,
|
| 119 |
+
enable_visual: bool = True
|
| 120 |
):
|
| 121 |
"""
|
| 122 |
Upload a video and start processing highlights
|
|
|
|
| 161 |
min_score,
|
| 162 |
max_highlights,
|
| 163 |
whisper_model,
|
| 164 |
+
timeout,
|
| 165 |
+
enable_visual
|
| 166 |
)
|
| 167 |
|
| 168 |
return AnalysisResponse(
|
|
|
|
| 226 |
min_score: float,
|
| 227 |
max_highlights: int,
|
| 228 |
whisper_model: str,
|
| 229 |
+
timeout: int,
|
| 230 |
+
enable_visual: bool
|
| 231 |
):
|
| 232 |
"""
|
| 233 |
Background task to process video highlights
|
|
|
|
| 240 |
# Initialize analyzer
|
| 241 |
analyzer = AudioVisualAnalyzer(
|
| 242 |
whisper_model_size=whisper_model,
|
| 243 |
+
timeout_seconds=timeout,
|
| 244 |
+
enable_visual=enable_visual
|
| 245 |
)
|
| 246 |
|
| 247 |
active_jobs[job_id]["progress"] = 20
|