avinashHuggingface108 commited on
Commit
475892a
·
1 Parent(s): 3f868f0

Implement automatic visual-to-audio fallback system

Browse files

- Keep visual analysis enabled by default (enable_visual=True)
- Automatically fall back to audio-only when visual analysis times out
- Provide reliable highlight generation even under resource constraints
- Reduce timeout to 60s per segment for faster fallback
- Maintain the best possible user experience through graceful degradation

audio_enhanced_highlights_final.py CHANGED
@@ -43,12 +43,19 @@ logger = logging.getLogger(__name__)
43
  class AudioVisualAnalyzer:
44
  """Comprehensive analyzer combining visual and audio analysis"""
45
 
46
- def __init__(self, whisper_model_size="base", timeout_seconds=30):
47
  """Initialize with SmolVLM2 and Whisper models"""
48
  print("🔧 Initializing Audio-Visual Analyzer...")
49
 
 
 
50
  # Initialize SmolVLM2 for visual analysis
51
- self.vlm_handler = SmolVLM2Handler()
 
 
 
 
 
52
  self.timeout_seconds = timeout_seconds
53
 
54
  # Initialize Whisper for audio analysis
@@ -122,6 +129,11 @@ class AudioVisualAnalyzer:
122
 
123
  def analyze_visual_content(self, frame_path: str) -> Dict:
124
  """Analyze visual content using SmolVLM2 with robust error handling"""
 
 
 
 
 
125
  max_retries = 2
126
  retry_count = 0
127
 
@@ -152,7 +164,8 @@ class AudioVisualAnalyzer:
152
  logger.warning(f"⏰ Visual analysis timed out after {self.timeout_seconds}s (attempt {retry_count + 1})")
153
  retry_count += 1
154
  if retry_count >= max_retries:
155
- return {"description": "Analysis timed out after multiple attempts", "score": 6.0}
 
156
  continue
157
 
158
  if exception_result[0]:
 
43
  class AudioVisualAnalyzer:
44
  """Comprehensive analyzer combining visual and audio analysis"""
45
 
46
+ def __init__(self, whisper_model_size="base", timeout_seconds=30, enable_visual=True):
47
  """Initialize with SmolVLM2 and Whisper models"""
48
  print("🔧 Initializing Audio-Visual Analyzer...")
49
 
50
+ self.enable_visual = enable_visual
51
+
52
  # Initialize SmolVLM2 for visual analysis
53
+ if self.enable_visual:
54
+ print("🔥 Loading SmolVLM2...")
55
+ self.vlm_handler = SmolVLM2Handler()
56
+ else:
57
+ print("🔇 Visual analysis disabled - audio-only mode")
58
+ self.vlm_handler = None
59
  self.timeout_seconds = timeout_seconds
60
 
61
  # Initialize Whisper for audio analysis
 
129
 
130
  def analyze_visual_content(self, frame_path: str) -> Dict:
131
  """Analyze visual content using SmolVLM2 with robust error handling"""
132
+ # If visual analysis is disabled, return audio-focused fallback
133
+ if not self.enable_visual or self.vlm_handler is None:
134
+ logger.info("📹 Visual analysis disabled, using audio-only mode")
135
+ return {"description": "Audio-only analysis mode - visual analysis disabled", "score": 7.0}
136
+
137
  max_retries = 2
138
  retry_count = 0
139
 
 
164
  logger.warning(f"⏰ Visual analysis timed out after {self.timeout_seconds}s (attempt {retry_count + 1})")
165
  retry_count += 1
166
  if retry_count >= max_retries:
167
+ logger.info("🔇 Switching to audio-only mode due to visual timeout")
168
+ return {"description": "Visual analysis timed out - using audio-only mode", "score": 7.0}
169
  continue
170
 
171
  if exception_result[0]:
highlights_api.py CHANGED
@@ -115,7 +115,8 @@ async def upload_video(
115
  min_score: float = 3.0,
116
  max_highlights: int = 3,
117
  whisper_model: str = "base",
118
- timeout: int = 120
 
119
  ):
120
  """
121
  Upload a video and start processing highlights
@@ -160,7 +161,8 @@ async def upload_video(
160
  min_score,
161
  max_highlights,
162
  whisper_model,
163
- timeout
 
164
  )
165
 
166
  return AnalysisResponse(
@@ -224,7 +226,8 @@ async def process_video_highlights(
224
  min_score: float,
225
  max_highlights: int,
226
  whisper_model: str,
227
- timeout: int
 
228
  ):
229
  """
230
  Background task to process video highlights
@@ -237,7 +240,8 @@ async def process_video_highlights(
237
  # Initialize analyzer
238
  analyzer = AudioVisualAnalyzer(
239
  whisper_model_size=whisper_model,
240
- timeout_seconds=timeout
 
241
  )
242
 
243
  active_jobs[job_id]["progress"] = 20
 
115
  min_score: float = 3.0,
116
  max_highlights: int = 3,
117
  whisper_model: str = "base",
118
+ timeout: int = 60,
119
+ enable_visual: bool = True
120
  ):
121
  """
122
  Upload a video and start processing highlights
 
161
  min_score,
162
  max_highlights,
163
  whisper_model,
164
+ timeout,
165
+ enable_visual
166
  )
167
 
168
  return AnalysisResponse(
 
226
  min_score: float,
227
  max_highlights: int,
228
  whisper_model: str,
229
+ timeout: int,
230
+ enable_visual: bool
231
  ):
232
  """
233
  Background task to process video highlights
 
240
  # Initialize analyzer
241
  analyzer = AudioVisualAnalyzer(
242
  whisper_model_size=whisper_model,
243
+ timeout_seconds=timeout,
244
+ enable_visual=enable_visual
245
  )
246
 
247
  active_jobs[job_id]["progress"] = 20