Spaces:
Sleeping
Sleeping
Commit ·
475892a
1
Parent(s): 3f868f0
Implement automatic visual-to-audio fallback system
Browse files
- Keep visual analysis enabled by default (enable_visual=True)
- Automatically fall back to audio-only when visual analysis times out
- Provide reliable highlights generation even with resource constraints
- Reduce timeout to 60s per segment for faster fallback
- Maintain best user experience with graceful degradation
- audio_enhanced_highlights_final.py +16 -3
- highlights_api.py +8 -4
audio_enhanced_highlights_final.py
CHANGED
|
@@ -43,12 +43,19 @@ logger = logging.getLogger(__name__)
|
|
| 43 |
class AudioVisualAnalyzer:
|
| 44 |
"""Comprehensive analyzer combining visual and audio analysis"""
|
| 45 |
|
| 46 |
-
def __init__(self, whisper_model_size="base", timeout_seconds=30):
|
| 47 |
"""Initialize with SmolVLM2 and Whisper models"""
|
| 48 |
print("🔧 Initializing Audio-Visual Analyzer...")
|
| 49 |
|
|
|
|
|
|
|
| 50 |
# Initialize SmolVLM2 for visual analysis
|
| 51 |
-
self.vlm_handler = SmolVLM2Handler()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
self.timeout_seconds = timeout_seconds
|
| 53 |
|
| 54 |
# Initialize Whisper for audio analysis
|
|
@@ -122,6 +129,11 @@ class AudioVisualAnalyzer:
|
|
| 122 |
|
| 123 |
def analyze_visual_content(self, frame_path: str) -> Dict:
|
| 124 |
"""Analyze visual content using SmolVLM2 with robust error handling"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
max_retries = 2
|
| 126 |
retry_count = 0
|
| 127 |
|
|
@@ -152,7 +164,8 @@ class AudioVisualAnalyzer:
|
|
| 152 |
logger.warning(f"⏰ Visual analysis timed out after {self.timeout_seconds}s (attempt {retry_count + 1})")
|
| 153 |
retry_count += 1
|
| 154 |
if retry_count >= max_retries:
|
| 155 |
-
|
|
|
|
| 156 |
continue
|
| 157 |
|
| 158 |
if exception_result[0]:
|
|
|
|
| 43 |
class AudioVisualAnalyzer:
|
| 44 |
"""Comprehensive analyzer combining visual and audio analysis"""
|
| 45 |
|
| 46 |
+
def __init__(self, whisper_model_size="base", timeout_seconds=30, enable_visual=True):
|
| 47 |
"""Initialize with SmolVLM2 and Whisper models"""
|
| 48 |
print("🔧 Initializing Audio-Visual Analyzer...")
|
| 49 |
|
| 50 |
+
self.enable_visual = enable_visual
|
| 51 |
+
|
| 52 |
# Initialize SmolVLM2 for visual analysis
|
| 53 |
+
if self.enable_visual:
|
| 54 |
+
print("🔥 Loading SmolVLM2...")
|
| 55 |
+
self.vlm_handler = SmolVLM2Handler()
|
| 56 |
+
else:
|
| 57 |
+
print("🔇 Visual analysis disabled - audio-only mode")
|
| 58 |
+
self.vlm_handler = None
|
| 59 |
self.timeout_seconds = timeout_seconds
|
| 60 |
|
| 61 |
# Initialize Whisper for audio analysis
|
|
|
|
| 129 |
|
| 130 |
def analyze_visual_content(self, frame_path: str) -> Dict:
|
| 131 |
"""Analyze visual content using SmolVLM2 with robust error handling"""
|
| 132 |
+
# If visual analysis is disabled, return audio-focused fallback
|
| 133 |
+
if not self.enable_visual or self.vlm_handler is None:
|
| 134 |
+
logger.info("📹 Visual analysis disabled, using audio-only mode")
|
| 135 |
+
return {"description": "Audio-only analysis mode - visual analysis disabled", "score": 7.0}
|
| 136 |
+
|
| 137 |
max_retries = 2
|
| 138 |
retry_count = 0
|
| 139 |
|
|
|
|
| 164 |
logger.warning(f"⏰ Visual analysis timed out after {self.timeout_seconds}s (attempt {retry_count + 1})")
|
| 165 |
retry_count += 1
|
| 166 |
if retry_count >= max_retries:
|
| 167 |
+
logger.info("🔇 Switching to audio-only mode due to visual timeout")
|
| 168 |
+
return {"description": "Visual analysis timed out - using audio-only mode", "score": 7.0}
|
| 169 |
continue
|
| 170 |
|
| 171 |
if exception_result[0]:
|
highlights_api.py
CHANGED
|
@@ -115,7 +115,8 @@ async def upload_video(
|
|
| 115 |
min_score: float = 3.0,
|
| 116 |
max_highlights: int = 3,
|
| 117 |
whisper_model: str = "base",
|
| 118 |
-
timeout: int =
|
|
|
|
| 119 |
):
|
| 120 |
"""
|
| 121 |
Upload a video and start processing highlights
|
|
@@ -160,7 +161,8 @@ async def upload_video(
|
|
| 160 |
min_score,
|
| 161 |
max_highlights,
|
| 162 |
whisper_model,
|
| 163 |
-
timeout
|
|
|
|
| 164 |
)
|
| 165 |
|
| 166 |
return AnalysisResponse(
|
|
@@ -224,7 +226,8 @@ async def process_video_highlights(
|
|
| 224 |
min_score: float,
|
| 225 |
max_highlights: int,
|
| 226 |
whisper_model: str,
|
| 227 |
-
timeout: int
|
|
|
|
| 228 |
):
|
| 229 |
"""
|
| 230 |
Background task to process video highlights
|
|
@@ -237,7 +240,8 @@ async def process_video_highlights(
|
|
| 237 |
# Initialize analyzer
|
| 238 |
analyzer = AudioVisualAnalyzer(
|
| 239 |
whisper_model_size=whisper_model,
|
| 240 |
-
timeout_seconds=timeout
|
|
|
|
| 241 |
)
|
| 242 |
|
| 243 |
active_jobs[job_id]["progress"] = 20
|
|
|
|
| 115 |
min_score: float = 3.0,
|
| 116 |
max_highlights: int = 3,
|
| 117 |
whisper_model: str = "base",
|
| 118 |
+
timeout: int = 60,
|
| 119 |
+
enable_visual: bool = True
|
| 120 |
):
|
| 121 |
"""
|
| 122 |
Upload a video and start processing highlights
|
|
|
|
| 161 |
min_score,
|
| 162 |
max_highlights,
|
| 163 |
whisper_model,
|
| 164 |
+
timeout,
|
| 165 |
+
enable_visual
|
| 166 |
)
|
| 167 |
|
| 168 |
return AnalysisResponse(
|
|
|
|
| 226 |
min_score: float,
|
| 227 |
max_highlights: int,
|
| 228 |
whisper_model: str,
|
| 229 |
+
timeout: int,
|
| 230 |
+
enable_visual: bool
|
| 231 |
):
|
| 232 |
"""
|
| 233 |
Background task to process video highlights
|
|
|
|
| 240 |
# Initialize analyzer
|
| 241 |
analyzer = AudioVisualAnalyzer(
|
| 242 |
whisper_model_size=whisper_model,
|
| 243 |
+
timeout_seconds=timeout,
|
| 244 |
+
enable_visual=enable_visual
|
| 245 |
)
|
| 246 |
|
| 247 |
active_jobs[job_id]["progress"] = 20
|