Skip to content

Commit 49d2c43

Browse files
committed
Merge branch 'fix/vram-cleanup' into fix/backend-non-root
2 parents b0ff17e + 14556c3 commit 49d2c43

3 files changed

Lines changed: 42 additions & 0 deletions

File tree

backend/app/services/speaker_embedding_service.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,3 +218,25 @@ def extract_reference_embedding(self, audio_paths: list[str]) -> Optional[np.nda
218218
def get_embedding_dimension(self) -> int:
219219
"""Get the dimension of the embeddings produced by the model."""
220220
return 512 # Pyannote embedding dimension (updated for newer models)
221+
222+
def cleanup(self):
    """Release the PyAnnote embedding model and reclaim GPU memory.

    Call this once the service is no longer needed so VRAM is returned
    promptly — important when several models run back-to-back during
    transcription processing. Safe to call more than once: the model is
    only deleted if it is still attached to this instance.
    """
    import gc  # imported locally: only needed for this teardown path

    if hasattr(self, "inference"):
        logger.info("Cleaning up PyAnnote embedding model")
        del self.inference

    # Drop lingering Python references before asking CUDA for its cache back.
    gc.collect()

    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    logger.info("GPU memory cleaned up after embedding service")

backend/app/tasks/transcription/core.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,10 +321,24 @@ def whisperx_progress_callback(progress, message):
321321
f"Speaker identification completed: {len(speaker_results)} speakers processed"
322322
)
323323

324+
# CRITICAL: Clean up embedding service and matching service to free VRAM
325+
# PyAnnote embedding model uses ~500 MB and must be deleted before OpenSearch indexing
326+
embedding_service.cleanup()
327+
del embedding_service
328+
del matching_service
329+
324330
except Exception as e:
325331
logger.warning(f"Error in speaker identification: {e}")
326332
# Continue with transcription even if speaker matching fails
327333

334+
# Force GPU memory cleanup checkpoint before OpenSearch indexing
335+
# This ensures all models are fully cleared from VRAM
336+
from app.utils.hardware_detection import detect_hardware
337+
338+
hardware_config = detect_hardware()
339+
hardware_config.optimize_memory_usage()
340+
logger.info("GPU memory cleanup checkpoint completed")
341+
328342
with session_scope() as db:
329343
update_task_status(db, task_id, "in_progress", progress=0.85)
330344

backend/app/tasks/transcription/whisperx_service.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,12 @@ def perform_speaker_diarization(
245245
)
246246

247247
diarize_segments = diarize_model(audio, **diarize_params)
248+
249+
# CRITICAL: Clean up diarization model immediately to free VRAM
250+
# This model uses ~2-3 GB and must be deleted before speaker embedding extraction
251+
del diarize_model
252+
self.hardware_config.optimize_memory_usage()
253+
248254
return diarize_segments
249255

250256
except Exception as e:

0 commit comments

Comments
 (0)