From 223fa188f7c734610bd6cadf0e8b971fa2c45489 Mon Sep 17 00:00:00 2001 From: satishgsonwane Date: Mon, 10 Feb 2025 12:13:51 +0530 Subject: [PATCH 1/7] Added timing measurements for different stages of processing --- run.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/run.py b/run.py index a7d63cb..2cc96f2 100644 --- a/run.py +++ b/run.py @@ -15,6 +15,7 @@ import numpy as np import os import torch +import time from video_depth_anything.video_depth import VideoDepthAnything from utils.dc_utils import read_video_frames, save_video @@ -42,13 +43,26 @@ 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, } + start_time = time.time() + + # Model loading video_depth_anything = VideoDepthAnything(**model_configs[args.encoder]) video_depth_anything.load_state_dict(torch.load(f'./checkpoints/video_depth_anything_{args.encoder}.pth', map_location='cpu'), strict=True) video_depth_anything = video_depth_anything.to(DEVICE).eval() + model_load_time = time.time() - start_time + # Video reading + read_start = time.time() frames, target_fps = read_video_frames(args.input_video, args.max_len, args.target_fps, args.max_res) + read_time = time.time() - read_start + + # Depth inference + inference_start = time.time() depths, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=args.input_size, device=DEVICE, fp32=args.fp32) + inference_time = time.time() - inference_start + # Video saving + save_start = time.time() video_name = os.path.basename(args.input_video) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) @@ -57,7 +71,10 @@ depth_vis_path = os.path.join(args.output_dir, os.path.splitext(video_name)[0]+'_vis.mp4') save_video(frames, processed_video_path, fps=fps) save_video(depths, depth_vis_path, fps=fps, is_depths=True, grayscale=args.grayscale) + save_time = time.time() - save_start + # Optional saving of additional formats + extra_save_start = time.time() if 
args.save_npz: depth_npz_path = os.path.join(args.output_dir, os.path.splitext(video_name)[0]+'_depths.npz') np.savez_compressed(depth_npz_path, depths=depths) @@ -75,6 +92,25 @@ exr_file = OpenEXR.OutputFile(output_exr, header) exr_file.writePixels({"Z": depth.tobytes()}) exr_file.close() + extra_save_time = time.time() - extra_save_start + + total_time = time.time() - start_time + + # Print timing information + print("\nProcessing Time Breakdown:") + print(f"Model Loading: {model_load_time:.2f}s") + print(f"Video Reading: {read_time:.2f}s") + print(f"Depth Inference: {inference_time:.2f}s") + print(f"Video Saving: {save_time:.2f}s") + if args.save_npz or args.save_exr: + print(f"Additional Format Saving: {extra_save_time:.2f}s") + print(f"Total Time: {total_time:.2f}s") + + # Print per-frame statistics + num_frames = len(frames) + print(f"\nPer-frame Statistics:") + print(f"Number of Frames: {num_frames}") + print(f"Average Processing Time per Frame: {inference_time/num_frames:.3f}s ({(num_frames/inference_time):.1f} FPS)") From 926ccc2be9bd993e72bc973068a955cd839e6b18 Mon Sep 17 00:00:00 2001 From: Satish Sonwane <144023761+satishgsonwane@users.noreply.github.com> Date: Mon, 10 Feb 2025 12:25:22 +0530 Subject: [PATCH 2/7] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 131d886..05787e5 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,7 @@ We provide **two models** of varying scales for robust and consistent video dept | Video-Depth-Anything-V2-Large | 381.8M | [Download](https://huggingface.co/depth-anything/Video-Depth-Anything-Large/resolve/main/video_depth_anything_vitl.pth?download=true) | ## Usage +Requires Python>3.9 ### Preparation From 50b38e235101a0c16d0d36256e6062c3771ab2f7 Mon Sep 17 00:00:00 2001 From: satishgsonwane Date: Mon, 10 Feb 2025 13:59:26 +0530 Subject: [PATCH 3/7] Video files ignored from upload --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/.gitignore b/.gitignore
index 22e63fe..a8c0672 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,5 @@ checkpoints
 __pycache__
 */__pycache__
 *.pyc
-gradio*
\ No newline at end of file
+gradio*
+*.mp4

From 01f788a26195606ce17aee7e417005409cbe0572 Mon Sep 17 00:00:00 2001
From: satishgsonwane
Date: Thu, 13 Feb 2025 11:59:36 +0530
Subject: [PATCH 4/7] Added improved timing logic

---
 run1.py | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)
 create mode 100644 run1.py

diff --git a/run1.py b/run1.py
new file mode 100644
index 0000000..6998b69
--- /dev/null
+++ b/run1.py
@@ -0,0 +1,133 @@
+# Copyright (2025) Bytedance Ltd. and/or its affiliates
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse +import numpy as np +import os +import torch +import time + +from video_depth_anything.video_depth import VideoDepthAnything +from utils.dc_utils import read_video_frames, save_video + +class TimingStats: + def __init__(self): + self.start_time = time.time() + self.model_load_time = 0 + self.read_time = 0 + self.inference_time = 0 + self.save_time = 0 + self.extra_save_time = 0 + + def get_total_time(self): + return time.time() - self.start_time + + def print_stats(self, num_frames): + print("\nProcessing Time Breakdown:") + print(f"Model Loading: {self.model_load_time:.2f}s") + print(f"Video Reading: {self.read_time:.2f}s") + print(f"Depth Inference: {self.inference_time:.2f}s") + print(f"Video Saving: {self.save_time:.2f}s") + if self.extra_save_time > 0: + print(f"Additional Format Saving: {self.extra_save_time:.2f}s") + print(f"Total Time: {self.get_total_time():.2f}s") + + print(f"\nPer-frame Statistics:") + print(f"Number of Frames: {num_frames}") + print(f"Average Processing Time per Frame: {self.inference_time/num_frames:.3f}s ({(num_frames/self.inference_time):.1f} FPS)") + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Video Depth Anything') + parser.add_argument('--input_video', type=str, default='./assets/example_videos/davis_rollercoaster.mp4') + parser.add_argument('--output_dir', type=str, default='./outputs') + parser.add_argument('--input_size', type=int, default=518) + parser.add_argument('--max_res', type=int, default=1280) + parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitl']) + parser.add_argument('--max_len', type=int, default=-1, help='maximum length of the input video, -1 means no limit') + parser.add_argument('--target_fps', type=int, default=-1, help='target fps of the input video, -1 means the original fps') + parser.add_argument('--fp32', action='store_true', help='model infer with torch.float32, default is torch.float16') + parser.add_argument('--grayscale', 
action='store_true', help='do not apply colorful palette') + parser.add_argument('--save_npz', action='store_true', help='save depths as npz') + parser.add_argument('--save_exr', action='store_true', help='save depths as exr') + + args = parser.parse_args() + + DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + + model_configs = { + 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, + 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, + } + + timing = TimingStats() + + try: + # Model loading + model_start = time.time() + video_depth_anything = VideoDepthAnything(**model_configs[args.encoder]) + video_depth_anything.load_state_dict(torch.load(f'./checkpoints/video_depth_anything_{args.encoder}.pth', map_location='cpu'), strict=True) + video_depth_anything = video_depth_anything.to(DEVICE).eval() + timing.model_load_time = time.time() - model_start + + # Video reading + read_start = time.time() + frames, target_fps = read_video_frames(args.input_video, args.max_len, args.target_fps, args.max_res) + timing.read_time = time.time() - read_start + + # Depth inference + inference_start = time.time() + depths, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=args.input_size, device=DEVICE, fp32=args.fp32) + timing.inference_time = time.time() - inference_start + + # Video saving + save_start = time.time() + video_name = os.path.basename(args.input_video) + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + processed_video_path = os.path.join(args.output_dir, os.path.splitext(video_name)[0]+'_src.mp4') + depth_vis_path = os.path.join(args.output_dir, os.path.splitext(video_name)[0]+'_vis.mp4') + save_video(frames, processed_video_path, fps=fps) + save_video(depths, depth_vis_path, fps=fps, is_depths=True, grayscale=args.grayscale) + timing.save_time = time.time() - save_start + + # Optional saving of additional formats + if args.save_npz or 
args.save_exr: + extra_save_start = time.time() + + if args.save_npz: + depth_npz_path = os.path.join(args.output_dir, os.path.splitext(video_name)[0]+'_depths.npz') + np.savez_compressed(depth_npz_path, depths=depths) + + if args.save_exr: + depth_exr_dir = os.path.join(args.output_dir, os.path.splitext(video_name)[0]+'_depths_exr') + os.makedirs(depth_exr_dir, exist_ok=True) + import OpenEXR + import Imath + for i, depth in enumerate(depths): + output_exr = f"{depth_exr_dir}/frame_{i:05d}.exr" + header = OpenEXR.Header(depth.shape[1], depth.shape[0]) + header["channels"] = { + "Z": Imath.Channel(Imath.PixelType(Imath.PixelType.FLOAT)) + } + exr_file = OpenEXR.OutputFile(output_exr, header) + exr_file.writePixels({"Z": depth.tobytes()}) + exr_file.close() + + timing.extra_save_time = time.time() - extra_save_start + + # Print timing statistics + timing.print_stats(len(frames)) + + except Exception as e: + raise RuntimeError(f"Error during video processing: {str(e)}") \ No newline at end of file From 3fd4fee1293dce4d18ec5fe398a6ff7c90eb341e Mon Sep 17 00:00:00 2001 From: satishgsonwane Date: Thu, 13 Feb 2025 12:23:36 +0530 Subject: [PATCH 5/7] GPU selection logic added --- run2.py | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 run2.py diff --git a/run2.py b/run2.py new file mode 100644 index 0000000..65f8a86 --- /dev/null +++ b/run2.py @@ -0,0 +1,141 @@ +# Copyright (2025) Bytedance Ltd. and/or its affiliates + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import numpy as np +import os +import torch +import time + +from video_depth_anything.video_depth import VideoDepthAnything +from utils.dc_utils import read_video_frames, save_video + +class TimingStats: + def __init__(self): + self.start_time = time.time() + self.model_load_time = 0 + self.read_time = 0 + self.inference_time = 0 + self.save_time = 0 + self.extra_save_time = 0 + + def get_total_time(self): + return time.time() - self.start_time + + def print_stats(self, num_frames): + print("\nProcessing Time Breakdown:") + print(f"Model Loading: {self.model_load_time:.2f}s") + print(f"Video Reading: {self.read_time:.2f}s") + print(f"Depth Inference: {self.inference_time:.2f}s") + print(f"Video Saving: {self.save_time:.2f}s") + if self.extra_save_time > 0: + print(f"Additional Format Saving: {self.extra_save_time:.2f}s") + print(f"Total Time: {self.get_total_time():.2f}s") + + print(f"\nPer-frame Statistics:") + print(f"Number of Frames: {num_frames}") + print(f"Average Processing Time per Frame: {self.inference_time/num_frames:.3f}s ({(num_frames/self.inference_time):.1f} FPS)") + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Video Depth Anything') + parser.add_argument('--input_video', type=str, default='./assets/example_videos/davis_rollercoaster.mp4') + parser.add_argument('--output_dir', type=str, default='./outputs') + parser.add_argument('--input_size', type=int, default=518) + parser.add_argument('--max_res', type=int, default=1280) + parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitl']) + parser.add_argument('--max_len', type=int, default=-1, help='maximum length of the input video, -1 means no limit') + parser.add_argument('--target_fps', type=int, default=-1, help='target fps of the input video, -1 means the original fps') + parser.add_argument('--fp32', 
action='store_true', help='model infer with torch.float32, default is torch.float16') + parser.add_argument('--grayscale', action='store_true', help='do not apply colorful palette') + parser.add_argument('--save_npz', action='store_true', help='save depths as npz') + parser.add_argument('--save_exr', action='store_true', help='save depths as exr') + parser.add_argument('--gpu', type=int, default=0, help='GPU device number to use (default: 0)') + + args = parser.parse_args() + + if torch.cuda.is_available(): + if args.gpu >= torch.cuda.device_count(): + raise ValueError(f"GPU device {args.gpu} not found. Available devices: 0 to {torch.cuda.device_count()-1}") + DEVICE = f'cuda:{args.gpu}' + print(f"Using GPU device {args.gpu}: {torch.cuda.get_device_name(args.gpu)}") + else: + DEVICE = 'cpu' + print("CUDA not available, using CPU") + + model_configs = { + 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, + 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, + } + + timing = TimingStats() + + try: + # Model loading + model_start = time.time() + video_depth_anything = VideoDepthAnything(**model_configs[args.encoder]) + video_depth_anything.load_state_dict(torch.load(f'./checkpoints/video_depth_anything_{args.encoder}.pth', map_location='cpu'), strict=True) + video_depth_anything = video_depth_anything.to(DEVICE).eval() + timing.model_load_time = time.time() - model_start + + # Video reading + read_start = time.time() + frames, target_fps = read_video_frames(args.input_video, args.max_len, args.target_fps, args.max_res) + timing.read_time = time.time() - read_start + + # Depth inference + inference_start = time.time() + depths, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=args.input_size, device=DEVICE, fp32=args.fp32) + timing.inference_time = time.time() - inference_start + + # Video saving + save_start = time.time() + video_name = os.path.basename(args.input_video) + if 
not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + processed_video_path = os.path.join(args.output_dir, os.path.splitext(video_name)[0]+'_src.mp4') + depth_vis_path = os.path.join(args.output_dir, os.path.splitext(video_name)[0]+'_vis.mp4') + save_video(frames, processed_video_path, fps=fps) + save_video(depths, depth_vis_path, fps=fps, is_depths=True, grayscale=args.grayscale) + timing.save_time = time.time() - save_start + + # Optional saving of additional formats + if args.save_npz or args.save_exr: + extra_save_start = time.time() + + if args.save_npz: + depth_npz_path = os.path.join(args.output_dir, os.path.splitext(video_name)[0]+'_depths.npz') + np.savez_compressed(depth_npz_path, depths=depths) + + if args.save_exr: + depth_exr_dir = os.path.join(args.output_dir, os.path.splitext(video_name)[0]+'_depths_exr') + os.makedirs(depth_exr_dir, exist_ok=True) + import OpenEXR + import Imath + for i, depth in enumerate(depths): + output_exr = f"{depth_exr_dir}/frame_{i:05d}.exr" + header = OpenEXR.Header(depth.shape[1], depth.shape[0]) + header["channels"] = { + "Z": Imath.Channel(Imath.PixelType(Imath.PixelType.FLOAT)) + } + exr_file = OpenEXR.OutputFile(output_exr, header) + exr_file.writePixels({"Z": depth.tobytes()}) + exr_file.close() + + timing.extra_save_time = time.time() - extra_save_start + + # Print timing statistics + timing.print_stats(len(frames)) + + except Exception as e: + raise RuntimeError(f"Error during video processing: {str(e)}") \ No newline at end of file From a3e598d2ffacd25ab58c54de2629fd00b691e6a6 Mon Sep 17 00:00:00 2001 From: satishgsonwane Date: Thu, 13 Feb 2025 12:34:32 +0530 Subject: [PATCH 6/7] More metrics to measure time --- run3.py | 195 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 195 insertions(+) create mode 100644 run3.py diff --git a/run3.py b/run3.py new file mode 100644 index 0000000..5038d58 --- /dev/null +++ b/run3.py @@ -0,0 +1,195 @@ +# Copyright (2025) Bytedance Ltd. 
and/or its affiliates + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import numpy as np +import os +import torch +import time + +from video_depth_anything.video_depth import VideoDepthAnything +from utils.dc_utils import read_video_frames, save_video + +class TimingStats: + def __init__(self): + self.start_time = time.time() + # Core processing times + self.model_load_time = 0 + self.read_time = 0 + self.inference_time = 0 + self.save_time = 0 + self.extra_save_time = 0 + + # Detailed model timing + self.model_init_time = 0 + self.weight_load_time = 0 + self.model_to_device_time = 0 + + # Memory stats + self.peak_gpu_memory = 0 if torch.cuda.is_available() else None + self.peak_cpu_memory = 0 + + # I/O stats + self.file_io_time = 0 + self.video_decode_time = 0 + + def get_total_time(self): + return time.time() - self.start_time + + def update_memory_stats(self): + import psutil + self.peak_cpu_memory = psutil.Process().memory_info().rss / 1024 / 1024 # MB + + def print_stats(self, num_frames): + self.update_memory_stats() + + print("\nDetailed Processing Time Breakdown:") + print("\nModel Setup Times:") + print(f"├─ Model Initialization: {self.model_init_time:.2f}s") + print(f"├─ Weight Loading: {self.weight_load_time:.2f}s") + print(f"└─ Device Transfer: {self.model_to_device_time:.2f}s") + + print("\nVideo Processing Times:") + print(f"├─ Total Read Time: {self.read_time:.2f}s") + print(f"│ ├─ Video Decode: {self.video_decode_time:.2f}s") + print(f"│ 
└─ File I/O: {self.file_io_time:.2f}s") + print(f"├─ Total Inference: {self.inference_time:.2f}s") + print(f"├─ Video Saving: {self.save_time:.2f}s") + if self.extra_save_time > 0: + print(f"└─ Additional Format Saving: {self.extra_save_time:.2f}s") + + print(f"\nTotal Time: {self.get_total_time():.2f}s") + + print("\nMemory Usage:") + print(f"├─ Peak CPU Memory: {self.peak_cpu_memory:.1f}MB") + if self.peak_gpu_memory is not None: + print(f"└─ Peak GPU Memory: {self.peak_gpu_memory:.1f}MB") + + print(f"\nPer-frame Performance:") + print(f"├─ Total Frames: {num_frames}") + print(f"├─ Average Processing Time: {self.inference_time/num_frames:.3f}s/frame") + print(f"└─ Effective FPS: {(num_frames/self.inference_time):.1f}") + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Video Depth Anything') + parser.add_argument('--input_video', type=str, default='./assets/example_videos/davis_rollercoaster.mp4') + parser.add_argument('--output_dir', type=str, default='./outputs') + parser.add_argument('--input_size', type=int, default=518) + parser.add_argument('--max_res', type=int, default=1280) + parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitl']) + parser.add_argument('--max_len', type=int, default=-1, help='maximum length of the input video, -1 means no limit') + parser.add_argument('--target_fps', type=int, default=-1, help='target fps of the input video, -1 means the original fps') + parser.add_argument('--fp32', action='store_true', help='model infer with torch.float32, default is torch.float16') + parser.add_argument('--grayscale', action='store_true', help='do not apply colorful palette') + parser.add_argument('--save_npz', action='store_true', help='save depths as npz') + parser.add_argument('--save_exr', action='store_true', help='save depths as exr') + + args = parser.parse_args() + + DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + + model_configs = { + 'vits': {'encoder': 'vits', 'features': 64, 
'out_channels': [48, 96, 192, 384]}, + 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, + } + + timing = TimingStats() + + try: + # Detailed model loading + model_init_start = time.time() + video_depth_anything = VideoDepthAnything(**model_configs[args.encoder]) + timing.model_init_time = time.time() - model_init_start + + weight_load_start = time.time() + video_depth_anything.load_state_dict(torch.load(f'./checkpoints/video_depth_anything_{args.encoder}.pth', map_location='cpu'), strict=True) + timing.weight_load_time = time.time() - weight_load_start + + device_transfer_start = time.time() + video_depth_anything = video_depth_anything.to(DEVICE).eval() + timing.model_to_device_time = time.time() - device_transfer_start + + timing.model_load_time = timing.model_init_time + timing.weight_load_time + timing.model_to_device_time + + # Detailed video reading + read_start = time.time() + io_start = time.time() + with open(args.input_video, 'rb') as f: + video_data = f.read() + timing.file_io_time = time.time() - io_start + + decode_start = time.time() + frames, target_fps = read_video_frames(args.input_video, args.max_len, args.target_fps, args.max_res) + timing.video_decode_time = time.time() - decode_start + timing.read_time = time.time() - read_start + + # Detailed depth inference + inference_start = time.time() + + # Track CUDA memory before inference + if torch.cuda.is_available(): + torch.cuda.reset_peak_memory_stats() + + depths, fps = video_depth_anything.infer_video_depth(frames, target_fps, + input_size=args.input_size, + device=DEVICE, + fp32=args.fp32) + + # Update memory stats after inference + if torch.cuda.is_available(): + timing.peak_gpu_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 # MB + + timing.inference_time = time.time() - inference_start + + # Video saving + save_start = time.time() + video_name = os.path.basename(args.input_video) + if not os.path.exists(args.output_dir): + 
os.makedirs(args.output_dir) + + processed_video_path = os.path.join(args.output_dir, os.path.splitext(video_name)[0]+'_src.mp4') + depth_vis_path = os.path.join(args.output_dir, os.path.splitext(video_name)[0]+'_vis.mp4') + save_video(frames, processed_video_path, fps=fps) + save_video(depths, depth_vis_path, fps=fps, is_depths=True, grayscale=args.grayscale) + timing.save_time = time.time() - save_start + + # Optional saving of additional formats + if args.save_npz or args.save_exr: + extra_save_start = time.time() + + if args.save_npz: + depth_npz_path = os.path.join(args.output_dir, os.path.splitext(video_name)[0]+'_depths.npz') + np.savez_compressed(depth_npz_path, depths=depths) + + if args.save_exr: + depth_exr_dir = os.path.join(args.output_dir, os.path.splitext(video_name)[0]+'_depths_exr') + os.makedirs(depth_exr_dir, exist_ok=True) + import OpenEXR + import Imath + for i, depth in enumerate(depths): + output_exr = f"{depth_exr_dir}/frame_{i:05d}.exr" + header = OpenEXR.Header(depth.shape[1], depth.shape[0]) + header["channels"] = { + "Z": Imath.Channel(Imath.PixelType(Imath.PixelType.FLOAT)) + } + exr_file = OpenEXR.OutputFile(output_exr, header) + exr_file.writePixels({"Z": depth.tobytes()}) + exr_file.close() + + timing.extra_save_time = time.time() - extra_save_start + + # Print timing statistics + timing.print_stats(len(frames)) + + except Exception as e: + raise RuntimeError(f"Error during video processing: {str(e)}") \ No newline at end of file From 0748ac98ea3432c3a62f700c5dd593565f12fa7d Mon Sep 17 00:00:00 2001 From: satishgsonwane Date: Thu, 13 Feb 2025 12:36:22 +0530 Subject: [PATCH 7/7] GPU selection logic and detailed timing --- run3.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/run3.py b/run3.py index 5038d58..2e03f30 100644 --- a/run3.py +++ b/run3.py @@ -94,9 +94,18 @@ def print_stats(self, num_frames): parser.add_argument('--save_npz', action='store_true', help='save depths as npz') 
     parser.add_argument('--save_exr', action='store_true', help='save depths as exr')
+    parser.add_argument('--gpu_id', type=int, default=0, help='GPU ID to use (default: 0)')
 
     args = parser.parse_args()
 
-    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+    if torch.cuda.is_available():
+        if args.gpu_id >= torch.cuda.device_count():
+            raise ValueError(f"GPU ID {args.gpu_id} is not available. Available GPUs: {torch.cuda.device_count()}")
+        torch.cuda.set_device(args.gpu_id)
+        DEVICE = f'cuda:{args.gpu_id}'
+        print(f"Using GPU {args.gpu_id}: {torch.cuda.get_device_name(args.gpu_id)}")
+    else:
+        DEVICE = 'cpu'
+        print("CUDA is not available. Using CPU.")
 
     model_configs = {
         'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},