diff --git a/depth_thermal_generation/depth_ddp_glpn.py b/depth_thermal_generation/depth_ddp_glpn.py
new file mode 100644
index 0000000..af4752b
--- /dev/null
+++ b/depth_thermal_generation/depth_ddp_glpn.py
@@ -0,0 +1,545 @@
+import datetime
+import io
+import json
+import os
+import os.path as osp
+import time
+from argparse import ArgumentParser
+from random import randint
+
+import cv2
+import numpy as np
+import pandas as pd
+import requests
+import torch
+import torch.distributed as dist
+from decord import VideoReader, cpu
+from PIL import Image
+from torch.utils.data import Dataset, DataLoader
+from transformers import DPTImageProcessor, DPTForDepthEstimation
+from transformers import GLPNFeatureExtractor, GLPNForDepthEstimation
+
+os.environ["HF_DATASETS_OFFLINE"] = "1"
+
+try:
+    from petrel_client.client import Client
+    petrel_backend_imported = True
+except (ImportError, ModuleNotFoundError):
+    petrel_backend_imported = False
+
+
+def get_video_loader(use_petrel_backend: bool = True,
+                     enable_mc: bool = True,
+                     conf_path: str = None):
+    if petrel_backend_imported and use_petrel_backend:
+        _client = Client(conf_path=conf_path, enable_mc=enable_mc)
+    else:
+        _client = None
+
+    def _loader(video_path):
+        if _client is not None and 's3:' in video_path:
+            video_path = io.BytesIO(_client.get(video_path))
+        vr = VideoReader(video_path, num_threads=1, ctx=cpu(0))
+        return vr
+
+    return _loader
+
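+
+# (Added sketch) Minimal usage of the loader above, assuming a local .mp4; the
+# petrel s3 branch is only taken when the path contains 's3:':
+#   loader = get_video_loader(use_petrel_backend=False)
+#   vr = loader('example.mp4')
+#   frame0 = vr[0].asnumpy()  # (H, W, 3) uint8, RGB
+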
+
+class my_dataset(Dataset):
+    def __init__(self, args):
+        super().__init__()
+        self.args = args
+        self.shuffle = True
+        self.resolution = args.resolution  # unused for variable-size videos
+        self.loader = get_video_loader()
+
+        if args.train_file.endswith('.csv'):
+            self.train_file = pd.read_csv(args.train_file)
+        elif args.train_file.endswith('.json'):
+            # e.g. coco_vat_vat0_11_all_id_rootfolder_clsidx_spacy.json
+            # format: id : { 'idx_list' : [0], 'root_folder' : 'coco_vat_9' }
+            if hasattr(args, 'part_nums') and args.part_nums > 1:
+                self.part_nums = args.part_nums
+            else:
+                self.part_nums = 100000
+            self.part_index = args.part_index
+            t1 = time.time()
+            with open(args.train_file, 'r', encoding='utf-8') as f:
+                self.train_file = json.load(f)
+            if type(self.train_file) is str:
+                self.train_file = json.loads(self.train_file)
+
+            self.id_list = list(self.train_file.keys())
+            # take the subset of self.id_list that belongs to this part
+            self.id_list = self.id_list[self.part_nums * (self.part_index - 1):self.part_nums * self.part_index]
+
+            print(f'Nums of train_file is {len(self.id_list)}, part_index:{self.part_index}, first:{self.id_list[0]}')
+            self.no_caption_id_list = []
+            for idx, id in enumerate(self.id_list):
+                caption_json = osp.join('/apdcephfs_cq3/share_1311970/A_Youtube', self.train_file[id]['root_folder'], f'{id}_depth_f8glpn_folder')
+                mp4_path = osp.join('/apdcephfs_cq3/share_1311970/A_Youtube', self.train_file[id]['root_folder'], f'{id}.mp4')
+                if not os.path.exists(caption_json) and os.path.exists(mp4_path):
+                    self.no_caption_id_list.append(mp4_path)
+                if idx % 10000 == 0:
+                    print(f'Time_cost:{time.time()-t1}s, idx:{idx}, caption_json:{caption_json}')
+            try:
+                print(f'Nums of no_depth_folder_id_list is {len(self.no_caption_id_list)}, first:{self.no_caption_id_list[0]}')
+            except IndexError:
+                print(f'Nums of no_depth_folder_id_list is {len(self.no_caption_id_list)}')
+            t2 = time.time()
+            print(f'Time cost:{t2-t1}s')
+        # DPT
+        # self.patch_resize_transform = DPTImageProcessor.from_pretrained("Intel/dpt-large", cache_dir=args.weights_folder)
+        # glpn
+        self.patch_resize_transform = GLPNFeatureExtractor.from_pretrained("vinvino02/glpn-nyu", cache_dir=args.weights_folder)
+        print('Dataset nums is {}'.format(self.__len__()))
+        time.sleep(10)
+
+    def __len__(self):
+        return len(self.no_caption_id_list)
+
+    def random_sample(self):
+        return self.__getitem__(randint(0, self.__len__() - 1))
+
+    def sequential_sample(self, idx):
+        if idx >= self.__len__() - 1:
+            return self.__getitem__(0)
+        return self.__getitem__(idx + 1)
+
+    def skip_sample(self, idx):
+        if self.shuffle:
+            return self.random_sample()
+        return self.sequential_sample(idx=idx)
+
+    def resize_frame(self, frame):
+        height, width = frame.shape[:2]
+        if height < width:
+            new_height = 256
+            new_width = int(width * (new_height / height))
+        else:
+            new_width = 256
+            new_height = int(height * (new_width / width))
+        # resized_frame = cv2.resize(frame, (new_width, new_height))
+        resized_frame = cv2.resize(frame, (448, 796))  # fixed model input, (w=448, h=796)
+        return resized_frame, new_width, new_height  # note: the resized frame's shape does NOT match new_width/new_height!
+
+    def get_frames_from_video_decord(self, batchsize=1, video_path=None, caption_nums_per_video=8):
+        # load the video
+        vr = self.loader(video_path)
+        frame_width, frame_height = vr[0].shape[1], vr[0].shape[0]
+
+        # number of frames to extract
+        num_frames = caption_nums_per_video
+        # extract one frame every `step` frames
+        total_frames = len(vr)
+        step = total_frames // num_frames
+
+        # tensors of the extracted frames
+        frames = []
+        height = width = 0
+
+        # read the chosen frames directly
+        for i in range(num_frames):
+            # index of the frame to extract
+            idx = i * step
+
+            # read this frame
+            frame = vr[idx].asnumpy()
+            frame, new_width, new_height = self.resize_frame(frame)
+
+            if i == 0:
+                height, width = new_height, new_width
+
+            # wrap as a PIL Image and apply the model transform
+            # (decord already returns RGB, so no BGR->RGB conversion is needed)
+            Image_frame = Image.fromarray(frame)
+            frame = self.patch_resize_transform(images=Image_frame, return_tensors="pt").pixel_values.unsqueeze(0)
+
+            # collect the tensor
+            frames.append(frame)
+
+        frames = torch.cat(frames, 1).squeeze(0)
+        return frames, height, width
+
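+    # (Added sketch, not wired in) decord can also fetch all sampled frames in a
+    # single call, replacing the per-frame loop above:
+    #   indices = np.linspace(0, len(vr) - 1, num_frames).astype(int)
+    #   batch = vr.get_batch(indices).asnumpy()  # (num_frames, H, W, 3), RGB
+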
+    def get_frames_from_video_opencv(self, batchsize=1, video_path=None, caption_nums_per_video=8):
+        # load the video
+        cap = cv2.VideoCapture(video_path)
+        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+        # number of frames to extract
+        num_frames = caption_nums_per_video
+        # extract one frame every `step` frames
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        step = total_frames // num_frames
+
+        # tensors of the extracted frames
+        frames = []
+        height = width = 0
+
+        # read the chosen frames directly
+        for i in range(num_frames):
+            # index of the frame to extract
+            idx = i * step
+            # seek to that frame
+            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
+            # read it (check ret before using the frame)
+            ret, frame = cap.read()
+            if not ret:
+                break
+            # the frame itself is resized to a fixed (w=448, h=796), while new_width/new_height
+            # keep the source aspect ratio with the short side scaled to 256; the latter are
+            # used later to resize the predicted depth, which preserves object proportions
+            # and reduces memory use
+            frame, new_width, new_height = self.resize_frame(frame)
+            if i == 0:
+                height, width = new_height, new_width
+
+            # convert BGR (OpenCV) to RGB, wrap as a PIL Image, and apply the model transform
+            Image_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+            frame = self.patch_resize_transform(images=Image_frame, return_tensors="pt").pixel_values.unsqueeze(0)
+
+            # collect the tensor
+            frames.append(frame)
+
+        cap.release()
+        frames = torch.cat(frames, 1).squeeze(0)
+        return frames, height, width
+
+    def __getitem__(self, idx):
+        try:
+            video_path = self.no_caption_id_list[idx]
+            video_id = video_path.split('/')[-1].split('.')[0]
+            # if several jobs run in parallel and another one has already produced
+            # the output folder for this video, skip it
+            caption_video_json = video_path.replace('.mp4', '_depth_f8glpn_folder')
+            if os.path.exists(caption_video_json):
+                print('a parallel task has already processed it: {}'.format(caption_video_json))
+                return self.skip_sample(idx)
+            if not osp.exists(video_path):
+                print('video {} does not exist, skipping this idx!'.format(video_path))
+                return self.skip_sample(idx)
+            video_frames, height, width = self.get_frames_from_video_opencv(video_path=video_path, caption_nums_per_video=self.args.caption_nums_per_video)
+            # video_frames, height, width = self.get_frames_from_video_decord(video_path=video_path, caption_nums_per_video=self.args.caption_nums_per_video)
+            return video_id, video_path, video_frames, height, width
+
+        except Exception as e:
+            print('Read video error at {}: {}; skipping this sample (this will not raise).'.format(idx, e))
+            return self.skip_sample(idx)
+
+
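+# Shape contract (added note): my_dataset.__getitem__ returns video_frames of
+# shape (caption_nums_per_video, 3, H', W') after the GLPN feature extractor,
+# plus height/width, the short-side-256 dimensions used below to resize the
+# predicted depth back towards the source aspect ratio.
+
+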
+def synchronize():
+    if not dist.is_available():
+        return
+    if not dist.is_initialized():
+        return
+    world_size = dist.get_world_size()
+    if world_size == 1:
+        return
+    dist.barrier()
+
+
+def depth_estimation(args):
+
+    ######################################## model start #############################
+    """https://huggingface.co/docs/transformers/main/en/model_doc/dpt"""
+
+    weights_folder = args.weights_folder
+    print(f'args.weights_folder is {args.weights_folder}')
+
+    model_name = 'glpn'
+    if model_name == 'glpn':
+        # glpn
+        feature_extractor = GLPNFeatureExtractor.from_pretrained("vinvino02/glpn-nyu", cache_dir=weights_folder)
+        model = GLPNForDepthEstimation.from_pretrained("vinvino02/glpn-nyu", cache_dir=weights_folder).cuda(args.local_rank)
+    else:
+        # DPT
+        processor = DPTImageProcessor.from_pretrained("Intel/dpt-large", cache_dir=weights_folder)
+        model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large", cache_dir=weights_folder).cuda(args.local_rank)
+
+    if args.rank == 0:
+        print('model initialization complete')
+    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
+                                                      output_device=args.local_rank)
+    model.eval()
+    if args.rank == 0:
+        print('DDP model ready')
+    ######################################## model over #############################
+
+    ######################################## dataset start #############################
+    if args.rank == 0:
+        print('initializing dataset')
+
+    train_dataset = my_dataset(args)
+    if args.rank == 0:
+        print('dataset_len: ', train_dataset.__len__())
+        print('loading dataset is complete!')
+    train_sampler = torch.utils.data.distributed.DistributedSampler(
+        train_dataset,
+        num_replicas=args.world_size,
+        rank=args.rank
+    )
+    if args.rank == 0:
+        print('synchronizing')
+    synchronize()
+    if args.rank == 0:
+        print('initializing dataloader')
+    dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
+                                             batch_size=args.batch_size,
+                                             shuffle=False,
+                                             num_workers=args.num_workers,
+                                             pin_memory=True,
+                                             sampler=train_sampler,
+                                             drop_last=True
+                                             )
+    ######################################## dataset over #############################
+    ######################################## depth_estimation start #############################
+
+    for index, (video_ids, video_paths, videos_frames_, h_list, w_list) in enumerate(dataloader):
+
+        bs, cap_nums, c, h, w = videos_frames_.shape
+        videos_frames = videos_frames_.view(-1, c, h, w).cuda(args.local_rank)
+        torch.cuda.empty_cache()
+        try:
+            with torch.no_grad():
+                outputs = model(videos_frames)
+                predicted_depth = outputs.predicted_depth  # (bs*cap_nums, h, w)
+
+            predicted_depth = predicted_depth.view(bs, cap_nums, h, w)
+            # interpolate each depth map back towards the original frame size
+            for bs_idx, sample in enumerate(predicted_depth):
+                pic_folder = video_paths[bs_idx].replace('.mp4', '_depth_f8glpn_folder')
+                os.makedirs(pic_folder, exist_ok=True)
+                for frame_idx, frame in enumerate(predicted_depth[bs_idx]):
+                    prediction = torch.nn.functional.interpolate(
+                        frame.unsqueeze(0).unsqueeze(0),  # torch.Size([1, 1, h, w])
+                        size=(h_list[bs_idx], w_list[bs_idx]),
+                        mode="bicubic",
+                        align_corners=False,
+                    )  # torch.Size([1, 1, new_h, new_w])
+                    output = prediction.squeeze().cpu().numpy()
+                    # clip to max_depth metres and store as uint16 millimetres
+                    max_depth = 10
+                    if np.any(output > max_depth):
+                        print(f"{pic_folder} > {max_depth}")
+                    output_1k = np.clip(output, 0, max_depth) * 1000
+                    cv2.imwrite(f"{pic_folder}/{frame_idx}.png", output_1k.astype("uint16"), [cv2.IMWRITE_PNG_COMPRESSION, 0])
+                print(f'{pic_folder} succeeded!')
+            del videos_frames, outputs, predicted_depth
+        except Exception as e:
+            print(f'Error: {e}!')
+            del videos_frames
+    ######################################## depth_estimation over #############################
+
+
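+# A minimal read-back helper (added; not part of the pipeline) that inverts the
+# uint16-millimetre PNG encoding written in depth_estimation above: depth was
+# clipped to [0, 10] m and scaled by 1000, so dividing by 1000 recovers metres.
+def load_depth_png(path):
+    depth_mm = cv2.imread(path, cv2.IMREAD_UNCHANGED)  # uint16, millimetres
+    return depth_mm.astype(np.float32) / 1000.0  # metres, saturating at 10.0
+
+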
+def init_distributed_mode(args):
+
+    args.rank = int(os.environ["RANK"])
+    args.world_size = int(os.environ['WORLD_SIZE'])
+    args.local_rank = int(os.environ['LOCAL_RANK'])
+    torch.cuda.set_device(args.local_rank)
+
+    args.dist_backend = 'nccl'
+    args.dist_url = 'env://'
+
+    torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
+                                         world_size=args.world_size, rank=args.rank,
+                                         timeout=datetime.timedelta(seconds=5400))
+    torch.distributed.barrier()
+
+
+import utils.misc as misc
+
+
+def main(args):
+    misc.init_distributed_mode(args)
+    if args.rank == 0:
+        print('process group initialized')
+        print("started")
+    t1 = time.time()
+    depth_estimation(args)
+    t2 = time.time()
+    if args.rank == 0:
+        print('Time: ', t2 - t1, ' s')
+    dist.destroy_process_group()  # tear down the process group
+
+
+def test_dataset(args):
+    train_dataset = my_dataset(args)
+    loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
+    for i, sample in enumerate(loader):
+        video_ids, video_paths, videos_frames, h, w = sample
+        print(i, video_ids, video_paths, videos_frames.shape, h, w)
+
+
+def glpn():
+    # single-image smoke test for the GLPN depth model
+    max_depth = 10
+
+    # url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    # image = Image.open(requests.get(url, stream=True).raw)
+    image = Image.open('hallo.png')
+    weights_folder = './glpn'
+    feature_extractor = GLPNFeatureExtractor.from_pretrained("vinvino02/glpn-nyu", cache_dir=weights_folder)
+    model = GLPNForDepthEstimation.from_pretrained("vinvino02/glpn-nyu", cache_dir=weights_folder)
+
+    # prepare the image for the model
+    inputs = feature_extractor(images=image, return_tensors="pt")
+
+    with torch.no_grad():
+        outputs = model(**inputs)
+        predicted_depth = outputs.predicted_depth
+
+    # interpolate to the original size
+    prediction = torch.nn.functional.interpolate(
+        predicted_depth.unsqueeze(1),
+        size=image.size[::-1],
+        mode="bicubic",
+        align_corners=False,
+    )
+
+    # store as uint16 millimetres; PNG only, since OpenCV's JPEG encoder
+    # cannot hold 16-bit data
+    output = prediction.squeeze().cpu().numpy()
+    output_1k = np.clip(output, 0, max_depth) * 1000
+    cv2.imwrite('./saved_10000png.png', output_1k.astype("uint16"), [cv2.IMWRITE_PNG_COMPRESSION, 0])
+
+
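+# (Added note) Newer PyTorch releases replace torch.distributed.launch with
+# torchrun; both populate the RANK / WORLD_SIZE / LOCAL_RANK environment
+# variables that init_distributed_mode reads, e.g.:
+#   torchrun --nproc_per_node 1 --master_port 29504 depth_ddp_glpn.py --train_file ...
+
+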
+# glpn()
+if __name__ == "__main__":
+
+    # current month/day/hour, used by the (commented-out) exist_caption_id_list_json default
+    now = datetime.datetime.now()
+    month = now.month
+    day = now.day
+    hour = now.hour
+
+    parser = ArgumentParser()
+    parser.add_argument('--caption_nums_per_video', type=int, default=8, help='number of frames to sample per video')
+    parser.add_argument('--batch_size', type=int, default=2)
+    parser.add_argument('--train_file', type=str, required=True)
+    parser.add_argument('--vat_root', type=str, default=None)
+    parser.add_argument('--num_workers', type=int, default=1)
+    parser.add_argument('--resolution', type=int, default=480)
+    # parser.add_argument('--exist_caption_id_list_json', type=str, default=f'coco_vat_exist_caption_id_list_{month}{day}{hour}.json', help='')
+    parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes')
+    parser.add_argument('--local_rank', default=-1, type=int)
+    parser.add_argument('--dist_on_itp', action='store_true')
+    parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training')
+    parser.add_argument('--distributed', default=True, help='whether to run with DDP')
+    parser.add_argument('--gpus', default=[0, 1, 2, 3], help='DP CUDA devices')
+    parser.add_argument('--part_index', default=1, type=int, help='used to split train_file ids into parts; generation runs from part_index 1 upwards')
+    parser.add_argument('--part_nums', default=1000, type=int, help='number of train_file ids per part')
+    parser.add_argument('--weights_folder', type=str, default='/apdcephfs_cq3/share_1311970/A_ofa/glpn')
+    args = parser.parse_args()
+    # test_dataset(args)
+    main(args)
+    synchronize()
+    # mark this part as finished so an outside scheduler can tell
+    success_file = f"/apdcephfs_cq3/share_1311970/A_depth_glpn/part_{args.part_index}_success"
+    os.system(f"touch {success_file}")
+
+    """
+    HF_DATASETS_OFFLINE=1 python3 -m torch.distributed.launch --nproc_per_node 1 --master_port 29504 depth_ddp_glpn.py \
+        --train_file /apdcephfs_cq3/share_1311970/A_Youtube/coco_vat_vat0_11_all_id_rootfolder_clsidx_spacy.json \
+        --num_workers 1 --batch_size 2 \
+        --part_index 92 \
+        --part_nums 10000 \
+        --weights_folder /apdcephfs_cq3/share_1311970/A_ofa/glpn \
+        --resolution 0 \
+        --caption_nums_per_video 8
+    """