diff --git a/configs/custom_configs/my_custom_dataset_config.yaml b/configs/custom_configs/my_custom_dataset_config.yaml new file mode 100644 index 000000000..8ad61fa1e --- /dev/null +++ b/configs/custom_configs/my_custom_dataset_config.yaml @@ -0,0 +1,75 @@ +TRAIN: + ENABLE: True + BATCH_SIZE: 4 + EVAL_PERIOD: 5 + CHECKPOINT_PERIOD: 1 + AUTO_RESUME: False + CHECKPOINT_FILE_PATH: "" + +TEST: + ENABLE: True + BATCH_SIZE: 8 + +DATA: + PATH_PREFIX: "" + PATH_TO_DATA_DIR: "" + PATH_LABEL_SEPARATOR: "," + DECODING_BACKEND: "pyav" + NUM_FRAMES: 32 + SAMPLING_RATE: 2 + TRAIN_JITTER_SCALES: [256, 320] + TRAIN_CROP_SIZE: 224 + TEST_CROP_SIZE: 256 + INPUT_CHANNEL_NUM: [3, 3] + +MODEL: + NUM_CLASSES: 14 # <-- Change the number of classes here + ARCH: slowfast + MODEL_NAME: SlowFast + LOSS_FUNC: cross_entropy + DROPOUT_RATE: 0.5 + +SLOWFAST: + ALPHA: 4 + BETA_INV: 8 + FUSION_CONV_CHANNEL_RATIO: 2 + FUSION_KERNEL_SZ: 7 + +RESNET: + ZERO_INIT_FINAL_BN: True + WIDTH_PER_GROUP: 64 + NUM_GROUPS: 1 + DEPTH: 50 + TRANS_FUNC: bottleneck_transform + STRIDE_1X1: False + NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] + SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] + SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] + +NONLOCAL: + LOCATION: [[[], []], [[], []], [[], []], [[], []]] + GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] + INSTANTIATION: dot_product + +BN: + USE_PRECISE_STATS: True + NUM_BATCHES_PRECISE: 200 + +SOLVER: + BASE_LR: 0.0005 + LR_POLICY: cosine + MAX_EPOCH: 50 + MOMENTUM: 0.9 + WEIGHT_DECAY: 1e-4 + WARMUP_EPOCHS: 5.0 + WARMUP_START_LR: 0.0001 + OPTIMIZING_METHOD: sgd + +DATA_LOADER: + NUM_WORKERS: 2 + PIN_MEMORY: True + +NUM_GPUS: 1 +NUM_SHARDS: 1 +RNG_SEED: 0 +OUTPUT_DIR: "./checkpoints/isl_split_model" diff --git a/setup.py b/setup.py index 185c7d64f..33d3f8ff6 100644 --- a/setup.py +++ b/setup.py @@ -23,8 +23,8 @@ "opencv-python", "pandas", "torchvision>=0.4.2", - "PIL", - "sklearn", + "pillow", + "scikit-learn", "tensorboard", "fairscale", ], diff --git a/slowfast/datasets/__init__.py b/slowfast/datasets/__init__.py index 9e14e0a51..a9ed96565 100644 --- a/slowfast/datasets/__init__.py +++ b/slowfast/datasets/__init__.py @@ -7,6 +7,7 @@ from .imagenet import Imagenet # noqa from .kinetics import Kinetics # noqa from .ssv2 import Ssv2 # noqa +from .my_custom_dataset import Custom try: from .ptv_datasets import Ptvcharades, Ptvkinetics, Ptvssv2 # noqa diff --git a/slowfast/datasets/decoder.py b/slowfast/datasets/decoder.py index e531204b0..d34c9e996 100644 --- a/slowfast/datasets/decoder.py +++ b/slowfast/datasets/decoder.py @@ -269,7 +269,8 @@ def torchvision_decode( decode_all_video (bool): if True, the entire video was decoded. """ # Convert the bytes to a tensor. - video_tensor = torch.from_numpy(np.frombuffer(video_handle, dtype=np.uint8)) + # Converted the video_handle into numpy array + video_tensor = torch.from_numpy(np.frombuffer(np.array(video_handle), dtype=np.uint8)) decode_all_video = True video_start_pts, video_end_pts = 0, -1 @@ -418,6 +419,10 @@ def pyav_decode( else: # Perform selective decoding. decode_all_video = False + ## Changed the sampling rate code + sampling_rate = sampling_rate[0] if isinstance(sampling_rate, list) else sampling_rate + num_frames = num_frames[0] if isinstance(num_frames, list) else num_frames + clip_size = np.maximum( 1.0, np.ceil(sampling_rate * (num_frames - 1) / target_fps * fps) ) @@ -572,28 +577,20 @@ def decode( [None] * num_decode, ) augment_vid = gaussian_prob > 0.0 or time_diff_prob > 0.0 + ## Improved the decoding frames for k in range(num_decode): T = num_frames[k] - # Perform temporal sampling from the decoded video. if decode_all_video: - frames = frames_decoded[0] - if augment_vid: - frames = frames.clone() - start_idx, end_idx = ( - start_end_delta_time[k, 0], - start_end_delta_time[k, 1], - ) + frames = frames_decoded[0].clone() if augment_vid else frames_decoded[0] + start_idx, end_idx = start_end_delta_time[k, 0], start_end_delta_time[k, 1] else: frames = frames_decoded[k] - # video is already trimmed so we just need subsampling - start_idx, end_idx, clip_position = get_start_end_idx( - frames.shape[0], clip_sizes[k], 0, 1 - ) + start_idx, end_idx, _ = get_start_end_idx(frames.shape[0], clip_sizes[k], 0, 1) + if augment_vid: - frames, time_diff_aug[k] = transform.augment_raw_frames( - frames, time_diff_prob, gaussian_prob - ) + frames, time_diff_aug[k] = transform.augment_raw_frames(frames, time_diff_prob, gaussian_prob) + frames_k = temporal_sampling(frames, start_idx, end_idx, T) frames_out[k] = frames_k diff --git a/slowfast/datasets/kinetics.py b/slowfast/datasets/kinetics.py index c9be31b69..f3d26a86a 100644 --- a/slowfast/datasets/kinetics.py +++ b/slowfast/datasets/kinetics.py @@ -370,7 +370,11 @@ def __getitem__(self, index): for _ in range(num_aug): idx += 1 f_out[idx] = frames_decoded[i].clone() - time_idx_out[idx] = time_idx_decoded[i, :] + ## Adding time_index only if it's decoded + if time_idx_decoded is not None and isinstance(time_idx_decoded, np.ndarray): + time_idx_out[idx] = time_idx_decoded[i, :] + else: + time_idx_out[idx] = None f_out[idx] = f_out[idx].float() f_out[idx] = f_out[idx] / 255.0 @@ -450,22 +454,35 @@ def __getitem__(self, index): if self.cfg.AUG.GEN_MASK_LOADER: mask = self._gen_mask() f_out[idx] = f_out[idx] + [torch.Tensor(), mask] + # Final outputs + # Ensure everything is collate-compatible frames = f_out[0] if num_out == 1 else f_out - time_idx = np.array(time_idx_out) - if ( - num_aug * num_decode > 1 - and not self.cfg.MODEL.MODEL_NAME == "ContrastiveModel" - ): - label = [label] * num_aug * num_decode - index = [index] * num_aug * num_decode - if self.cfg.DATA.DUMMY_LOAD: - if self.dummy_output is None: - self.dummy_output = (frames, label, index, time_idx, {}) + time_idx = torch.stack([ + torch.tensor(t, dtype=torch.int32) if t is not None else torch.zeros_like(torch.tensor([0], dtype=torch.int32)) + for t in time_idx_out + ]) + + + # Fix label and index + if num_aug * num_decode > 1 and not self.cfg.MODEL.MODEL_NAME == "ContrastiveModel": + label = [label] * (num_aug * num_decode) + index = [index] * (num_aug * num_decode) + + # Convert everything to torch/numpy-compatible types + if isinstance(frames, list): + frames = [f.clone() for f in frames] + elif isinstance(frames, torch.Tensor): + frames = frames.clone() + + label = torch.tensor(label) if isinstance(label, list) else label + index = torch.tensor(index) if isinstance(index, list) else index + + # DO NOT convert time_idx again — already a tensor + # time_idx = torch.tensor(time_idx) + return frames, label, index, time_idx, {} - else: - logger.warning( - "Failed to fetch video after {} retries.".format(self._num_retries) - ) + + def _gen_mask(self): if self.cfg.AUG.MASK_TUBE: diff --git a/slowfast/datasets/loader.py b/slowfast/datasets/loader.py index c40977ab2..ff3963e72 100644 --- a/slowfast/datasets/loader.py +++ b/slowfast/datasets/loader.py @@ -18,6 +18,13 @@ from . import utils as utils from .build import build_dataset +## Adding safe_collate function +def safe_collate(batch): + batch = [b for b in batch if b is not None] + if len(batch) == 0: + return None + return default_collate(batch) + def multiple_samples_collate(batch, fold=False): """ @@ -121,7 +128,7 @@ def construct_loader(cfg, split, is_precise_bn=False): num_workers=cfg.DATA_LOADER.NUM_WORKERS, pin_memory=cfg.DATA_LOADER.PIN_MEMORY, drop_last=drop_last, - collate_fn=detection_collate if cfg.DETECTION.ENABLE else None, + collate_fn=safe_collate, worker_init_fn=utils.loader_worker_init_fn(dataset), ) else: @@ -138,10 +145,18 @@ def construct_loader(cfg, split, is_precise_bn=False): num_workers=cfg.DATA_LOADER.NUM_WORKERS, pin_memory=cfg.DATA_LOADER.PIN_MEMORY, worker_init_fn=utils.loader_worker_init_fn(dataset), + collate_func=safe_collate ) else: # Create a sampler for multi-process training sampler = utils.create_sampler(dataset, shuffle, cfg) + + # Define safe wrapper for multiple_samples_collate + def multiple_samples_safe(batch): + batch = [b for b in batch if b is not None] + if len(batch) == 0: + return None + return multiple_samples_collate(batch, fold="imagenet" in dataset_name) # Create a loader if cfg.DETECTION.ENABLE: collate_func = detection_collate @@ -154,11 +169,11 @@ def construct_loader(cfg, split, is_precise_bn=False): and split in ["train"] and not cfg.MODEL.MODEL_NAME == "ContrastiveModel" ): - collate_func = partial( - multiple_samples_collate, fold="imagenet" in dataset_name - ) + ## Changed here + collate_func = multiple_samples_collate else: - collate_func = None + collate_func = safe_collate + loader = torch.utils.data.DataLoader( dataset, batch_size=batch_size, diff --git a/slowfast/datasets/multigrid_helper.py b/slowfast/datasets/multigrid_helper.py index 4482b4f27..65ffedd3e 100644 --- a/slowfast/datasets/multigrid_helper.py +++ b/slowfast/datasets/multigrid_helper.py @@ -7,13 +7,11 @@ import torch from torch.utils.data.sampler import Sampler -TORCH_MAJOR = int(torch.__version__.split(".")[0]) -TORCH_MINOR = int(torch.__version__.split(".")[1]) - -if TORCH_MAJOR >= 1 and TORCH_MINOR >= 8: - _int_classes = int -else: +# Modified this import +try: from torch._six import int_classes as _int_classes +except ImportError: + _int_classes = (int,) class ShortCycleBatchSampler(Sampler): diff --git a/slowfast/datasets/my_custom_dataset.py b/slowfast/datasets/my_custom_dataset.py new file mode 100644 index 000000000..2a6075b75 --- /dev/null +++ b/slowfast/datasets/my_custom_dataset.py @@ -0,0 +1,8 @@ +from slowfast.datasets.build import DATASET_REGISTRY +from slowfast.datasets.kinetics import Kinetics + +@DATASET_REGISTRY.register() +class Custom(Kinetics): + def __init__(self, cfg, split): + super().__init__(cfg, split) + # You can override methods here if needed