Skip to content

Unknown error when using this in PyTorch DDP #4

@bchuh

Description

@bchuh

I'm doing PyTorch distributed data parallel (DDP) training, and I use a generator to traverse the EArray data in my HDF5 dataset. As soon as the program starts, I get a "RuntimeError: Unknown error type: 132 when handling execution of <_FuncPtr object at 0x7fb033f58280> with args (b'/nXmxo38xgBw=', 194, 384)". The code where multitables is involved is shown below.


class SignalTrainSetSpeedy(Dataset):
    """Training set that streams rows sequentially from an HDF5 file.

    The ``multitables.Streamer`` and its row generator are created lazily on
    the first ``__getitem__`` call instead of in ``__init__``. Each streamed
    row is split into its first ``INPUT_LEN`` values (the input) and its last
    ``TARGET_LEN`` values (the target).

    NOTE: ``__getitem__`` ignores ``index``; every call simply returns the
    next row produced by the stream generator.
    """

    # Number of leading values of a row that form the model input.
    INPUT_LEN = 16000
    # Number of trailing values of a row that form the target.
    TARGET_LEN = 61
    # Hard-coded number of rows reported by ``__len__``.
    NUM_ROWS = 2040151

    def __init__(self, augment='None'):
        """Store the configuration; no file is opened here.

        Args:
            augment: Augmentation setting, stored as ``self.aug``. It is not
                used anywhere else in this class.
        """
        super().__init__()
        self.aug = augment
        self.input_path = "/mnt/sdb/zzl/train_data_and_label.hdf5"
        # All three are populated together on the first __getitem__ call.
        self.stream = None
        self.gen = None
        self.data = None

    def _yield_data(self):
        """Yield every row produced by ``self.gen``."""
        yield from self.gen

    def __getitem__(self, index):
        """Return the next ``(input, target)`` pair from the stream.

        Args:
            index: Ignored; rows are consumed strictly in stream order.

        Returns:
            A tuple ``(input, target)`` of ``torch.Tensor`` objects built from
            the first ``INPUT_LEN`` and the last ``TARGET_LEN`` values of the
            row.

        Raises:
            StopIteration: Propagated from ``next`` once the underlying
                generator is exhausted.
        """
        if self.stream is None:
            # Open the streamer on first use rather than at construction time.
            self.stream = multitables.Streamer(filename=self.input_path)
            self.gen = self.stream.get_generator(path='/data')
            self.data = self._yield_data()
        row = next(self.data)
        features = torch.Tensor(row[:self.INPUT_LEN])
        target = torch.Tensor(row[-self.TARGET_LEN:])
        return features, target

    def __len__(self):
        """Return the hard-coded number of rows in the dataset."""
        return self.NUM_ROWS

The error message:

Traceback (most recent call last):
  File "train.py", line 244, in <module>
    next_batch = dataset_iter.next()
  File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 530, in __next__
Traceback (most recent call last):
  File "train.py", line 244, in <module>
    data = self._next_data()
  File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 570, in _next_data
    next_batch = dataset_iter.next()
      File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 530, in __next__
data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 49, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 49, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/mnt/sdf/zzl/FACEGOOD-Audio2Face/code/torch_train/signal_dataset.py", line 114, in __getitem__
    data = self._next_data()
  File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 570, in _next_data
    input_output = next(self.data)
  File "/mnt/sdf/zzl/FACEGOOD-Audio2Face/code/torch_train/signal_dataset.py", line 103, in _yield_data
    for row in self.gen:
  File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/multitables/streamer.py", line 247, in get_generator
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 49, in fetch
    q = self.get_queue(path=path, n_procs=n_procs, read_ahead=read_ahead, cyclic=cyclic, block_size=block_size, ordered=ordered, field=field, remainder=remainder)
      File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/multitables/streamer.py", line 195, in get_queue
data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 49, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/mnt/sdf/zzl/FACEGOOD-Audio2Face/code/torch_train/signal_dataset.py", line 114, in __getitem__
    stage_pool = stage.StagePool(dataset, block_size, read_ahead, timeout=0.1)
  File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/multitables/stage.py", line 104, in __init__
        self._stage_pool.append(StagePool.StagePoolWrapper(dataset.create_stage(stage_size), self))input_output = next(self.data)

  File "/mnt/sdf/zzl/FACEGOOD-Audio2Face/code/torch_train/signal_dataset.py", line 103, in _yield_data
  File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/multitables/dataset.py", line 64, in create_stage
    return stage.Stage(numpy_utils._calc_nbytes(self.dtype, self._fill_shape(shape)))
  File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/multitables/stage.py", line 29, in __init__
    for row in self.gen:
  File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/multitables/streamer.py", line 247, in get_generator
    self._shm_buf = shared_mem.SharedBuffer(map_id=None, size_nbytes=self.size_nbytes)
  File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/multitables/shared_mem.py", line 124, in __init__
    q = self.get_queue(path=path, n_procs=n_procs, read_ahead=read_ahead, cyclic=cyclic, block_size=block_size, ordered=ordered, field=field, remainder=remainder)
  File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/multitables/streamer.py", line 195, in get_queue
    self._fd = _shm_open(map_id, os.O_CREAT | os.O_EXCL | os.O_RDWR, mode=0o600)
  File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/multitables/shared_mem.py", line 64, in _shm_open
        stage_pool = stage.StagePool(dataset, block_size, read_ahead, timeout=0.1)return _posixshmlib.shm_open(name, access, mode)

  File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/multitables/stage.py", line 104, in __init__
  File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/multitables/shared_mem.py", line 46, in _handle_errno
    raise RuntimeError("Unknown error type: {} when handling execution of {} with args {}".format(erno, func, args))
    self._stage_pool.append(StagePool.StagePoolWrapper(dataset.create_stage(stage_size), self))
  File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/multitables/dataset.py", line 64, in create_stage
RuntimeError: Unknown error type: 132 when handling execution of <_FuncPtr object at 0x7f00263a2280> with args (b'/nXmxo38xgBw=', 194, 384)
    return stage.Stage(numpy_utils._calc_nbytes(self.dtype, self._fill_shape(shape)))
  File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/multitables/stage.py", line 29, in __init__
    self._shm_buf = shared_mem.SharedBuffer(map_id=None, size_nbytes=self.size_nbytes)
  File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/multitables/shared_mem.py", line 124, in __init__
    self._fd = _shm_open(map_id, os.O_CREAT | os.O_EXCL | os.O_RDWR, mode=0o600)
  File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/multitables/shared_mem.py", line 64, in _shm_open
    return _posixshmlib.shm_open(name, access, mode)
  File "/home/zhuzengliang/anaconda3/envs/pytorch1/lib/python3.8/site-packages/multitables/shared_mem.py", line 46, in _handle_errno
    raise RuntimeError("Unknown error type: {} when handling execution of {} with args {}".format(erno, func, args))
RuntimeError: Unknown error type: 132 when handling execution of <_FuncPtr object at 0x7f832a6a3280> with args (b'/nXmxo38xgBw=', 194, 384)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions