Skip to content

Index Error when loading dataset #29

@nhuytan

Description

@nhuytan

I got an error when loading the dataset. Any suggestions on how to fix this?

IndexError: index 8389024 is out of bounds for axis 0 with size 66462

{
"name": "IndexError",
"message": "index 8389024 is out of bounds for axis 0 with size 66462",
"stack": "---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_22712\2632409830.py in
13 verbose=True,
14 exclude_genes=['Control1', 'Control2', 'Control3', 'Control4', 'Control5','Control6', 'Control7', 'Control8'],
---> 15 z=[-140, 600, 1200, 1810, 2420, 3000, 3600])
16
17 # Have olfactory bulb pointing left.

c:\users\anonymous\fishscale\FISHscale\utils\dataset.py in init(self, data, data_folder, unique_genes, MultiDataset_name, color_input, verbose, grid_layout, columns_layout, x_label, y_label, z_label, gene_label, other_columns, exclude_genes, z, pixel_size, x_offset, y_offset, z_offset, polygon, select_valid, reparse, parse_num_threads)
732 self.load_from_files(data, x_label, y_label, z_label, z, gene_label, other_columns, unique_genes, exclude_genes,
733 pixel_size, x_offset, y_offset, z_offset, polygon, select_valid, reparse, color_input,
--> 734 parse_num_threads)
735 else:
736 raise Exception(f'Input for "data" not understood. Should be list with initiated Datasets or valid path to files.')

c:\users\anonymous\fishscale\FISHscale\utils\dataset.py in load_from_files(self, filepath, x_label, y_label, z_label, z, gene_label, other_columns, unique_genes, exclude_genes, pixel_size, x_offset, y_offset, z_offset, polygon, select_valid, reparse, color_input, num_threads)
920 part_of_multidataset=True)
921 lazy_result.append(lr)
--> 922 futures = dask.persist(*lazy_result, num_workers=1, num_threads = num_threads)
923 self.datasets = dask.compute(*futures)
924 self.datasets_names = [d.dataset_name for d in self.datasets]

c:\Anaconda\envs\my_env\lib\site-packages\dask\base.py in persist(traverse, optimize_graph, scheduler, *args, **kwargs)
833 postpersists.append((rebuild, a_keys, state))
834
--> 835 results = schedule(dsk, keys, **kwargs)
836 d = dict(zip(keys, results))
837 results2 = [r({k: d[k] for k in ks}, *s) for r, ks, s in postpersists]

c:\Anaconda\envs\my_env\lib\site-packages\dask\threaded.py in get(dsk, result, cache, num_workers, pool, **kwargs)
87 get_id=_thread_get_id,
88 pack_exception=pack_exception,
---> 89 **kwargs,
90 )
91

c:\Anaconda\envs\my_env\lib\site-packages\dask\local.py in get_async(submit, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, chunksize, **kwargs)
504 _execute_task(task, data) # Re-execute locally
505 else:
--> 506 raise_exception(exc, tb)
507 res, worker_id = loads(res_info)
508 state["cache"][key] = res

c:\Anaconda\envs\my_env\lib\site-packages\dask\local.py in reraise(exc, tb)
312 if exc.traceback is not tb:
313 raise exc.with_traceback(tb)
--> 314 raise exc
315
316

c:\Anaconda\envs\my_env\lib\site-packages\dask\local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
217 try:
218 task, data = loads(task_info)
--> 219 result = _execute_task(task, data)
220 id = get_id()
221 result = dumps((result, id))

c:\Anaconda\envs\my_env\lib\site-packages\dask\core.py in _execute_task(arg, cache, dsk)
117 # temporaries by their reference count and can execute certain
118 # operations in-place.
--> 119 return func(*(_execute_task(a, cache) for a in args))
120 elif not ishashable(arg):
121 return arg

c:\Anaconda\envs\my_env\lib\site-packages\dask\utils.py in apply(func, args, kwargs)
37 def apply(func, args, kwargs=None):
38 if kwargs:
---> 39 return func(*args, **kwargs)
40 else:
41 return func(*args)

c:\users\anonymous\fishscale\FISHscale\utils\dataset.py in init(self, filename, x_label, y_label, z_label, z, gene_label, other_columns, unique_genes, exclude_genes, pixel_size, x_offset, y_offset, z_offset, polygon, select_valid, reparse, color_input, working_selection, verbose, part_of_multidataset, image)
206 self.load_data(self.filename, x_label, y_label, gene_label, self.other_columns, x_offset, y_offset, z_offset,
207 self.pixel_size.magnitude, unique_genes, exclude_genes, self.polygon, self.select_valid,
--> 208 reparse, z_label)
209
210 #Gene metadata

c:\users\anonymous\fishscale\FISHscale\utils\data_handling.py in load_data(self, filename, x_label, y_label, gene_label, other_columns, x_offset, y_offset, z_offset, pixel_size, unique_genes, exclude_genes, polygon, select_valid, reparse, z_label)
522 #Load selected genes
523 self.df = dd.read_parquet(filter_filelist)
--> 524 self.shape = (self.df.shape[0].compute(), self.df.shape[1])
525 else:
526 #Load all genes

c:\Anaconda\envs\my_env\lib\site-packages\dask\base.py in compute(self, **kwargs)
288 dask.base.compute
289 """
--> 290 (result,) = compute(self, traverse=False, **kwargs)
291 return result
292

c:\Anaconda\envs\my_env\lib\site-packages\dask\base.py in compute(traverse, optimize_graph, scheduler, get, *args, **kwargs)
571 postcomputes.append(x.dask_postcompute())
572
--> 573 results = schedule(dsk, keys, **kwargs)
574 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
575

c:\Anaconda\envs\my_env\lib\site-packages\dask\threaded.py in get(dsk, result, cache, num_workers, pool, **kwargs)
87 get_id=_thread_get_id,
88 pack_exception=pack_exception,
---> 89 **kwargs,
90 )
91

c:\Anaconda\envs\my_env\lib\site-packages\dask\local.py in get_async(submit, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, chunksize, **kwargs)
504 _execute_task(task, data) # Re-execute locally
505 else:
--> 506 raise_exception(exc, tb)
507 res, worker_id = loads(res_info)
508 state["cache"][key] = res

c:\Anaconda\envs\my_env\lib\site-packages\dask\local.py in reraise(exc, tb)
312 if exc.traceback is not tb:
313 raise exc.with_traceback(tb)
--> 314 raise exc
315
316

c:\Anaconda\envs\my_env\lib\site-packages\dask\local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
217 try:
218 task, data = loads(task_info)
--> 219 result = _execute_task(task, data)
220 id = get_id()
221 result = dumps((result, id))

c:\Anaconda\envs\my_env\lib\site-packages\dask\core.py in _execute_task(arg, cache, dsk)
117 # temporaries by their reference count and can execute certain
118 # operations in-place.
--> 119 return func(*(_execute_task(a, cache) for a in args))
120 elif not ishashable(arg):
121 return arg

c:\Anaconda\envs\my_env\lib\site-packages\dask\optimization.py in call(self, *args)
967 if not len(args) == len(self.inkeys):
968 raise ValueError("Expected %d args, got %d" % (len(self.inkeys), len(args)))
--> 969 return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))
970
971 def reduce(self):

c:\Anaconda\envs\my_env\lib\site-packages\dask\core.py in get(dsk, out, cache)
147 for key in toposort(dsk):
148 task = dsk[key]
--> 149 result = _execute_task(task, cache)
150 cache[key] = result
151 result = _execute_task(out, cache)

c:\Anaconda\envs\my_env\lib\site-packages\dask\core.py in _execute_task(arg, cache, dsk)
117 # temporaries by their reference count and can execute certain
118 # operations in-place.
--> 119 return func(*(_execute_task(a, cache) for a in args))
120 elif not ishashable(arg):
121 return arg

c:\Anaconda\envs\my_env\lib\site-packages\dask\dataframe\io\parquet\core.py in call(self, part)
95 self.columns,
96 self.index,
---> 97 self.common_kwargs,
98 )
99

c:\Anaconda\envs\my_env\lib\site-packages\dask\dataframe\io\parquet\core.py in read_parquet_part(fs, engine, meta, part, columns, index, kwargs)
501 dfs = [
502 func(fs, rg, columns.copy(), index, **toolz.merge(kwargs, kw))
--> 503 for (rg, kw) in part
504 ]
505 df = concat(dfs, axis=0) if len(dfs) > 1 else dfs[0]

c:\Anaconda\envs\my_env\lib\site-packages\dask\dataframe\io\parquet\core.py in (.0)
501 dfs = [
502 func(fs, rg, columns.copy(), index, **toolz.merge(kwargs, kw))
--> 503 for (rg, kw) in part
504 ]
505 df = concat(dfs, axis=0) if len(dfs) > 1 else dfs[0]

c:\Anaconda\envs\my_env\lib\site-packages\dask\dataframe\io\parquet\fastparquet.py in read_partition(cls, fs, pieces, columns, index, categories, root_cats, root_file_scheme, base_path, **kwargs)
1031 categories=categories,
1032 index=index,
-> 1033 **kwargs.get("read", {}),
1034 )
1035

c:\Anaconda\envs\my_env\lib\site-packages\dask\dataframe\io\parquet\fastparquet.py in pf_to_pandas(cls, pf, fs, columns, categories, index, open_file_options, **kwargs)
1125 partition_meta=pf.partition_meta,
1126 infile=infile,
-> 1127 **kwargs,
1128 )
1129 start += thislen

c:\Anaconda\envs\my_env\lib\site-packages\fastparquet\api.py in read_row_group_file(self, rg, columns, categories, index, assign, partition_meta, row_filter, infile)
363 selfmade=self.selfmade, index=index,
364 assign=assign, scheme=self.file_scheme, partition_meta=partition_meta,
--> 365 row_filter=row_filter
366 )
367 if ret:

c:\Anaconda\envs\my_env\lib\site-packages\fastparquet\core.py in read_row_group(file, rg, columns, categories, schema_helper, cats, selfmade, index, assign, scheme, partition_meta, row_filter)
607 raise RuntimeError('Going with pre-allocation!')
608 read_row_group_arrays(file, rg, columns, categories, schema_helper,
--> 609 cats, selfmade, assign=assign, row_filter=row_filter)
610
611 for cat in cats:

c:\Anaconda\envs\my_env\lib\site-packages\fastparquet\core.py in read_row_group_arrays(file, rg, columns, categories, schema_helper, cats, selfmade, assign, row_filter)
581 selfmade=selfmade, assign=out[name],
582 catdef=out.get(name+'-catdef', None),
--> 583 row_filter=row_filter)
584
585 if _is_map_like(schema_helper, column):

c:\Anaconda\envs\my_env\lib\site-packages\fastparquet\core.py in read_col(column, schema_helper, infile, use_cat, selfmade, assign, catdef, row_filter)
547 piece[:] = i.codes
548 elif d and not use_cat:
--> 549 piece[:] = dic[val]
550 elif not use_cat:
551 piece[:] = convert(val, se)

IndexError: index 8389024 is out of bounds for axis 0 with size 66462"
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions