From 93a197c319ce2e27302d9c3b9ec4752ddf148e7c Mon Sep 17 00:00:00 2001
From: Taylor Salo
Date: Mon, 25 Apr 2022 09:48:54 -0400
Subject: [PATCH 01/18] Create DatasetSearcher class.

---
 nimare/dataset.py | 1011 +++++++++++++++++++++++----------------------
 1 file changed, 511 insertions(+), 500 deletions(-)

diff --git a/nimare/dataset.py b/nimare/dataset.py
index 4ba6893df..b98c20bba 100755
--- a/nimare/dataset.py
+++ b/nimare/dataset.py
@@ -26,639 +26,650 @@
 LGR = logging.getLogger(__name__)
 
-class Dataset(NiMAREBase):
-    """Storage container for a coordinate- and/or image-based meta-analytic dataset/database.
+class DatasetSearcher(NiMAREBase):
+    """A tool for searching Datasets."""
 
-    .. versionchanged:: 0.0.9
+    def get(self, dataset, dict_, drop_invalid=True):
+        """Retrieve files and/or metadata from a Dataset.
 
-        * [ENH] Add merge method to Dataset class
+        Parameters
+        ----------
+        dict_ : :obj:`dict`
+            Dictionary specifying images or metadata to collect.
+            Keys should be variables to be used as keys for results dictionary.
+            Values should be tuples with two values:
+            type (e.g., 'image' or 'metadata') and specific field corresponding
+            to column of type-specific DataFrame (e.g., 'z' or 'sample_sizes').
+        drop_invalid : :obj:`bool`, optional
+            Whether to automatically ignore any studies without the required data or not.
+            Default is True.
 
-    .. versionchanged:: 0.0.8
+        Returns
+        -------
+        results : :obj:`dict`
+            A dictionary of lists of requested data. Keys correspond to the keys in ``dict_``.
 
-    * [FIX] Set ``nimare.dataset.Dataset.basepath`` in :func:`update_path` using absolute path.
+        Examples
+        --------
+        >>> dset.get({'z_maps': ('image', 'z'), 'sample_sizes': ('metadata', 'sample_sizes')})
+        >>> dset.get({'coordinates': ('coordinates', None)})
+        """
+        results = {}
+        results["id"] = dataset.ids
+        keep_idx = np.arange(len(dataset.ids), dtype=int)
+        for k, vals in dict_.items():
+            if vals[0] == "image":
+                temp = dataset.get_images(imtype=vals[1])
+            elif vals[0] == "metadata":
+                temp = dataset.get_metadata(field=vals[1])
+            elif vals[0] == "coordinates":
+                # Break DataFrame down into a list of study-specific DataFrames
+                temp = [
+                    dataset.coordinates.loc[dataset.coordinates["id"] == id_]
+                    for id_ in dataset.ids
+                ]
+                # Replace empty DataFrames with Nones
+                temp = [t if t.size else None for t in temp]
+            elif vals[0] == "annotations":
+                # Break DataFrame down into a list of study-specific DataFrames
+                temp = [
+                    dataset.annotations.loc[dataset.annotations["id"] == id_]
+                    for id_ in dataset.ids
+                ]
+                # Replace empty DataFrames with Nones
+                temp = [t if t.size else None for t in temp]
+            else:
+                raise ValueError(f"Input '{vals[0]}' not understood.")
 
-    Parameters
-    ----------
-    source : :obj:`str` or :obj:`dict`
-        JSON file containing dictionary with database information or the dict()
-        object
+            results[k] = temp
+            temp_keep_idx = np.where([t is not None for t in temp])[0]
+            keep_idx = np.intersect1d(keep_idx, temp_keep_idx)
 
-    target : :obj:`str`, optional
-        Desired coordinate space for coordinates. Names follow NIDM convention.
-        Default is 'mni152_2mm' (MNI space with 2x2x2 voxels).
-        This parameter has no impact on images.
+        # reduce
+        if drop_invalid and (len(keep_idx) != len(dataset.ids)):
+            LGR.info(f"Retaining {len(keep_idx)}/{len(dataset.ids)} studies")
+        elif len(keep_idx) != len(dataset.ids):
+            raise Exception(
+                f"Only {len(keep_idx)}/{len(dataset.ids)} in Dataset contain the necessary data. 
" + "If you want to analyze the subset of studies with required data, " + "set `drop_invalid` to True." + ) - mask : :obj:`str`, :class:`~nibabel.nifti1.Nifti1Image`, \ - :class:`~nilearn.input_data.NiftiMasker` or similar, or None, optional - Mask(er) to use. If None, uses the target space image, with all - non-zero voxels included in the mask. + for k in results: + results[k] = [results[k][i] for i in keep_idx] + if dict_.get(k, [None])[0] in ("coordinates", "annotations"): + results[k] = pd.concat(results[k]) - Attributes - ---------- - space : :obj:`str` - Standard space. Same as ``target`` parameter. + return results - Notes - ----- - Images loaded into a Dataset are assumed to be in the same space. - If images have different resolutions or affines from the Dataset's masker, - then they will be resampled automatically, at the point where they're used, - by :obj:`Dataset.masker`. - """ + def _generic_column_getter(self, dataset, attr, ids=None, column=None, ignore_columns=None): + """Extract information from DataFrame-based attributes. - _id_cols = ["id", "study_id", "contrast_id"] + Parameters + ---------- + attr : :obj:`str` + The name of the DataFrame-format Dataset attribute to search. + ids : :obj:`list` or None, optional + A list of study IDs within which to extract values. + If None, extract values for all studies in the Dataset. + Default is None. + column : :obj:`str` or None, optional + The column from which to extract values. + If None, a list of all columns with valid values will be returned. + Must be a column within Dataset.[attr]. + ignore_columns : :obj:`list` or None, optional + A list of columns to ignore. Only used if ``column`` is None. - def __init__(self, source, target="mni152_2mm", mask=None): - if isinstance(source, str): - with open(source, "r") as f_obj: - data = json.load(f_obj) - elif isinstance(source, dict): - data = source + Returns + ------- + result : :obj:`list` or :obj:`str` + A list of values or a string, depending on if ids is a list (or None) or a string. 
+ """ + if ignore_columns is None: + ignore_columns = dataset._id_cols else: - raise Exception("`source` needs to be a file path or a dictionary") + ignore_columns += dataset._id_cols - # Datasets are organized by study, then experiment - # To generate unique IDs, we combine study ID with experiment ID - # build list of ids - id_columns = ["id", "study_id", "contrast_id"] - all_ids = [] - for pid in data.keys(): - for expid in data[pid]["contrasts"].keys(): - id_ = f"{pid}-{expid}" - all_ids.append([id_, pid, expid]) - id_df = pd.DataFrame(columns=id_columns, data=all_ids) - id_df = id_df.set_index("id", drop=False) - self._ids = id_df.index.values + df = getattr(dataset, attr) + return_first = False - # Set up Masker - if mask is None: - mask = get_template(target, mask="brain") - self.masker = mask - self.space = target + if isinstance(ids, str) and column is not None: + return_first = True + ids = _listify(ids) - self.annotations = _dict_to_df(id_df, data, key="labels") - self.coordinates = _dict_to_coordinates(data, masker=self.masker, space=self.space) - self.images = _dict_to_df(id_df, data, key="images") - self.metadata = _dict_to_df(id_df, data, key="metadata") - self.texts = _dict_to_df(id_df, data, key="text") - self.basepath = None + available_types = [c for c in df.columns if c not in dataset._id_cols] + if (column is not None) and (column not in available_types): + raise ValueError( + f"{column} not found in {attr}.\nAvailable types: {', '.join(available_types)}" + ) - def __repr__(self): - """Show basic Dataset representation. + if column is not None: + if ids is not None: + result = df[column].loc[df["id"].isin(ids)].tolist() + else: + result = df[column].tolist() + else: + if ids is not None: + result = {v: df[v].loc[df["id"].isin(ids)].tolist() for v in available_types} + result = {k: v for k, v in result.items() if any(v)} + else: + result = {v: df[v].tolist() for v in available_types} + result = list(result.keys()) - It's basically the same as the NiMAREBase representation, but with the number of - experiments in the Dataset represented as well. + if return_first: + return result[0] + else: + return result + + def get_labels(self, dataset, ids=None): + """Extract list of labels for which studies in Dataset have annotations. + + Parameters + ---------- + ids : :obj:`list`, optional + A list of IDs in the Dataset for which to find labels. Default is + None, in which case all labels are returned. + + Returns + ------- + labels : :obj:`list` + List of labels for which there are annotations in the Dataset. 
""" - # Get default parameter values for the object - signature = inspect.signature(self.__init__) - defaults = { - k: v.default - for k, v in signature.parameters.items() - if v.default is not inspect.Parameter.empty - } + if not isinstance(ids, list) and ids is not None: + ids = _listify(ids) - # Eliminate any sub-parameters (e.g., parameters for a MetaEstimator's KernelTransformer), - # as well as default values - params = self.get_params() - params = {k: v for k, v in params.items() if "__" not in k} - # Parameter "target" is stored as attribute "space" - # and we want to show it regardless of whether it's the default or not - params["space"] = self.space - params.pop("target") - params = {k: v for k, v in params.items() if defaults.get(k) != v} + result = [c for c in dataset.annotations.columns if c not in dataset._id_cols] + if ids is not None: + temp_annotations = dataset.annotations.loc[dataset.annotations["id"].isin(ids)] + res = temp_annotations[result].any(axis=0) + result = res.loc[res].index.tolist() - # Convert to strings - param_strs = [] - for k, v in params.items(): - if isinstance(v, str): - # Wrap string values in single quotes - param_str = f"{k}='{v}'" - else: - # Keep everything else as-is based on its own repr - param_str = f"{k}={v}" - param_strs.append(param_str) + return result - params_str = ", ".join(param_strs) - params_str = f"{len(self.ids)} experiments{', ' if params_str else ''}{params_str}" - rep = f"{self.__class__.__name__}({params_str})" - return rep + def get_texts(self, dataset, ids=None, text_type=None): + """Extract list of texts of a given type for selected IDs. - @property - def ids(self): - """numpy.ndarray: 1D array of identifiers in Dataset. + Parameters + ---------- + ids : :obj:`list`, optional + A list of IDs in the Dataset for which to find texts. Default is + None, in which case all texts of requested type are returned. + text_type : :obj:`str`, optional + Type of text to extract. Corresponds to column name in + Dataset.texts DataFrame. Default is None. - The associated setter for this property is private, as ``Dataset.ids`` is immutable. + Returns + ------- + texts : :obj:`list` + List of texts of requested type for selected IDs. """ - return self.__ids + result = self._generic_column_getter(dataset, "texts", ids=ids, column=text_type) + return result - @ids.setter - def _ids(self, ids): - ids = np.sort(np.asarray(ids)) - assert isinstance(ids, np.ndarray) and ids.ndim == 1 - self.__ids = ids + def get_metadata(self, dataset, ids=None, field=None): + """Get metadata from Dataset. - @property - def masker(self): - """:class:`nilearn.input_data.NiftiMasker` or similar: Masker object. + Parameters + ---------- + ids : :obj:`list`, optional + A list of IDs in the Dataset for which to find metadata. Default is + None, in which case all metadata of requested type are returned. + field : :obj:`str`, optional + Metadata field to extract. Corresponds to column name in + Dataset.metadata DataFrame. Default is None. - Defines the space and location of the area of interest (e.g., 'brain'). + Returns + ------- + metadata : :obj:`list` + List of values of requested type for selected IDs. 
""" - return self.__masker + result = dataset._generic_column_getter(dataset, "metadata", ids=ids, column=field) + return result - @masker.setter - def masker(self, mask): - mask = get_masker(mask) - if hasattr(self, "masker") and not np.array_equal( - self.masker.mask_img.affine, mask.mask_img.affine - ): - # This message does not have an associated effect, - # since matrix indices are calculated as necessary - LGR.warning("New masker does not match old masker. Space is assumed to be the same.") - - self.__masker = mask + def get_images(self, dataset, ids=None, imtype=None): + """Get images of a certain type for a subset of studies in the dataset. - @property - def annotations(self): - """:class:`pandas.DataFrame`: Labels describing studies in the dataset. + Parameters + ---------- + ids : :obj:`list`, optional + A list of IDs in the Dataset for which to find images. Default is + None, in which case all images of requested type are returned. + imtype : :obj:`str`, optional + Type of image to extract. Corresponds to column name in + Dataset.images DataFrame. Default is None. - Each study/experiment has its own row. - Columns correspond to individual labels (e.g., 'emotion'), and may - be prefixed with a feature group including two underscores - (e.g., 'Neurosynth_TFIDF__emotion'). + Returns + ------- + images : :obj:`list` + List of images of requested type for selected IDs. """ - return self.__annotations - - @annotations.setter - def annotations(self, df): - _validate_df(df) - self.__annotations = df.sort_values(by="id") + ignore_columns = ["space"] + ignore_columns += [c for c in dataset.images.columns if c.endswith("__relative")] + result = self._generic_column_getter( + dataset, + "images", + ids=ids, + column=imtype, + ignore_columns=ignore_columns, + ) + return result - @property - def coordinates(self): - """:class:`pandas.DataFrame`: Coordinates in the dataset. + def get_studies_by_label(self, dataset, labels=None, label_threshold=0.001): + """Extract list of studies with a given label. .. versionchanged:: 0.0.10 - The coordinates attribute no longer includes the associated matrix indices - (columns 'i', 'j', and 'k'). These columns are calculated as needed. - - Each study has one row for each peak. - Columns include ['x', 'y', 'z'] (peak locations in mm) and 'space' (Dataset's space). - """ - return self.__coordinates - - @coordinates.setter - def coordinates(self, df): - _validate_df(df) - self.__coordinates = df.sort_values(by="id") - - @property - def images(self): - """:class:`pandas.DataFrame`: Images in the dataset. - - Each image type has its own column (e.g., 'z') with absolute paths to - files and each study has its own row. - Additionally, relative paths to image files are stored in columns with - the suffix '__relative' (e.g., 'z__relative'). + Fix bug in which all IDs were returned when a label wasn't present in the Dataset. - Warnings - -------- - Images are assumed to be in the same space, although they may have - different resolutions and affines. Images will be resampled as needed - at the point where they are used, via :obj:`Dataset.masker`. - """ - return self.__images + .. versionchanged:: 0.0.9 - @images.setter - def images(self, df): - _validate_df(df) - self.__images = _validate_images_df(df).sort_values(by="id") + Default value for label_threshold changed to 0.001. - @property - def metadata(self): - """:class:`pandas.DataFrame`: Metadata describing studies in the dataset. 
+ Parameters + ---------- + labels : :obj:`list`, optional + List of labels to use to search Dataset. If a contrast has all of + the labels above the threshold, it will be returned. + Default is None. + label_threshold : :obj:`float`, optional + Default is 0.5. - Each metadata field has its own column (e.g., 'sample_sizes') and each study - has its own row. + Returns + ------- + found_ids : :obj:`list` + A list of IDs from the Dataset found by the search criteria. """ - return self.__metadata - - @metadata.setter - def metadata(self, df): - _validate_df(df) - self.__metadata = df.sort_values(by="id") + if isinstance(labels, str): + labels = [labels] + elif not isinstance(labels, list): + raise ValueError(f"Argument 'labels' cannot be {type(labels)}") - @property - def texts(self): - """:class:`pandas.DataFrame`: Texts in the dataset. + missing_labels = [label for label in labels if label not in dataset.annotations.columns] + if missing_labels: + raise ValueError(f"Missing label(s): {', '.join(missing_labels)}") - Each text type has its own column (e.g., 'abstract') and each study - has its own row. - """ - return self.__texts + temp_annotations = dataset.annotations[dataset._id_cols + labels] + found_rows = (temp_annotations[labels] >= label_threshold).all(axis=1) + if any(found_rows): + found_ids = temp_annotations.loc[found_rows, "id"].tolist() + else: + found_ids = [] - @texts.setter - def texts(self, df): - _validate_df(df) - self.__texts = df.sort_values(by="id") + return found_ids - def slice(self, ids): - """Create a new dataset with only requested IDs. + def get_studies_by_mask(self, dataset, mask): + """Extract list of studies with at least one coordinate in mask. Parameters ---------- - ids : array_like - List of study IDs to include in new dataset + mask : img_like + Mask across which to search for coordinates. Returns ------- - new_dset : :obj:`~nimare.dataset.Dataset` - Reduced Dataset containing only requested studies. + found_ids : :obj:`list` + A list of IDs from the Dataset with at least one focus in the mask. """ - new_dset = copy.deepcopy(self) - new_dset._ids = ids - for attribute in ("annotations", "coordinates", "images", "metadata", "texts"): - df = getattr(new_dset, attribute) - df = df.loc[df["id"].isin(ids)] - setattr(new_dset, attribute, df) + from scipy.spatial.distance import cdist - return new_dset + mask = load_niimg(mask) - def merge(self, right): - """Merge two Datasets. + dset_mask = dataset.masker.mask_img + if not np.array_equal(dset_mask.affine, mask.affine): + LGR.warning("Mask affine does not match Dataset affine. Assuming same space.") - .. versionadded:: 0.0.9 + dset_ijk = mm2vox(dataset.coordinates[["x", "y", "z"]].values, mask.affine) + mask_ijk = np.vstack(np.where(mask.get_fdata())).T + distances = cdist(mask_ijk, dset_ijk) + distances = np.any(distances == 0, axis=0) + found_ids = list(dataset.coordinates.loc[distances, "id"].unique()) + return found_ids + + def get_studies_by_coordinate(self, dataset, xyz, r=20): + """Extract list of studies with at least one focus within radius of requested coordinates. Parameters ---------- - right : :obj:`~nimare.dataset.Dataset` - Dataset to merge with. + xyz : (X x 3) array_like + List of coordinates against which to find studies. + r : :obj:`float`, optional + Radius (in mm) within which to find studies. Default is 20mm. Returns ------- - :obj:`~nimare.dataset.Dataset` - A Dataset of the two merged Datasets. 
+ found_ids : :obj:`list` + A list of IDs from the Dataset with at least one focus within + radius r of requested coordinates. """ - assert isinstance(right, Dataset) - shared_ids = np.intersect1d(self.ids, right.ids) - if shared_ids.size: - raise Exception("Duplicate IDs detected in both datasets.") + from scipy.spatial.distance import cdist - all_ids = np.concatenate((self.ids, right.ids)) - new_dset = copy.deepcopy(self) - new_dset._ids = all_ids + xyz = np.array(xyz) + assert xyz.shape[1] == 3 and xyz.ndim == 2 + distances = cdist(xyz, dataset.coordinates[["x", "y", "z"]].values) + distances = np.any(distances <= r, axis=0) + found_ids = list(dataset.coordinates.loc[distances, "id"].unique()) + return found_ids - for attribute in ("annotations", "coordinates", "images", "metadata", "texts"): - df1 = getattr(self, attribute) - df2 = getattr(right, attribute) - new_df = df1.append(df2, ignore_index=True, sort=False) - new_df.sort_values(by="id", inplace=True) - new_df.reset_index(drop=True, inplace=True) - new_df = new_df.where(~new_df.isna(), None) - setattr(new_dset, attribute, new_df) - new_dset.coordinates = _transform_coordinates_to_space( - new_dset.coordinates, - self.masker, - self.space, - ) +class Dataset(NiMAREBase): + """Storage container for a coordinate- and/or image-based meta-analytic dataset/database. - return new_dset + .. versionchanged:: 0.0.9 - def update_path(self, new_path): - """Update paths to images. + * [ENH] Add merge method to Dataset class - Prepends new path to the relative path for files in Dataset.images. + .. versionchanged:: 0.0.8 - Parameters - ---------- - new_path : :obj:`str` - Path to prepend to relative paths of files in Dataset.images. - """ - self.basepath = op.abspath(new_path) - df = self.images - relative_path_cols = [c for c in df if c.endswith("__relative")] - for col in relative_path_cols: - abs_col = col.replace("__relative", "") - if abs_col in df.columns: - LGR.info(f"Overwriting images column {abs_col}") - df[abs_col] = df[col].apply(_try_prepend, prefix=self.basepath) - self.images = df + * [FIX] Set ``nimare.dataset.Dataset.basepath`` in :func:`update_path` using absolute path. - def copy(self): - """Create a copy of the Dataset.""" - return copy.deepcopy(self) + Parameters + ---------- + source : :obj:`str` or :obj:`dict` + JSON file containing dictionary with database information or the dict() + object - def get(self, dict_, drop_invalid=True): - """Retrieve files and/or metadata from the current Dataset. + target : :obj:`str`, optional + Desired coordinate space for coordinates. Names follow NIDM convention. + Default is 'mni152_2mm' (MNI space with 2x2x2 voxels). + This parameter has no impact on images. - Parameters - ---------- - dict_ : :obj:`dict` - Dictionary specifying images or metadata to collect. - Keys should be variables to be used as keys for results dictionary. - Values should be tuples with two values: - type (e.g., 'image' or 'metadata') and specific field corresponding - to column of type-specific DataFrame (e.g., 'z' or 'sample_sizes'). - drop_invalid : :obj:`bool`, optional - Whether to automatically ignore any studies without the required data or not. - Default is False. + mask : :obj:`str`, :class:`~nibabel.nifti1.Nifti1Image`, \ + :class:`~nilearn.input_data.NiftiMasker` or similar, or None, optional + Mask(er) to use. If None, uses the target space image, with all + non-zero voxels included in the mask. - Returns - ------- - results : :obj:`dict` - A dictionary of lists of requested data. 
Keys correspond to the keys in ``dict_``. + Attributes + ---------- + space : :obj:`str` + Standard space. Same as ``target`` parameter. - Examples - -------- - >>> dset.get({'z_maps': ('image', 'z'), 'sample_sizes': ('metadata', 'sample_sizes')}) - >>> dset.get({'coordinates': ('coordinates', None)}) - """ - results = {} - results["id"] = self.ids - keep_idx = np.arange(len(self.ids), dtype=int) - for k, vals in dict_.items(): - if vals[0] == "image": - temp = self.get_images(imtype=vals[1]) - elif vals[0] == "metadata": - temp = self.get_metadata(field=vals[1]) - elif vals[0] == "coordinates": - # Break DataFrame down into a list of study-specific DataFrames - temp = [self.coordinates.loc[self.coordinates["id"] == id_] for id_ in self.ids] - # Replace empty DataFrames with Nones - temp = [t if t.size else None for t in temp] - elif vals[0] == "annotations": - # Break DataFrame down into a list of study-specific DataFrames - temp = [self.annotations.loc[self.annotations["id"] == id_] for id_ in self.ids] - # Replace empty DataFrames with Nones - temp = [t if t.size else None for t in temp] - else: - raise ValueError(f"Input '{vals[0]}' not understood.") + Notes + ----- + Images loaded into a Dataset are assumed to be in the same space. + If images have different resolutions or affines from the Dataset's masker, + then they will be resampled automatically, at the point where they're used, + by :obj:`Dataset.masker`. + """ - results[k] = temp - temp_keep_idx = np.where([t is not None for t in temp])[0] - keep_idx = np.intersect1d(keep_idx, temp_keep_idx) + _id_cols = ["id", "study_id", "contrast_id"] - # reduce - if drop_invalid and (len(keep_idx) != len(self.ids)): - LGR.info(f"Retaining {len(keep_idx)}/{len(self.ids)} studies") - elif len(keep_idx) != len(self.ids): - raise Exception( - f"Only {len(keep_idx)}/{len(self.ids)} in Dataset contain the necessary data. " - "If you want to analyze the subset of studies with required data, " - "set `drop_invalid` to True." - ) + def __init__(self, source, target="mni152_2mm", mask=None): + if isinstance(source, str): + with open(source, "r") as f_obj: + data = json.load(f_obj) + elif isinstance(source, dict): + data = source + else: + raise Exception("`source` needs to be a file path or a dictionary") - for k in results: - results[k] = [results[k][i] for i in keep_idx] - if dict_.get(k, [None])[0] in ("coordinates", "annotations"): - results[k] = pd.concat(results[k]) + # Datasets are organized by study, then experiment + # To generate unique IDs, we combine study ID with experiment ID + # build list of ids + id_columns = ["id", "study_id", "contrast_id"] + all_ids = [] + for pid in data.keys(): + for expid in data[pid]["contrasts"].keys(): + id_ = f"{pid}-{expid}" + all_ids.append([id_, pid, expid]) + id_df = pd.DataFrame(columns=id_columns, data=all_ids) + id_df = id_df.set_index("id", drop=False) + self._ids = id_df.index.values - return results + # Set up Masker + if mask is None: + mask = get_template(target, mask="brain") + self.masker = mask + self.space = target - def _generic_column_getter(self, attr, ids=None, column=None, ignore_columns=None): - """Extract information from DataFrame-based attributes. 
+ self.annotations = _dict_to_df(id_df, data, key="labels") + self.coordinates = _dict_to_coordinates(data, masker=self.masker, space=self.space) + self.images = _dict_to_df(id_df, data, key="images") + self.metadata = _dict_to_df(id_df, data, key="metadata") + self.texts = _dict_to_df(id_df, data, key="text") + self.basepath = None - Parameters - ---------- - attr : :obj:`str` - The name of the DataFrame-format Dataset attribute to search. - ids : :obj:`list` or None, optional - A list of study IDs within which to extract values. - If None, extract values for all studies in the Dataset. - Default is None. - column : :obj:`str` or None, optional - The column from which to extract values. - If None, a list of all columns with valid values will be returned. - Must be a column within Dataset.[attr]. - ignore_columns : :obj:`list` or None, optional - A list of columns to ignore. Only used if ``column`` is None. + def __repr__(self): + """Show basic Dataset representation. - Returns - ------- - result : :obj:`list` or :obj:`str` - A list of values or a string, depending on if ids is a list (or None) or a string. + It's basically the same as the NiMAREBase representation, but with the number of + experiments in the Dataset represented as well. """ - if ignore_columns is None: - ignore_columns = self._id_cols - else: - ignore_columns += self._id_cols + # Get default parameter values for the object + signature = inspect.signature(self.__init__) + defaults = { + k: v.default + for k, v in signature.parameters.items() + if v.default is not inspect.Parameter.empty + } - df = getattr(self, attr) - return_first = False + # Eliminate any sub-parameters (e.g., parameters for a MetaEstimator's KernelTransformer), + # as well as default values + params = self.get_params() + params = {k: v for k, v in params.items() if "__" not in k} + # Parameter "target" is stored as attribute "space" + # and we want to show it regardless of whether it's the default or not + params["space"] = self.space + params.pop("target") + params = {k: v for k, v in params.items() if defaults.get(k) != v} - if isinstance(ids, str) and column is not None: - return_first = True - ids = _listify(ids) + # Convert to strings + param_strs = [] + for k, v in params.items(): + if isinstance(v, str): + # Wrap string values in single quotes + param_str = f"{k}='{v}'" + else: + # Keep everything else as-is based on its own repr + param_str = f"{k}={v}" + param_strs.append(param_str) - available_types = [c for c in df.columns if c not in self._id_cols] - if (column is not None) and (column not in available_types): - raise ValueError( - f"{column} not found in {attr}.\nAvailable types: {', '.join(available_types)}" - ) + params_str = ", ".join(param_strs) + params_str = f"{len(self.ids)} experiments{', ' if params_str else ''}{params_str}" + rep = f"{self.__class__.__name__}({params_str})" + return rep - if column is not None: - if ids is not None: - result = df[column].loc[df["id"].isin(ids)].tolist() - else: - result = df[column].tolist() - else: - if ids is not None: - result = {v: df[v].loc[df["id"].isin(ids)].tolist() for v in available_types} - result = {k: v for k, v in result.items() if any(v)} - else: - result = {v: df[v].tolist() for v in available_types} - result = list(result.keys()) + @property + def ids(self): + """numpy.ndarray: 1D array of identifiers in Dataset. - if return_first: - return result[0] - else: - return result + The associated setter for this property is private, as ``Dataset.ids`` is immutable. 
+ """ + return self.__ids + + @ids.setter + def _ids(self, ids): + ids = np.sort(np.asarray(ids)) + assert isinstance(ids, np.ndarray) and ids.ndim == 1 + self.__ids = ids + + @property + def masker(self): + """:class:`nilearn.input_data.NiftiMasker` or similar: Masker object. + + Defines the space and location of the area of interest (e.g., 'brain'). + """ + return self.__masker - def get_labels(self, ids=None): - """Extract list of labels for which studies in Dataset have annotations. + @masker.setter + def masker(self, mask): + mask = get_masker(mask) + if hasattr(self, "masker") and not np.array_equal( + self.masker.mask_img.affine, mask.mask_img.affine + ): + # This message does not have an associated effect, + # since matrix indices are calculated as necessary + LGR.warning("New masker does not match old masker. Space is assumed to be the same.") - Parameters - ---------- - ids : :obj:`list`, optional - A list of IDs in the Dataset for which to find labels. Default is - None, in which case all labels are returned. + self.__masker = mask - Returns - ------- - labels : :obj:`list` - List of labels for which there are annotations in the Dataset. + @property + def annotations(self): + """:class:`pandas.DataFrame`: Labels describing studies in the dataset. + + Each study/experiment has its own row. + Columns correspond to individual labels (e.g., 'emotion'), and may + be prefixed with a feature group including two underscores + (e.g., 'Neurosynth_TFIDF__emotion'). """ - if not isinstance(ids, list) and ids is not None: - ids = _listify(ids) + return self.__annotations - result = [c for c in self.annotations.columns if c not in self._id_cols] - if ids is not None: - temp_annotations = self.annotations.loc[self.annotations["id"].isin(ids)] - res = temp_annotations[result].any(axis=0) - result = res.loc[res].index.tolist() + @annotations.setter + def annotations(self, df): + _validate_df(df) + self.__annotations = df.sort_values(by="id") - return result + @property + def coordinates(self): + """:class:`pandas.DataFrame`: Coordinates in the dataset. - def get_texts(self, ids=None, text_type=None): - """Extract list of texts of a given type for selected IDs. + .. versionchanged:: 0.0.10 - Parameters - ---------- - ids : :obj:`list`, optional - A list of IDs in the Dataset for which to find texts. Default is - None, in which case all texts of requested type are returned. - text_type : :obj:`str`, optional - Type of text to extract. Corresponds to column name in - Dataset.texts DataFrame. Default is None. + The coordinates attribute no longer includes the associated matrix indices + (columns 'i', 'j', and 'k'). These columns are calculated as needed. - Returns - ------- - texts : :obj:`list` - List of texts of requested type for selected IDs. + Each study has one row for each peak. + Columns include ['x', 'y', 'z'] (peak locations in mm) and 'space' (Dataset's space). """ - result = self._generic_column_getter("texts", ids=ids, column=text_type) - return result + return self.__coordinates - def get_metadata(self, ids=None, field=None): - """Get metadata from Dataset. + @coordinates.setter + def coordinates(self, df): + _validate_df(df) + self.__coordinates = df.sort_values(by="id") - Parameters - ---------- - ids : :obj:`list`, optional - A list of IDs in the Dataset for which to find metadata. Default is - None, in which case all metadata of requested type are returned. - field : :obj:`str`, optional - Metadata field to extract. Corresponds to column name in - Dataset.metadata DataFrame. 
Default is None. + @property + def images(self): + """:class:`pandas.DataFrame`: Images in the dataset. - Returns - ------- - metadata : :obj:`list` - List of values of requested type for selected IDs. + Each image type has its own column (e.g., 'z') with absolute paths to + files and each study has its own row. + Additionally, relative paths to image files are stored in columns with + the suffix '__relative' (e.g., 'z__relative'). + + Warnings + -------- + Images are assumed to be in the same space, although they may have + different resolutions and affines. Images will be resampled as needed + at the point where they are used, via :obj:`Dataset.masker`. """ - result = self._generic_column_getter("metadata", ids=ids, column=field) - return result + return self.__images - def get_images(self, ids=None, imtype=None): - """Get images of a certain type for a subset of studies in the dataset. + @images.setter + def images(self, df): + _validate_df(df) + self.__images = _validate_images_df(df).sort_values(by="id") - Parameters - ---------- - ids : :obj:`list`, optional - A list of IDs in the Dataset for which to find images. Default is - None, in which case all images of requested type are returned. - imtype : :obj:`str`, optional - Type of image to extract. Corresponds to column name in - Dataset.images DataFrame. Default is None. + @property + def metadata(self): + """:class:`pandas.DataFrame`: Metadata describing studies in the dataset. - Returns - ------- - images : :obj:`list` - List of images of requested type for selected IDs. + Each metadata field has its own column (e.g., 'sample_sizes') and each study + has its own row. """ - ignore_columns = ["space"] - ignore_columns += [c for c in self.images.columns if c.endswith("__relative")] - result = self._generic_column_getter( - "images", - ids=ids, - column=imtype, - ignore_columns=ignore_columns, - ) - return result + return self.__metadata - def get_studies_by_label(self, labels=None, label_threshold=0.001): - """Extract list of studies with a given label. + @metadata.setter + def metadata(self, df): + _validate_df(df) + self.__metadata = df.sort_values(by="id") - .. versionchanged:: 0.0.10 + @property + def texts(self): + """:class:`pandas.DataFrame`: Texts in the dataset. - Fix bug in which all IDs were returned when a label wasn't present in the Dataset. + Each text type has its own column (e.g., 'abstract') and each study + has its own row. + """ + return self.__texts - .. versionchanged:: 0.0.9 + @texts.setter + def texts(self, df): + _validate_df(df) + self.__texts = df.sort_values(by="id") - Default value for label_threshold changed to 0.001. + def slice(self, ids): + """Create a new dataset with only requested IDs. Parameters ---------- - labels : :obj:`list`, optional - List of labels to use to search Dataset. If a contrast has all of - the labels above the threshold, it will be returned. - Default is None. - label_threshold : :obj:`float`, optional - Default is 0.5. + ids : array_like + List of study IDs to include in new dataset Returns ------- - found_ids : :obj:`list` - A list of IDs from the Dataset found by the search criteria. + new_dset : :obj:`~nimare.dataset.Dataset` + Reduced Dataset containing only requested studies. 
""" - if isinstance(labels, str): - labels = [labels] - elif not isinstance(labels, list): - raise ValueError(f"Argument 'labels' cannot be {type(labels)}") - - missing_labels = [label for label in labels if label not in self.annotations.columns] - if missing_labels: - raise ValueError(f"Missing label(s): {', '.join(missing_labels)}") + new_dset = copy.deepcopy(self) + new_dset._ids = ids + for attribute in ("annotations", "coordinates", "images", "metadata", "texts"): + df = getattr(new_dset, attribute) + df = df.loc[df["id"].isin(ids)] + setattr(new_dset, attribute, df) - temp_annotations = self.annotations[self._id_cols + labels] - found_rows = (temp_annotations[labels] >= label_threshold).all(axis=1) - if any(found_rows): - found_ids = temp_annotations.loc[found_rows, "id"].tolist() - else: - found_ids = [] + return new_dset - return found_ids + def merge(self, right): + """Merge two Datasets. - def get_studies_by_mask(self, mask): - """Extract list of studies with at least one coordinate in mask. + .. versionadded:: 0.0.9 Parameters ---------- - mask : img_like - Mask across which to search for coordinates. + right : :obj:`~nimare.dataset.Dataset` + Dataset to merge with. Returns ------- - found_ids : :obj:`list` - A list of IDs from the Dataset with at least one focus in the mask. + :obj:`~nimare.dataset.Dataset` + A Dataset of the two merged Datasets. """ - from scipy.spatial.distance import cdist + assert isinstance(right, Dataset) + shared_ids = np.intersect1d(self.ids, right.ids) + if shared_ids.size: + raise Exception("Duplicate IDs detected in both datasets.") - mask = load_niimg(mask) + all_ids = np.concatenate((self.ids, right.ids)) + new_dset = copy.deepcopy(self) + new_dset._ids = all_ids - dset_mask = self.masker.mask_img - if not np.array_equal(dset_mask.affine, mask.affine): - LGR.warning("Mask affine does not match Dataset affine. Assuming same space.") + for attribute in ("annotations", "coordinates", "images", "metadata", "texts"): + df1 = getattr(self, attribute) + df2 = getattr(right, attribute) + new_df = df1.append(df2, ignore_index=True, sort=False) + new_df.sort_values(by="id", inplace=True) + new_df.reset_index(drop=True, inplace=True) + new_df = new_df.where(~new_df.isna(), None) + setattr(new_dset, attribute, new_df) - dset_ijk = mm2vox(self.coordinates[["x", "y", "z"]].values, mask.affine) - mask_ijk = np.vstack(np.where(mask.get_fdata())).T - distances = cdist(mask_ijk, dset_ijk) - distances = np.any(distances == 0, axis=0) - found_ids = list(self.coordinates.loc[distances, "id"].unique()) - return found_ids + new_dset.coordinates = _transform_coordinates_to_space( + new_dset.coordinates, + self.masker, + self.space, + ) - def get_studies_by_coordinate(self, xyz, r=20): - """Extract list of studies with at least one focus within radius of requested coordinates. + return new_dset + + def update_path(self, new_path): + """Update paths to images. + + Prepends new path to the relative path for files in Dataset.images. Parameters ---------- - xyz : (X x 3) array_like - List of coordinates against which to find studies. - r : :obj:`float`, optional - Radius (in mm) within which to find studies. Default is 20mm. - - Returns - ------- - found_ids : :obj:`list` - A list of IDs from the Dataset with at least one focus within - radius r of requested coordinates. + new_path : :obj:`str` + Path to prepend to relative paths of files in Dataset.images. 
""" - from scipy.spatial.distance import cdist + self.basepath = op.abspath(new_path) + df = self.images + relative_path_cols = [c for c in df if c.endswith("__relative")] + for col in relative_path_cols: + abs_col = col.replace("__relative", "") + if abs_col in df.columns: + LGR.info(f"Overwriting images column {abs_col}") + df[abs_col] = df[col].apply(_try_prepend, prefix=self.basepath) + self.images = df - xyz = np.array(xyz) - assert xyz.shape[1] == 3 and xyz.ndim == 2 - distances = cdist(xyz, self.coordinates[["x", "y", "z"]].values) - distances = np.any(distances <= r, axis=0) - found_ids = list(self.coordinates.loc[distances, "id"].unique()) - return found_ids + def copy(self): + """Create a copy of the Dataset.""" + return copy.deepcopy(self) From 1a9c1a6f28c8744429f61156991266283e05dd32 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Mon, 25 Apr 2022 11:07:29 -0400 Subject: [PATCH 02/18] Incorporate new searcher into Estimator. --- nimare/base.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/nimare/base.py b/nimare/base.py index 63ab94c80..1bcb88959 100644 --- a/nimare/base.py +++ b/nimare/base.py @@ -13,8 +13,9 @@ from nilearn._utils.niimg_conversions import _check_same_fov from nilearn.image import concat_imgs, resample_to_img -from .results import MetaResult -from .utils import get_masker, mm2vox +from nimare.dataset import DatasetSearcher +from nimare.results import MetaResult +from nimare.utils import get_masker, mm2vox LGR = logging.getLogger(__name__) @@ -259,7 +260,8 @@ def _validate_input(self, dataset, drop_invalid=True): ) if self._required_inputs: - data = dataset.get(self._required_inputs, drop_invalid=drop_invalid) + searcher = DatasetSearcher() + data = searcher.get(dataset, self._required_inputs, drop_invalid=drop_invalid) # Do not overwrite existing inputs_ attribute. # This is necessary for PairwiseCBMAEstimator, which validates two sets of coordinates # in the same object. @@ -484,7 +486,8 @@ def _validate_input(self, dataset, drop_invalid=True): ) if self._required_inputs: - data = dataset.get(self._required_inputs, drop_invalid=drop_invalid) + searcher = DatasetSearcher() + data = searcher.get(dataset, self._required_inputs, drop_invalid=drop_invalid) # Do not overwrite existing inputs_ attribute. # This is necessary for PairwiseCBMAEstimator, which validates two sets of coordinates # in the same object. From afca5f82721a15237ef8f32f2c3dec45586cd7f3 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Mon, 25 Apr 2022 11:07:36 -0400 Subject: [PATCH 03/18] Update docstring example. --- nimare/dataset.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/nimare/dataset.py b/nimare/dataset.py index b98c20bba..07921c31e 100755 --- a/nimare/dataset.py +++ b/nimare/dataset.py @@ -51,8 +51,11 @@ def get(self, dataset, dict_, drop_invalid=True): Examples -------- - >>> dset.get({'z_maps': ('image', 'z'), 'sample_sizes': ('metadata', 'sample_sizes')}) - >>> dset.get({'coordinates': ('coordinates', None)}) + >>> searcher = DatasetSearcher() + >>> searcher.get( + >>> dset, {'z_maps': ('image', 'z'), 'sample_sizes': ('metadata', 'sample_sizes')} + >>> ) + >>> searcher.get(dset, {'coordinates': ('coordinates', None)}) """ results = {} results["id"] = dataset.ids From 32bd9dbd1e6706081bcef7eede4b4eaef3070627 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Mon, 25 Apr 2022 11:07:50 -0400 Subject: [PATCH 04/18] Update get_labels. 
--- nimare/tests/test_decode_continuous.py | 7 +++++-- nimare/tests/test_decode_discrete.py | 10 +++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/nimare/tests/test_decode_continuous.py b/nimare/tests/test_decode_continuous.py index c4a31738d..ee70239aa 100644 --- a/nimare/tests/test_decode_continuous.py +++ b/nimare/tests/test_decode_continuous.py @@ -5,6 +5,7 @@ import pandas as pd import pytest +from nimare.dataset import DatasetSearcher from nimare.decode import continuous from nimare.meta import kernel, mkda @@ -12,7 +13,8 @@ def test_CorrelationDecoder_smoke(testdata_laird): """Smoke test for continuous.CorrelationDecoder.""" testdata_laird = testdata_laird.copy() - features = testdata_laird.get_labels(ids=testdata_laird.ids[0])[:5] + searcher = DatasetSearcher() + features = searcher.get_labels(testdata_laird, ids=testdata_laird.ids[0])[:5] decoder = continuous.CorrelationDecoder(features=features) decoder.fit(testdata_laird) @@ -29,7 +31,8 @@ def test_CorrelationDistributionDecoder_smoke(testdata_laird, tmp_path_factory): tmpdir = tmp_path_factory.mktemp("test_CorrelationDistributionDecoder") testdata_laird = testdata_laird.copy() - features = testdata_laird.get_labels(ids=testdata_laird.ids[0])[:5] + searcher = DatasetSearcher() + features = searcher.get_labels(testdata_laird, ids=testdata_laird.ids[0])[:5] decoder = continuous.CorrelationDistributionDecoder(features=features) diff --git a/nimare/tests/test_decode_discrete.py b/nimare/tests/test_decode_discrete.py index 17b65c771..d607dbddc 100644 --- a/nimare/tests/test_decode_discrete.py +++ b/nimare/tests/test_decode_discrete.py @@ -5,6 +5,7 @@ import pandas as pd import pytest +from nimare.dataset import DatasetSearcher from nimare.decode import discrete @@ -38,8 +39,9 @@ def test_brainmap_decode(testdata_laird): def test_NeurosynthDecoder(testdata_laird): """Smoke test for discrete.NeurosynthDecoder.""" + searcher = DatasetSearcher() ids = testdata_laird.ids[:5] - labels = testdata_laird.get_labels(ids=testdata_laird.ids) + labels = searcher.get_labels(testdata_laird, ids=testdata_laird.ids) decoder = discrete.NeurosynthDecoder(features=labels) decoder.fit(testdata_laird) decoded_df = decoder.transform(ids=ids) @@ -65,8 +67,9 @@ def test_NeurosynthDecoder_featuregroup_failure(testdata_laird): def test_BrainMapDecoder(testdata_laird): """Smoke test for discrete.BrainMapDecoder.""" + searcher = DatasetSearcher() ids = testdata_laird.ids[:5] - labels = testdata_laird.get_labels(ids=testdata_laird.ids) + labels = searcher.get_labels(testdata_laird, ids=testdata_laird.ids) decoder = discrete.BrainMapDecoder(features=labels) decoder.fit(testdata_laird) decoded_df = decoder.transform(ids=ids) @@ -83,7 +86,8 @@ def test_BrainMapDecoder_failure(testdata_laird): def test_ROIAssociationDecoder(testdata_laird, roi_img): """Smoke test for discrete.ROIAssociationDecoder.""" - labels = testdata_laird.get_labels(ids=testdata_laird.ids) + searcher = DatasetSearcher() + labels = searcher.get_labels(testdata_laird, ids=testdata_laird.ids) decoder = discrete.ROIAssociationDecoder(masker=roi_img, features=labels) decoder.fit(testdata_laird) decoded_df = decoder.transform() From 0f11583b6e77ad99cc58c03a7a816eaf5a050fba Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Mon, 25 Apr 2022 11:10:59 -0400 Subject: [PATCH 05/18] Update get_metadata. 
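
Callers that summed sample sizes off the Dataset now go through the searcher.
A rough sketch of the updated pattern in the ALE workflow, assuming ``dset``
is a Dataset with a 'sample_sizes' metadata field:

    import numpy as np

    from nimare.dataset import DatasetSearcher

    searcher = DatasetSearcher()
    sample_sizes = searcher.get_metadata(dset, field="sample_sizes")
    n_subs = np.sum(sample_sizes)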
--- nimare/dataset.py | 4 ++-- nimare/utils.py | 10 ++++++---- nimare/workflows/ale.py | 8 +++++--- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/nimare/dataset.py b/nimare/dataset.py index 07921c31e..fad67adb4 100755 --- a/nimare/dataset.py +++ b/nimare/dataset.py @@ -62,9 +62,9 @@ def get(self, dataset, dict_, drop_invalid=True): keep_idx = np.arange(len(dataset.ids), dtype=int) for k, vals in dict_.items(): if vals[0] == "image": - temp = dataset.get_images(imtype=vals[1]) + temp = self.get_images(dataset, imtype=vals[1]) elif vals[0] == "metadata": - temp = dataset.get_metadata(field=vals[1]) + temp = self.get_metadata(dataset, field=vals[1]) elif vals[0] == "coordinates": # Break DataFrame down into a list of study-specific DataFrames temp = [ diff --git a/nimare/utils.py b/nimare/utils.py index c5ae8d4e9..c04c609e7 100755 --- a/nimare/utils.py +++ b/nimare/utils.py @@ -15,8 +15,9 @@ import pandas as pd from nilearn.input_data import NiftiMasker -from . import references -from .due import due +from nimare import references +from nimare.dataset import DatasetSearcher +from nimare.due import due LGR = logging.getLogger(__name__) @@ -841,10 +842,11 @@ def _add_metadata_to_dataframe( Updated DataFrame with ``target_column`` added. """ dataframe = dataframe.copy() + searcher = DatasetSearcher() - if metadata_field in dataset.get_metadata(): + if metadata_field in searcher.get_metadata(dataset): # Collect metadata from Dataset - metadata = dataset.get_metadata(field=metadata_field, ids=dataset.ids) + metadata = searcher.get_metadata(dataset, field=metadata_field, ids=dataset.ids) metadata = [[m] for m in metadata] # Create a DataFrame with the metadata metadata = pd.DataFrame( diff --git a/nimare/workflows/ale.py b/nimare/workflows/ale.py index acc572d0a..54d56d7fe 100644 --- a/nimare/workflows/ale.py +++ b/nimare/workflows/ale.py @@ -7,6 +7,7 @@ import numpy as np from nimare.correct import FWECorrector +from nimare.dataset import DatasetSearcher from nimare.diagnostics import FocusCounter from nimare.io import convert_sleuth_to_dataset from nimare.meta import ALE, ALESubtraction @@ -26,6 +27,7 @@ def ale_sleuth_workflow( ): """Perform ALE meta-analysis from Sleuth text file.""" LGR.info("Loading coordinates...") + searcher = DatasetSearcher() if fwhm: fwhm_str = f"of {fwhm} mm" @@ -34,7 +36,7 @@ def ale_sleuth_workflow( if not sleuth_file2: dset = convert_sleuth_to_dataset(sleuth_file, target="ale_2mm") - n_subs = dset.get_metadata(field="sample_sizes") + n_subs = searcher.get_metadata(dset, field="sample_sizes") n_subs = np.sum(n_subs) boilerplate = """ @@ -115,9 +117,9 @@ def ale_sleuth_workflow( else: dset1 = convert_sleuth_to_dataset(sleuth_file, target="ale_2mm") dset2 = convert_sleuth_to_dataset(sleuth_file2, target="ale_2mm") - n_subs1 = dset1.get_metadata(field="sample_sizes") + n_subs1 = searcher.get_metadata(dset1, field="sample_sizes") n_subs1 = np.sum(n_subs1) - n_subs2 = dset2.get_metadata(field="sample_sizes") + n_subs2 = searcher.get_metadata(dset2, field="sample_sizes") n_subs2 = np.sum(n_subs2) boilerplate = """ From 6c8ef33e751038238e284515a668f6bbbc3ca0a9 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Mon, 25 Apr 2022 11:15:36 -0400 Subject: [PATCH 06/18] Update get_images. 
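
Image lookups in the examples, kernel transformers, and workflow tests follow
the same pattern. A short sketch, assuming ``dset`` is an image-based Dataset:

    from nimare.dataset import DatasetSearcher

    searcher = DatasetSearcher()
    # One entry per study; studies without a z map get None.
    z_images = searcher.get_images(dset, imtype="z")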
--- examples/01_datasets/01_plot_dataset_io.py | 15 ++++++++------ nimare/base.py | 4 +++- nimare/meta/kernel.py | 23 ++++++++++++++++------ nimare/tests/test_workflows.py | 7 +++++-- 4 files changed, 34 insertions(+), 15 deletions(-) diff --git a/examples/01_datasets/01_plot_dataset_io.py b/examples/01_datasets/01_plot_dataset_io.py index f83a214d2..9dcb9c962 100644 --- a/examples/01_datasets/01_plot_dataset_io.py +++ b/examples/01_datasets/01_plot_dataset_io.py @@ -14,7 +14,7 @@ # ----------------------------------------------------------------------------- import os -from nimare.dataset import Dataset +from nimare.dataset import Dataset, DatasetSearcher from nimare.extract import download_nidm_pain from nimare.transforms import ImageTransformer from nimare.utils import get_resource_path @@ -127,8 +127,11 @@ dset.images[["id", "varcope"]].head() ############################################################################### -# Datasets support many search methods +# The DatasetSearcher class can search Datasets # ----------------------------------------------------------------------------- +searcher = DatasetSearcher() + +############################################################################### # There are ``get_[X]`` and ``get_studies_by_[X]`` methods for a range of # possible search criteria. # The ``get_[X]`` methods allow you to search for specific metadata, while the @@ -139,7 +142,7 @@ # by default, and for every requested study if the ``ids`` argument is provided. # If a study does not have the data requested, the returned list will have # ``None`` for that study. -z_images = dset.get_images(imtype="z") +z_images = searcher.get_images(dset, imtype="z") z_images = [str(z) for z in z_images] print("\n".join(z_images)) @@ -148,16 +151,16 @@ # ````````````````````````````````````````````````````````````````````````````` z_transformer = ImageTransformer(target="z") dset = z_transformer.transform(dset) -z_images = dset.get_images(imtype="z") +z_images = searcher.get_images(dset, imtype="z") z_images = [str(z) for z in z_images] print("\n".join(z_images)) ############################################################################### -# Datasets can also search for studies matching criteria +# DatasetSearchers can also search for studies matching criteria # ----------------------------------------------------------------------------- # ``get_studies_by_[X]`` methods return a list of study identifiers matching # the criteria, such as reporting a peak coordinate near a search coordinate. 
-sel_studies = dset.get_studies_by_coordinate(xyz=[[0, 0, 0]], r=20) +sel_studies = searcher.get_studies_by_coordinate(dset, xyz=[[0, 0, 0]], r=20) print("\n".join(sel_studies)) ############################################################################### diff --git a/nimare/base.py b/nimare/base.py index 1bcb88959..e4f5e2da8 100644 --- a/nimare/base.py +++ b/nimare/base.py @@ -361,6 +361,7 @@ def __init__(self, *args, **kwargs): def _preprocess_input(self, dataset): """Preprocess inputs to the Estimator from the Dataset as needed.""" masker = self.masker or dataset.masker + searcher = DatasetSearcher() mask_img = masker.mask_img or masker.labels_img if isinstance(mask_img, str): @@ -420,7 +421,8 @@ def _preprocess_input(self, dataset): if hasattr(self, "kernel_transformer"): self.kernel_transformer._infer_names(affine=md5(mask_img.affine).hexdigest()) if self.kernel_transformer.image_type in dataset.images.columns: - files = dataset.get_images( + files = searcher.get_images( + dataset, ids=self.inputs_["id"], imtype=self.kernel_transformer.image_type, ) diff --git a/nimare/meta/kernel.py b/nimare/meta/kernel.py index 31d73e8c8..862da96e1 100644 --- a/nimare/meta/kernel.py +++ b/nimare/meta/kernel.py @@ -16,17 +16,23 @@ import pandas as pd from nilearn import image -from .. import references -from ..base import Transformer -from ..due import due -from ..utils import ( +from nimare import references +from nimare.base import Transformer +from nimare.dataset import DatasetSearcher +from nimare.due import due +from nimare.meta.utils import ( + compute_ale_ma, + compute_kda_ma, + compute_p2m_ma, + get_ale_kernel, +) +from nimare.utils import ( _add_metadata_to_dataframe, _safe_transform, mm2vox, use_memmap, vox2mm, ) -from .utils import compute_ale_ma, compute_kda_ma, compute_p2m_ma, get_ale_kernel LGR = logging.getLogger(__name__) @@ -132,6 +138,7 @@ def transform(self, dataset, masker=None, return_type="image"): # but has different affine, from original IJK. coordinates[["i", "j", "k"]] = mm2vox(coordinates[["x", "y", "z"]], mask.affine) else: + searcher = DatasetSearcher() masker = dataset.masker if not masker else masker mask = masker.mask_img coordinates = dataset.coordinates.copy() @@ -143,7 +150,11 @@ def transform(self, dataset, masker=None, return_type="image"): # Use coordinates to get IDs instead of Dataset.ids bc of possible # mismatch between full Dataset and contrasts with coordinates. if self.image_type in dataset.images.columns: - files = dataset.get_images(ids=coordinates["id"].unique(), imtype=self.image_type) + files = searcher.get_images( + dataset, + ids=coordinates["id"].unique(), + imtype=self.image_type, + ) if all(f is not None for f in files): LGR.debug("Files already exist. 
Using them.") if return_type == "array": diff --git a/nimare/tests/test_workflows.py b/nimare/tests/test_workflows.py index 526ec90ac..0fffb4c52 100644 --- a/nimare/tests/test_workflows.py +++ b/nimare/tests/test_workflows.py @@ -2,6 +2,7 @@ import os.path as op from nimare import cli, workflows +from nimare.dataset import DatasetSearcher from nimare.tests.utils import get_test_data_path @@ -127,7 +128,8 @@ def test_conperm_workflow_function_smoke(testdata_ibma, tmp_path_factory): """Run smoke test of the contrast permutation workflow as a function.""" tmpdir = tmp_path_factory.mktemp("test_conperm_workflow_function_smoke") dset = testdata_ibma - files = dset.get_images(imtype="beta") + searcher = DatasetSearcher() + files = searcher.get_images(dset, imtype="beta") mask_image = op.join(get_test_data_path(), "test_pain_dataset", "mask.nii.gz") prefix = "test" @@ -142,7 +144,8 @@ def test_conperm_workflow_cli_smoke(testdata_ibma, tmp_path_factory): """Run smoke test of the contrast permutation workflow as a CLI.""" tmpdir = tmp_path_factory.mktemp("test_conperm_workflow_cli_smoke") dset = testdata_ibma - files = dset.get_images(imtype="beta") + searcher = DatasetSearcher() + files = searcher.get_images(dset, imtype="beta") mask_image = op.join(get_test_data_path(), "test_pain_dataset", "mask.nii.gz") prefix = "test" From a4646854a1ab495f254a0d06300cf8c609b62390 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Mon, 25 Apr 2022 11:18:59 -0400 Subject: [PATCH 07/18] Update get_studies_by_mask. --- examples/02_meta-analyses/07_macm.py | 5 ++-- .../04_decoding/01_plot_discrete_decoders.py | 5 ++-- nimare/decode/continuous.py | 26 +++++++++++-------- nimare/workflows/macm.py | 5 ++-- 4 files changed, 24 insertions(+), 17 deletions(-) diff --git a/examples/02_meta-analyses/07_macm.py b/examples/02_meta-analyses/07_macm.py index ee331d363..39f9ac032 100644 --- a/examples/02_meta-analyses/07_macm.py +++ b/examples/02_meta-analyses/07_macm.py @@ -17,7 +17,7 @@ from nilearn import datasets, image, plotting from nimare.correct import FWECorrector -from nimare.dataset import Dataset +from nimare.dataset import Dataset, DatasetSearcher from nimare.meta.cbma.ale import SCALE from nimare.meta.cbma.mkda import MKDAChi2 @@ -44,7 +44,8 @@ ############################################################################### # Select studies with a reported coordinate in the ROI # ----------------------------------------------------------------------------- -roi_ids = dset.get_studies_by_mask(roi_img) +searcher = DatasetSearcher() +roi_ids = searcher.get_studies_by_mask(dset, roi_img) dset_sel = dset.slice(roi_ids) print(f"{len(roi_ids)}/{len(dset.ids)} studies report at least one coordinate in the ROI") diff --git a/examples/04_decoding/01_plot_discrete_decoders.py b/examples/04_decoding/01_plot_discrete_decoders.py index 123eec0d3..890d09a9e 100644 --- a/examples/04_decoding/01_plot_discrete_decoders.py +++ b/examples/04_decoding/01_plot_discrete_decoders.py @@ -17,7 +17,7 @@ import numpy as np from nilearn.plotting import plot_roi -from nimare.dataset import Dataset +from nimare.dataset import Dataset, DatasetSearcher from nimare.decode import discrete from nimare.utils import get_resource_path @@ -40,7 +40,8 @@ plot_roi(mask_img, draw_cross=False) # Get studies with voxels in the mask -ids = dset.get_studies_by_mask(mask_img) +searcher = DatasetSearcher() +ids = searcher.get_studies_by_mask(dset, mask_img) ############################################################################### # diff --git 
diff --git a/nimare/decode/continuous.py b/nimare/decode/continuous.py
index 457acf528..393d66c1a 100755
--- a/nimare/decode/continuous.py
+++ b/nimare/decode/continuous.py
@@ -8,14 +8,15 @@
 from nilearn.masking import apply_mask
 from tqdm.auto import tqdm
 
-from .. import references
-from ..base import Decoder
-from ..due import due
-from ..meta.cbma.base import CBMAEstimator
-from ..meta.cbma.mkda import MKDAChi2
-from ..stats import pearson
-from ..utils import _check_type, _safe_transform
-from .utils import weight_priors
+from nimare import references
+from nimare.base import Decoder
+from nimare.dataset import DatasetSearcher
+from nimare.decode.utils import weight_priors
+from nimare.due import due
+from nimare.meta.cbma.base import CBMAEstimator
+from nimare.meta.cbma.mkda import MKDAChi2
+from nimare.stats import pearson
+from nimare.utils import _check_type, _safe_transform
 
 LGR = logging.getLogger(__name__)
 
@@ -182,10 +183,12 @@ def _fit(self, dataset):
             Masked meta-analytic maps
         """
         self.masker = dataset.masker
+        searcher = DatasetSearcher()
 
         n_features = len(self.features_)
         for i_feature, feature in enumerate(tqdm(self.features_, total=n_features)):
-            feature_ids = dataset.get_studies_by_label(
+            feature_ids = searcher.get_studies_by_label(
+                dataset,
                 labels=[feature],
                 label_threshold=self.frequency_threshold,
             )
@@ -292,11 +295,12 @@ def _fit(self, dataset):
             Masked meta-analytic maps
         """
         self.masker = dataset.masker
+        searcher = DatasetSearcher()
 
         images_ = {}
         for feature in self.features_:
-            feature_ids = dataset.get_studies_by_label(
-                labels=[feature], label_threshold=self.frequency_threshold
+            feature_ids = searcher.get_studies_by_label(
+                dataset, labels=[feature], label_threshold=self.frequency_threshold
             )
             selected_ids = sorted(list(set(feature_ids).intersection(self.inputs_["id"])))
             selected_id_idx = [
diff --git a/nimare/workflows/macm.py b/nimare/workflows/macm.py
index 6f368c037..1c9fb2213 100644
--- a/nimare/workflows/macm.py
+++ b/nimare/workflows/macm.py
@@ -5,7 +5,7 @@
 from shutil import copyfile
 
 from ..correct import FWECorrector
-from ..dataset import Dataset
+from ..dataset import Dataset, DatasetSearcher
 from ..meta import ALE
 
 LGR = logging.getLogger(__name__)
@@ -17,7 +17,8 @@ def macm_workflow(
     """Perform MACM with ALE algorithm."""
     LGR.info("Loading coordinates...")
     dset = Dataset(dataset_file)
-    sel_ids = dset.get_studies_by_mask(mask_file)
+    searcher = DatasetSearcher()
+    sel_ids = searcher.get_studies_by_mask(dset, mask_file)
     sel_dset = dset.slice(sel_ids)
 
     # override sample size
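
The decoders updated above select studies by annotation label in the same dataset-first style. A rough sketch for a single label (the label name is hypothetical, and the threshold value is an assumption; in practice the decoder passes its own frequency_threshold):

    searcher = DatasetSearcher()
    feature_ids = searcher.get_studies_by_label(
        dset,
        labels=["Neurosynth_TFIDF__pain"],  # hypothetical annotation label
        label_threshold=0.001,  # assumed cutoff for counting a study as "labeled"
    )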
From 396a43a1e01b99346f13c6ad361aceb04b77d99e Mon Sep 17 00:00:00 2001
From: Taylor Salo
Date: Mon, 25 Apr 2022 12:03:58 -0400
Subject: [PATCH 08/18] Add test. Still need to deal with circular imports.

---
 nimare/base.py               |  7 ++++++-
 nimare/tests/test_dataset.py | 34 +++++++++++++++++++---------------
 2 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/nimare/base.py b/nimare/base.py
index e4f5e2da8..59f01c231 100644
--- a/nimare/base.py
+++ b/nimare/base.py
@@ -13,7 +13,6 @@
 from nilearn._utils.niimg_conversions import _check_same_fov
 from nilearn.image import concat_imgs, resample_to_img
 
-from nimare.dataset import DatasetSearcher
 from nimare.results import MetaResult
 from nimare.utils import get_masker, mm2vox
 
@@ -254,6 +253,8 @@ class Estimator(NiMAREBase):
     def _validate_input(self, dataset, drop_invalid=True):
         """Search for, and validate, required inputs as necessary."""
+        from nimare.dataset import DatasetSearcher
+
         if not hasattr(dataset, "slice"):
             raise ValueError(
                 f"Argument 'dataset' must be a valid Dataset object, not a {type(dataset)}."
@@ -360,6 +361,8 @@ def __init__(self, *args, **kwargs):
     def _preprocess_input(self, dataset):
         """Preprocess inputs to the Estimator from the Dataset as needed."""
+        from nimare.dataset import DatasetSearcher
+
         masker = self.masker or dataset.masker
         searcher = DatasetSearcher()
 
@@ -482,6 +485,8 @@ class Decoder(NiMAREBase):
     def _validate_input(self, dataset, drop_invalid=True):
         """Search for, and validate, required inputs as necessary."""
+        from nimare.dataset import DatasetSearcher
+
         if not hasattr(dataset, "slice"):
             raise ValueError(
                 f"Argument 'dataset' must be a valid Dataset object, not a {type(dataset)}."
diff --git a/nimare/tests/test_dataset.py b/nimare/tests/test_dataset.py
index 65bc26238..cbb5864d8 100644
--- a/nimare/tests/test_dataset.py
+++ b/nimare/tests/test_dataset.py
@@ -10,6 +10,25 @@
 from nimare.tests.utils import get_test_data_path
 
 
+def test_DatasetSearcher(testdata_laird):
+    dset = testdata_laird.copy()
+    searcher = dataset.DatasetSearcher
+    METHODS = [searcher.get_images, searcher.get_labels, searcher.get_metadata, searcher.get_texts]
+    for method in METHODS:
+        assert isinstance(method(dset), list)
+        assert isinstance(method(dset, ids=dset.ids[:5]), list)
+        assert isinstance(method(dset, ids=dset.ids[0]), list)
+
+    assert isinstance(searcher.get_images(dset, imtype="beta"), list)
+    assert isinstance(searcher.get_metadata(dset, field="sample_sizes"), list)
+    assert isinstance(searcher.get_studies_by_label(dset, "cogat_cognitive_control"), list)
+    assert isinstance(searcher.get_studies_by_coordinate(dset, np.array([[20, 20, 20]])), list)
+
+    # If label is not available, raise ValueError
+    with pytest.raises(ValueError):
+        searcher.get_studies_by_label(dset, "dog")
+
+
 def test_dataset_smoke():
     """Smoke test for nimare.dataset.Dataset initialization and get methods."""
     db_file = op.join(get_test_data_path(), "neurosynth_dset.json")
@@ -19,21 +38,6 @@ def test_dataset_smoke():
     # Test that Dataset.masker is portable
     assert not nib.is_proxy(dset.masker.mask_img_.dataobj)
 
-    methods = [dset.get_images, dset.get_labels, dset.get_metadata, dset.get_texts]
-    for method in methods:
-        assert isinstance(method(), list)
-        assert isinstance(method(ids=dset.ids[:5]), list)
-        assert isinstance(method(ids=dset.ids[0]), list)
-
-    assert isinstance(dset.get_images(imtype="beta"), list)
-    assert isinstance(dset.get_metadata(field="sample_sizes"), list)
-    assert isinstance(dset.get_studies_by_label("cogat_cognitive_control"), list)
-    assert isinstance(dset.get_studies_by_coordinate(np.array([[20, 20, 20]])), list)
-
-    # If label is not available, raise ValueError
-    with pytest.raises(ValueError):
-        dset.get_studies_by_label("dog")
-
     mask_data = np.zeros(dset.masker.mask_img.shape, int)
     mask_data[40, 40, 40] = 1
     mask_img = nib.Nifti1Image(mask_data, dset.masker.mask_img.affine)
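
The circular import noted in the commit message arises because nimare.dataset already imports from nimare.base (DatasetSearcher inherits from NiMAREBase), so nimare.base cannot import nimare.dataset at module level. The patch defers the import to call time, a standard way to break such cycles in Python; a sketch of the pattern:

    def _preprocess_input(self, dataset):
        """Preprocess inputs to the Estimator from the Dataset as needed."""
        # Deferred import: breaks the nimare.base <-> nimare.dataset cycle,
        # at the cost of a (cached) import lookup on each call.
        from nimare.dataset import DatasetSearcher

        searcher = DatasetSearcher()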
     mask_data = np.zeros(dset.masker.mask_img.shape, int)
     mask_data[40, 40, 40] = 1
     mask_img = nib.Nifti1Image(mask_data, dset.masker.mask_img.affine)

From c81785d5f2d49c1ccaf7fc0e89a5370b63e586e4 Mon Sep 17 00:00:00 2001
From: Taylor Salo
Date: Thu, 28 Apr 2022 12:41:19 -0400
Subject: [PATCH 09/18] Fix name.

---
 nimare/dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nimare/dataset.py b/nimare/dataset.py
index 2240caa17..cbb0855be 100755
--- a/nimare/dataset.py
+++ b/nimare/dataset.py
@@ -26,7 +26,7 @@
 LGR = logging.getLogger(__name__)
 
 
-class DatasetSeacher(NiMAREBase):
+class DatasetSearcher(NiMAREBase):
     """A tool for searching Datasets."""
 
     def get(self, dataset, dict_, drop_invalid=True):

From 09bf9f3a0015e4563eb7d3c81cbff513545d0bfc Mon Sep 17 00:00:00 2001
From: Taylor Salo
Date: Thu, 28 Apr 2022 12:48:50 -0400
Subject: [PATCH 10/18] Move some functions around.

---
 nimare/annotate/cogat.py |  2 +-
 nimare/extract/utils.py  | 30 ++++++++++++-
 nimare/meta/cbma/base.py | 15 +++----
 nimare/meta/kernel.py    |  9 +---
 nimare/meta/utils.py     | 71 +++++++++++++++++++++++++++++
 nimare/utils.py          | 97 ----------------------------------------
 6 files changed, 109 insertions(+), 115 deletions(-)

diff --git a/nimare/annotate/cogat.py b/nimare/annotate/cogat.py
index a6264598a..6c6365a99 100755
--- a/nimare/annotate/cogat.py
+++ b/nimare/annotate/cogat.py
@@ -9,7 +9,7 @@
 from nimare.annotate import utils
 from nimare.due import due
 from nimare.extract import download_cognitive_atlas
-from nimare.utils import _uk_to_us
+from nimare.extract.utils import _uk_to_us
 
 LGR = logging.getLogger(__name__)
 
diff --git a/nimare/extract/utils.py b/nimare/extract/utils.py
index 710dccafe..5341fa6be 100644
--- a/nimare/extract/utils.py
+++ b/nimare/extract/utils.py
@@ -4,13 +4,14 @@
 import logging
 import os
 import os.path as op
+import re
 
 import numpy as np
 import pandas as pd
 import requests
 from fuzzywuzzy import fuzz
 
-from nimare.utils import _uk_to_us
+from nimare.utils import get_resource_path
 
 LGR = logging.getLogger(__name__)
 
@@ -292,3 +293,30 @@ def _expand_df(df):
     df["ratio"] = df[["alias", "name"]].apply(_get_ratio, axis=1)
     df = df.sort_values(by=["length", "ratio"], ascending=[False, False])
     return df
+
+
+def _uk_to_us(text):
+    """Convert UK spellings to US based on a converter.
+
+    .. versionadded:: 0.0.2
+
+    Parameters
+    ----------
+    text : :obj:`str`
+
+    Returns
+    -------
+    text : :obj:`str`
+
+    Notes
+    -----
+    The english_spellings.csv file is from http://www.tysto.com/uk-us-spelling-list.html.
+    """
+    SPELL_DF = pd.read_csv(op.join(get_resource_path(), "english_spellings.csv"), index_col="UK")
+    SPELL_DICT = SPELL_DF["US"].to_dict()
+
+    if isinstance(text, str):
+        # Convert British to American English
+        pattern = re.compile(r"\b(" + "|".join(SPELL_DICT.keys()) + r")\b")
+        text = pattern.sub(lambda x: SPELL_DICT[x.group()], text)
+    return text
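
_uk_to_us is a private helper, but its behavior is easy to demonstrate. A quick sketch (the exact output depends on the pairs in the bundled english_spellings.csv, so the result shown is an assumption):

    from nimare.extract.utils import _uk_to_us

    text = _uk_to_us("The colour of behaviour")
    print(text)  # expected: "The color of behavior"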
+ """ + SPELL_DF = pd.read_csv(op.join(get_resource_path(), "english_spellings.csv"), index_col="UK") + SPELL_DICT = SPELL_DF["US"].to_dict() + + if isinstance(text, str): + # Convert British to American English + pattern = re.compile(r"\b(" + "|".join(SPELL_DICT.keys()) + r")\b") + text = pattern.sub(lambda x: SPELL_DICT[x.group()], text) + return text diff --git a/nimare/meta/cbma/base.py b/nimare/meta/cbma/base.py index 7b8665949..0d4b5077f 100644 --- a/nimare/meta/cbma/base.py +++ b/nimare/meta/cbma/base.py @@ -10,18 +10,15 @@ from nimare.base import MetaEstimator from nimare.meta.kernel import KernelTransformer -from nimare.meta.utils import _calculate_cluster_measures, _get_last_bin +from nimare.meta.utils import ( + _add_metadata_to_dataframe, + _calculate_cluster_measures, + _get_last_bin, +) from nimare.results import MetaResult from nimare.stats import null_to_p, nullhist_to_p from nimare.transforms import p_to_z -from nimare.utils import ( - _add_metadata_to_dataframe, - _check_type, - _safe_transform, - tqdm_joblib, - use_memmap, - vox2mm, -) +from nimare.utils import _check_type, _safe_transform, tqdm_joblib, use_memmap, vox2mm LGR = logging.getLogger(__name__) diff --git a/nimare/meta/kernel.py b/nimare/meta/kernel.py index 862da96e1..57e49d948 100644 --- a/nimare/meta/kernel.py +++ b/nimare/meta/kernel.py @@ -21,18 +21,13 @@ from nimare.dataset import DatasetSearcher from nimare.due import due from nimare.meta.utils import ( + _add_metadata_to_dataframe, compute_ale_ma, compute_kda_ma, compute_p2m_ma, get_ale_kernel, ) -from nimare.utils import ( - _add_metadata_to_dataframe, - _safe_transform, - mm2vox, - use_memmap, - vox2mm, -) +from nimare.utils import _safe_transform, mm2vox, use_memmap, vox2mm LGR = logging.getLogger(__name__) diff --git a/nimare/meta/utils.py b/nimare/meta/utils.py index 38f54283a..d04fbf16c 100755 --- a/nimare/meta/utils.py +++ b/nimare/meta/utils.py @@ -5,9 +5,11 @@ import nibabel as nib import numpy as np import numpy.linalg as npl +import pandas as pd from scipy import ndimage from nimare import references +from nimare.dataset import DatasetSearcher from nimare.due import due from nimare.extract import download_peaks2maps_model from nimare.utils import _determine_chunk_size @@ -16,6 +18,75 @@ LGR = logging.getLogger(__name__) +def _add_metadata_to_dataframe( + dataset, + dataframe, + metadata_field, + target_column, + filter_func=np.mean, +): + """Add metadata from a Dataset to a DataFrame. + + .. versionadded:: 0.0.8 + + This is particularly useful for kernel transformers or estimators where a given metadata field + is necessary (e.g., ALEKernel with "sample_size"), but we want to just use the coordinates + DataFrame instead of passing the full Dataset. + + Parameters + ---------- + dataset : :obj:`~nimare.dataset.Dataset` + Dataset containing study IDs and metadata to feed into dataframe. + dataframe : :obj:`pandas.DataFrame` + DataFrame containing study IDs, into which Dataset metadata will be merged. + metadata_field : :obj:`str` + Metadata field in ``dataset``. + target_column : :obj:`str` + Name of the column that will be added to ``dataframe``, containing information from the + Dataset. + filter_func : :obj:`function`, optional + Function to apply to the metadata so that it fits as a column in a DataFrame. + Default is ``numpy.mean``. + + Returns + ------- + dataframe : :obj:`pandas.DataFrame` + Updated DataFrame with ``target_column`` added. 
+ """ + dataframe = dataframe.copy() + searcher = DatasetSearcher() + + if metadata_field in searcher.get_metadata(dataset): + # Collect metadata from Dataset + metadata = searcher.get_metadata(dataset, field=metadata_field, ids=dataset.ids) + metadata = [[m] for m in metadata] + # Create a DataFrame with the metadata + metadata = pd.DataFrame( + index=dataset.ids, + data=metadata, + columns=[metadata_field], + ) + # Reduce the metadata (if in list/array format) to single values + metadata[target_column] = metadata[metadata_field].apply(filter_func) + # Merge metadata df into coordinates df + dataframe = dataframe.merge( + right=metadata, + left_on="id", + right_index=True, + sort=False, + validate="many_to_one", + suffixes=(False, False), + how="left", + ) + else: + LGR.warning( + f"Metadata field '{metadata_field}' not found. " + "Set a constant value for this field as an argument, if possible." + ) + + return dataframe + + def model_fn(features, labels, mode, params): """Run model function used internally by peaks2maps. diff --git a/nimare/utils.py b/nimare/utils.py index c04c609e7..cdf3149eb 100755 --- a/nimare/utils.py +++ b/nimare/utils.py @@ -16,7 +16,6 @@ from nilearn.input_data import NiftiMasker from nimare import references -from nimare.dataset import DatasetSearcher from nimare.due import due LGR = logging.getLogger(__name__) @@ -625,33 +624,6 @@ def _find_stem(arr): return res -def _uk_to_us(text): - """Convert UK spellings to US based on a converter. - - .. versionadded:: 0.0.2 - - Parameters - ---------- - text : :obj:`str` - - Returns - ------- - text : :obj:`str` - - Notes - ----- - The english_spellings.csv file is from http://www.tysto.com/uk-us-spelling-list.html. - """ - SPELL_DF = pd.read_csv(op.join(get_resource_path(), "english_spellings.csv"), index_col="UK") - SPELL_DICT = SPELL_DF["US"].to_dict() - - if isinstance(text, str): - # Convert British to American English - pattern = re.compile(r"\b(" + "|".join(SPELL_DICT.keys()) + r")\b") - text = pattern.sub(lambda x: SPELL_DICT[x.group()], text) - return text - - def use_memmap(logger, n_files=1): """Memory-map array to a file, and perform cleanup after. @@ -806,75 +778,6 @@ def _safe_transform(imgs, masker, memory_limit="1gb", dtype="auto", memfile=None return masked_data -def _add_metadata_to_dataframe( - dataset, - dataframe, - metadata_field, - target_column, - filter_func=np.mean, -): - """Add metadata from a Dataset to a DataFrame. - - .. versionadded:: 0.0.8 - - This is particularly useful for kernel transformers or estimators where a given metadata field - is necessary (e.g., ALEKernel with "sample_size"), but we want to just use the coordinates - DataFrame instead of passing the full Dataset. - - Parameters - ---------- - dataset : :obj:`~nimare.dataset.Dataset` - Dataset containing study IDs and metadata to feed into dataframe. - dataframe : :obj:`pandas.DataFrame` - DataFrame containing study IDs, into which Dataset metadata will be merged. - metadata_field : :obj:`str` - Metadata field in ``dataset``. - target_column : :obj:`str` - Name of the column that will be added to ``dataframe``, containing information from the - Dataset. - filter_func : :obj:`function`, optional - Function to apply to the metadata so that it fits as a column in a DataFrame. - Default is ``numpy.mean``. - - Returns - ------- - dataframe : :obj:`pandas.DataFrame` - Updated DataFrame with ``target_column`` added. 
- """ - dataframe = dataframe.copy() - searcher = DatasetSearcher() - - if metadata_field in searcher.get_metadata(dataset): - # Collect metadata from Dataset - metadata = searcher.get_metadata(dataset, field=metadata_field, ids=dataset.ids) - metadata = [[m] for m in metadata] - # Create a DataFrame with the metadata - metadata = pd.DataFrame( - index=dataset.ids, - data=metadata, - columns=[metadata_field], - ) - # Reduce the metadata (if in list/array format) to single values - metadata[target_column] = metadata[metadata_field].apply(filter_func) - # Merge metadata df into coordinates df - dataframe = dataframe.merge( - right=metadata, - left_on="id", - right_index=True, - sort=False, - validate="many_to_one", - suffixes=(False, False), - how="left", - ) - else: - LGR.warning( - f"Metadata field '{metadata_field}' not found. " - "Set a constant value for this field as an argument, if possible." - ) - - return dataframe - - def _check_type(obj, clss, **kwargs): """Check variable type and initialize if necessary. From 841cadf1d1837c20d7ef51c3c57bcf5334849dba Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Thu, 28 Apr 2022 12:50:00 -0400 Subject: [PATCH 11/18] Update test_dataset.py --- nimare/tests/test_dataset.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/nimare/tests/test_dataset.py b/nimare/tests/test_dataset.py index cbb5864d8..c8464c282 100644 --- a/nimare/tests/test_dataset.py +++ b/nimare/tests/test_dataset.py @@ -24,6 +24,11 @@ def test_DatasetSearcher(testdata_laird): assert isinstance(searcher.get_studies_by_label(dset, "cogat_cognitive_control"), list) assert isinstance(searcher.get_studies_by_coordinate(dset, np.array([[20, 20, 20]])), list) + mask_data = np.zeros(dset.masker.mask_img.shape, int) + mask_data[40, 40, 40] = 1 + mask_img = nib.Nifti1Image(mask_data, dset.masker.mask_img.affine) + assert isinstance(dset.get_studies_by_mask(mask_img), list) + # If label is not available, raise ValueError with pytest.raises(ValueError): searcher.get_studies_by_label(dset, "dog") @@ -38,11 +43,6 @@ def test_dataset_smoke(): # Test that Dataset.masker is portable assert not nib.is_proxy(dset.masker.mask_img_.dataobj) - mask_data = np.zeros(dset.masker.mask_img.shape, int) - mask_data[40, 40, 40] = 1 - mask_img = nib.Nifti1Image(mask_data, dset.masker.mask_img.affine) - assert isinstance(dset.get_studies_by_mask(mask_img), list) - dset1 = dset.slice(dset.ids[:5]) dset2 = dset.slice(dset.ids[5:]) assert isinstance(dset1, dataset.Dataset) From 96ada78665b0058176d9c8e2338138517fbb10bc Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Thu, 28 Apr 2022 12:58:27 -0400 Subject: [PATCH 12/18] Update test_dataset.py --- nimare/tests/test_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nimare/tests/test_dataset.py b/nimare/tests/test_dataset.py index c8464c282..c753dceab 100644 --- a/nimare/tests/test_dataset.py +++ b/nimare/tests/test_dataset.py @@ -11,6 +11,7 @@ def test_DatasetSearcher(testdata_laird): + """Test the DatasetSearcher class.""" dset = testdata_laird.copy() searcher = dataset.DatasetSearcher METHODS = [searcher.get_images, searcher.get_labels, searcher.get_metadata, searcher.get_texts] From f0cca2dbc2cc8da91deb36a05e5d95e6996537d1 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Thu, 28 Apr 2022 13:20:06 -0400 Subject: [PATCH 13/18] Fix things. 
From f0cca2dbc2cc8da91deb36a05e5d95e6996537d1 Mon Sep 17 00:00:00 2001
From: Taylor Salo
Date: Thu, 28 Apr 2022 13:20:06 -0400
Subject: [PATCH 13/18] Fix things.

---
 nimare/dataset.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/nimare/dataset.py b/nimare/dataset.py
index cbb0855be..52e74d289 100755
--- a/nimare/dataset.py
+++ b/nimare/dataset.py
@@ -89,11 +89,11 @@ def get(self, dataset, dict_, drop_invalid=True):
             keep_idx = np.intersect1d(keep_idx, temp_keep_idx)
 
         # reduce
-        if drop_invalid and (len(keep_idx) != len(self.ids)):
-            LGR.info(f"Retaining {len(keep_idx)}/{len(self.ids)} studies")
-        elif len(keep_idx) != len(self.ids):
+        if drop_invalid and (len(keep_idx) != len(dataset.ids)):
+            LGR.info(f"Retaining {len(keep_idx)}/{len(dataset.ids)} studies")
+        elif len(keep_idx) != len(dataset.ids):
             raise Exception(
-                f"Only {len(keep_idx)}/{len(self.ids)} in Dataset contain the necessary data. "
+                f"Only {len(keep_idx)}/{len(dataset.ids)} in Dataset contain the necessary data. "
                 "If you want to analyze the subset of studies with required data, "
                 "set `drop_invalid` to True."
             )
@@ -226,7 +226,7 @@ def get_metadata(self, dataset, ids=None, field=None):
         metadata : :obj:`list`
             List of values of requested type for selected IDs.
         """
-        result = dataset._generic_column_getter(dataset, "metadata", ids=ids, column=field)
+        result = self._generic_column_getter(dataset, "metadata", ids=ids, column=field)
         return result
 
     def get_images(self, dataset, ids=None, imtype=None):
@@ -357,6 +357,10 @@ def get_studies_by_coordinate(self, dataset, xyz, r=20):
 class Dataset(NiMAREBase):
     """Storage container for a coordinate- and/or image-based meta-analytic dataset/database.
 
+    .. versionchanged:: 0.0.12
+
+        All search methods have been moved out of Dataset and into DatasetSearcher.
+
     .. versionchanged:: 0.0.9
 
         * [ENH] Add merge method to Dataset class
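
With these fixes, DatasetSearcher.get counts and validates studies against the Dataset being searched rather than against the searcher itself. A small usage sketch (the key/tuple format follows the required-inputs convention used elsewhere in this series):

    searcher = DatasetSearcher()
    inputs = searcher.get(
        dset,
        {"coordinates": ("coordinates", None)},
        drop_invalid=True,  # keep only studies that have the requested data
    )
    coordinates = inputs["coordinates"]  # concatenated DataFrame for the retained studies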
From 40e470f897c4625df7ec1b13efb3b1e60ff0b8e1 Mon Sep 17 00:00:00 2001
From: Taylor Salo
Date: Thu, 28 Apr 2022 13:27:20 -0400
Subject: [PATCH 14/18] Fix more.

---
 nimare/tests/test_dataset.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/nimare/tests/test_dataset.py b/nimare/tests/test_dataset.py
index c753dceab..b102dd659 100644
--- a/nimare/tests/test_dataset.py
+++ b/nimare/tests/test_dataset.py
@@ -13,22 +13,25 @@ def test_DatasetSearcher(testdata_laird):
     """Test the DatasetSearcher class."""
     dset = testdata_laird.copy()
-    searcher = dataset.DatasetSearcher
+    searcher = dataset.DatasetSearcher()
     METHODS = [searcher.get_images, searcher.get_labels, searcher.get_metadata, searcher.get_texts]
     for method in METHODS:
         assert isinstance(method(dset), list)
         assert isinstance(method(dset, ids=dset.ids[:5]), list)
         assert isinstance(method(dset, ids=dset.ids[0]), list)
 
-    assert isinstance(searcher.get_images(dset, imtype="beta"), list)
-    assert isinstance(searcher.get_metadata(dset, field="sample_sizes"), list)
-    assert isinstance(searcher.get_studies_by_label(dset, "cogat_cognitive_control"), list)
+    # This test dataset has no images
+    with pytest.raises(ValueError):
+        searcher.get_images(dset, imtype="beta")
+
+    assert isinstance(searcher.get_metadata(dset, field="journal"), list)
+    assert isinstance(searcher.get_studies_by_label(dset, "Neurosynth_TFIDF__analyze"), list)
     assert isinstance(searcher.get_studies_by_coordinate(dset, np.array([[20, 20, 20]])), list)
 
     mask_data = np.zeros(dset.masker.mask_img.shape, int)
     mask_data[40, 40, 40] = 1
     mask_img = nib.Nifti1Image(mask_data, dset.masker.mask_img.affine)
-    assert isinstance(dset.get_studies_by_mask(mask_img), list)
+    assert isinstance(searcher.get_studies_by_mask(dset, mask=mask_img), list)
 
     # If label is not available, raise ValueError
     with pytest.raises(ValueError):

From 560dad94d72d2d9793da9ef4b81d375a0f06d7d6 Mon Sep 17 00:00:00 2001
From: Taylor Salo
Date: Thu, 5 May 2022 14:23:05 -0400
Subject: [PATCH 15/18] Re-import.

---
 nimare/base.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/nimare/base.py b/nimare/base.py
index 1143199cc..b6b244a51 100644
--- a/nimare/base.py
+++ b/nimare/base.py
@@ -260,6 +260,8 @@ def _collect_inputs(self, dataset, drop_invalid=True):
             )
 
         if self._required_inputs:
+            from nimare.dataset import DatasetSearcher
+
             searcher = DatasetSearcher()
             data = searcher.get(dataset, self._required_inputs, drop_invalid=drop_invalid)
             # Do not overwrite existing inputs_ attribute.

From 963656227df148f7cc695c5331d66d4a4debb19a Mon Sep 17 00:00:00 2001
From: Taylor Salo
Date: Thu, 5 May 2022 14:36:40 -0400
Subject: [PATCH 16/18] Update base.py

---
 nimare/meta/cbma/base.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/nimare/meta/cbma/base.py b/nimare/meta/cbma/base.py
index 16f92d4b3..6bf957f36 100644
--- a/nimare/meta/cbma/base.py
+++ b/nimare/meta/cbma/base.py
@@ -21,7 +21,6 @@
 from nimare.stats import null_to_p, nullhist_to_p
 from nimare.transforms import p_to_z
 from nimare.utils import (
-    _add_metadata_to_dataframe,
     _check_ncores,
     _check_type,
     _safe_transform,
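
Patch 15's deferred import lives in Estimator._collect_inputs, which hands the class's _required_inputs specification to the searcher. A sketch of how a subclass might declare what it needs (the class and its dictionary are illustrative, not taken from the diff):

    class MyCoordinateEstimator(Estimator):
        # Hypothetical subclass requesting the coordinates DataFrame for every study
        _required_inputs = {"coordinates": ("coordinates", None)}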
From 6712d57e0199d3f89a17da0a36fa9d2fb0d747bd Mon Sep 17 00:00:00 2001
From: Taylor Salo
Date: Thu, 5 May 2022 15:41:48 -0400
Subject: [PATCH 17/18] Fix.

---
 nimare/decode/base.py    | 4 +++-
 nimare/meta/cbma/base.py | 5 ++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/nimare/decode/base.py b/nimare/decode/base.py
index 7c401ff38..4f2ada899 100644
--- a/nimare/decode/base.py
+++ b/nimare/decode/base.py
@@ -3,6 +3,7 @@
 from abc import abstractmethod
 
 from nimare.base import NiMAREBase
+from nimare.dataset import DatasetSearcher
 
 LGR = logging.getLogger(__name__)
 
@@ -28,7 +29,8 @@ def _collect_inputs(self, dataset, drop_invalid=True):
             )
 
         if self._required_inputs:
-            data = dataset.get(self._required_inputs, drop_invalid=drop_invalid)
+            searcher = DatasetSearcher()
+            data = searcher.get(dataset, self._required_inputs, drop_invalid=drop_invalid)
             # Do not overwrite existing inputs_ attribute.
             # This is necessary for PairwiseCBMAEstimator, which validates two sets of coordinates
             # in the same object.
diff --git a/nimare/meta/cbma/base.py b/nimare/meta/cbma/base.py
index 6bf957f36..9b478db48 100644
--- a/nimare/meta/cbma/base.py
+++ b/nimare/meta/cbma/base.py
@@ -11,6 +11,7 @@
 from tqdm.auto import tqdm
 
 from nimare.base import Estimator
+from nimare.dataset import DatasetSearcher
 from nimare.meta.kernel import KernelTransformer
 from nimare.meta.utils import (
     _add_metadata_to_dataframe,
@@ -105,6 +106,7 @@ def _preprocess_input(self, dataset):
         (2) IJK coordinates will be added based on the mask image's affine, and
         (3) sample sizes may be added to the "coordinates" key, as needed.
         """
+        searcher = DatasetSearcher()
         masker = self.masker or dataset.masker
 
         mask_img = masker.mask_img or masker.labels_img
@@ -117,7 +119,8 @@ def _preprocess_input(self, dataset):
         if hasattr(self, "kernel_transformer"):
             self.kernel_transformer._infer_names(affine=md5(mask_img.affine).hexdigest())
             if self.kernel_transformer.image_type in dataset.images.columns:
-                files = dataset.get_images(
+                files = searcher.get_images(
+                    dataset,
                     ids=self.inputs_["id"],
                     imtype=self.kernel_transformer.image_type,
                 )

From bc67dbc3cc770e6a56a2db6e0771e06e97270362 Mon Sep 17 00:00:00 2001
From: Taylor Salo
Date: Tue, 31 May 2022 11:56:58 -0400
Subject: [PATCH 18/18] Update api.rst

---
 docs/api.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/api.rst b/docs/api.rst
index ad6c6c1d8..71bf7414a 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -17,6 +17,7 @@ API
    :template: class.rst
 
    dataset.Dataset
+   dataset.DatasetSearcher
 
 .. _api_meta_ref:
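
Taken together, the series leaves Dataset as a storage container and routes every query through the DatasetSearcher now exposed in the API reference. An end-to-end sketch of the resulting workflow (the file name and search values are placeholders):

    from nimare.dataset import Dataset, DatasetSearcher

    dset = Dataset("my_dataset.json")  # hypothetical NiMARE dataset file
    searcher = DatasetSearcher()

    # Search by coordinate, then restrict analysis to the matching studies
    ids = searcher.get_studies_by_coordinate(dset, xyz=[[0, 0, 0]], r=20)
    sub_dset = dset.slice(ids)

    # Metadata lookups follow the same dataset-first convention
    sample_sizes = searcher.get_metadata(sub_dset, field="sample_sizes")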