diff --git a/src/pmotools/pmo_engine/pmo_processor.py b/src/pmotools/pmo_engine/pmo_processor.py
index 44288e6..fe73076 100644
--- a/src/pmotools/pmo_engine/pmo_processor.py
+++ b/src/pmotools/pmo_engine/pmo_processor.py
@@ -1481,3 +1481,131 @@ def extract_panels_insert_bed_loc(
                 )
             bed_loc_out[panel_id] = bed_loc_out_per_panel
         return bed_loc_out
+
+    @staticmethod
+    def _cast_specimen_meta_value(raw_value, desired_type: type):
+        """
+        Cast a single metadata value to the type it should be stored as in the pmo
+
+        :param raw_value: the value as it comes out of the metadata table
+        :param desired_type: the type to convert to, one of str, int, float or bool
+        :return: None if raw_value is missing (NA), otherwise raw_value converted to desired_type
+        """
+        if pd.isna(raw_value):
+            return None
+        if desired_type is bool and isinstance(raw_value, str):
+            # bools often arrive as text and bool("false") would be True, so map the text explicitly
+            normalized = raw_value.strip().lower()
+            if normalized in ("true", "1", "yes"):
+                return True
+            if normalized in ("false", "0", "no"):
+                return False
+            raise ValueError(f"Cannot interpret '{raw_value}' as bool")
+        return desired_type(raw_value)
+
+    @staticmethod
+    def update_specimen_meta(
+        pmo,
+        adding_meta: pd.DataFrame,
+        specimen_name_col: str = "specimen_name",
+        replace_current_meta: bool = False,
+        meta_types: dict[str, type] | None = None,
+    ):
+        """
+        Update specimen_info with more information about specimens
+
+        Every value is cast and checked before anything is written, so when a ValueError is raised the pmo has not been changed
+
+        :param pmo: the pmo to update, this will be modified by this function so if an original copy of the PMO is desired deepcopy it before passing the pmo to change to this function
+        :param adding_meta: a pandas dataframe with additional meta information, needs to have a column with the specimen name
+        :param specimen_name_col: the column name for the specimen name, the names in this column are compared to the PMO as str
+        :param replace_current_meta: whether to replace the current meta information if it already exists
+        :param meta_types: an optional dictionary of column name to desired output type (str, int, float or bool), columns without an entry are stored as str
+        :return: the updated pmo
+        :raises ValueError: if adding_meta is empty, has no metadata columns or no specimen name column, names a specimen that is not in the PMO, a requested type is not allowed or is for a missing column, a value cannot be cast, or a field already exists and replace_current_meta is False
+        """
+        # check if the adding dataframe is empty
+        if adding_meta is None or adding_meta.empty:
+            raise ValueError(
+                "adding_meta is empty — cannot update specimen metadata from an empty table."
+            )
+
+        # check to see if the number of column is less than 2
+        if len(adding_meta.columns) <= 1:
+            raise ValueError(
+                f"adding_meta contains only {len(adding_meta.columns)} column(s). "
+                f"A valid metadata table must have at least '{specimen_name_col}' plus ≥1 metadata column."
+            )
+        # check to see if the column for the specimen_name is there
+        if specimen_name_col not in adding_meta.columns:
+            raise ValueError(
+                f"Required specimen name column '{specimen_name_col}' is missing from metadata. \n"
+                f"Columns present: {list(adding_meta.columns)}"
+            )
+        specimen_names_in_pmo = set(PMOProcessor.get_specimen_names(pmo))
+        # the names are compared as str here and the same str names are used for the index lookup below
+        adding_specimen_names = adding_meta[specimen_name_col].astype(str).tolist()
+        # check to see if provided new metadata info for a specimen that cannot be found in PMO
+        missing_meta_specs = set(adding_specimen_names) - specimen_names_in_pmo
+
+        if missing_meta_specs:
+            raise ValueError(
+                f"Provided specimen metadata for the following specimens but they are missing from the PMO: {sorted(missing_meta_specs)}"
+            )
+
+        # optional type mapping for columns
+        if meta_types is None:
+            meta_types = {}
+
+        allowed_types = {str, int, float, bool}
+        for col, t in meta_types.items():
+            if t not in allowed_types:
+                raise ValueError(
+                    f"Invalid type {t!r} for column '{col}'. "
+                    f"Allowed types are: {allowed_types}"
+                )
+            if col not in adding_meta.columns:
+                raise ValueError(
+                    f"type is being set for column '{col}' but is missing from metadata. \n"
+                )
+
+        spec_indexs = PMOProcessor.get_index_key_of_specimen_names(pmo)
+        adding_meta_records = adding_meta.to_dict(orient="records")
+
+        # first pass: cast and check every value, staging the changes per specimen index
+        # so nothing is written to the pmo if any row turns out to be invalid
+        staged_meta = {}
+        for specimen_name, rec in zip(adding_specimen_names, adding_meta_records):
+            spec_index = spec_indexs[specimen_name]
+            current_meta = pmo["specimen_info"][spec_index]
+            staged_for_spec = staged_meta.setdefault(spec_index, {})
+            for field, raw_value in rec.items():
+                if field == specimen_name_col:
+                    continue
+                # determine desired type for this field (default is str)
+                desired_type = meta_types.get(field, str)
+                try:
+                    cast_value = PMOProcessor._cast_specimen_meta_value(
+                        raw_value, desired_type
+                    )
+                except (ValueError, TypeError, OverflowError) as e:
+                    raise ValueError(
+                        f"Failed to cast value '{raw_value}' for field '{field}' "
+                        f"in specimen '{specimen_name}' to type {desired_type.__name__}"
+                    ) from e
+
+                # check for existing field and replace policy, a field staged from an
+                # earlier row of the same specimen counts as already existing
+                if not replace_current_meta and (
+                    field in current_meta or field in staged_for_spec
+                ):
+                    raise ValueError(
+                        f"Already have field '{field}' for specimen '{specimen_name}'. "
+                        f"Set replace_current_meta=True to replace.\n"
+                    )
+                staged_for_spec[field] = cast_value
+
+        # second pass: everything is valid, write the staged values into the pmo
+        for spec_index, new_meta in staged_meta.items():
+            pmo["specimen_info"][spec_index].update(new_meta)
+        return pmo
diff --git a/tests/test_pmo_engine/test_pmo_processor.py b/tests/test_pmo_engine/test_pmo_processor.py
index b6e0fba..133ce6e 100755
--- a/tests/test_pmo_engine/test_pmo_processor.py
+++ b/tests/test_pmo_engine/test_pmo_processor.py
@@ -873,6 +873,159 @@ def test_get_panel_names(self):
         names = PMOProcessor.get_panel_names(pmo_data_combined)
         self.assertEqual(["heomev1"], names)
 
+    def test_update_specimen_meta(self):
+        test_pmo = {
+            "specimen_info": [{"specimen_name": "spec1"}, {"specimen_name": "spec2"}],
+        }
+        adding_meta = pd.DataFrame(
+            {
+                "specimen_name": ["spec1", "spec2"],
+                "collection_country": ["Kenya", "Uganda"],
+                "collection_date": ["2023-10-13", "2024-11-14"],
+            }
+        )
+        PMOProcessor.update_specimen_meta(test_pmo, adding_meta)
+        test_out_pmo = {
+            "specimen_info": [
+                {
+                    "specimen_name": "spec1",
+                    "collection_country": "Kenya",
+                    "collection_date": "2023-10-13",
+                },
+                {
+                    "specimen_name": "spec2",
+                    "collection_country": "Uganda",
+                    "collection_date": "2024-11-14",
+                },
+            ]
+        }
+        self.assertEqual(test_pmo, test_out_pmo)
+
+    def test_update_specimen_meta_set_type(self):
+        test_pmo = {
+            "specimen_info": [{"specimen_name": "spec1"}, {"specimen_name": "spec2"}],
+        }
+        adding_meta = pd.DataFrame(
+            {
+                "specimen_name": ["spec1", "spec2"],
+                "collection_country": ["Kenya", "Uganda"],
+                "collection_date": ["2023-10-13", "2024-11-14"],
+                "host_age": [10.5, 20],
+            }
+        )
+        PMOProcessor.update_specimen_meta(
+            test_pmo, adding_meta, meta_types={"host_age": float}
+        )
+        test_out_pmo = {
+            "specimen_info": [
+                {
+                    "specimen_name": "spec1",
+                    "collection_country": "Kenya",
+                    "collection_date": "2023-10-13",
+                    "host_age": 10.5,
+                },
+                {
+                    "specimen_name": "spec2",
+                    "collection_country": "Uganda",
+                    "collection_date": "2024-11-14",
+                    "host_age": 20.0,
+                },
+            ]
+        }
+        self.assertEqual(test_pmo, test_out_pmo)
+
+    def test_update_specimen_meta_replace(self):
+        test_pmo = {
+            "specimen_info": [
+                {"specimen_name": "spec1", "collection_date": "2023"},
+                {"specimen_name": "spec2"},
+            ],
+        }
+        adding_meta = pd.DataFrame(
+            {
+                "specimen_name": ["spec1", "spec2"],
+                "collection_country": ["Kenya", "Uganda"],
+                "collection_date": ["2023-10-13", "2024-11-14"],
+            }
+        )
+        PMOProcessor.update_specimen_meta(
+            test_pmo, adding_meta, replace_current_meta=True
+        )
+        test_out_pmo = {
+            "specimen_info": [
+                {
+                    "specimen_name": "spec1",
+                    "collection_country": "Kenya",
+                    "collection_date": "2023-10-13",
+                },
+                {
+                    "specimen_name": "spec2",
+                    "collection_country": "Uganda",
+                    "collection_date": "2024-11-14",
+                },
+            ]
+        }
+        self.assertEqual(test_pmo, test_out_pmo)
+
+    def test_update_specimen_meta_raise_no_specimen(self):
+        test_pmo = {
+            "specimen_info": [{"specimen_name": "spec1"}, {"specimen_name": "spec2"}],
+        }
+        adding_meta = pd.DataFrame(
+            {
+                "specimen_name": ["spec1", "spec3"],
+                "collection_country": ["Kenya", "Uganda"],
+                "collection_date": ["2023-10-13", "2024-11-14"],
+            }
+        )
+
+        with self.assertRaises(ValueError):
+            PMOProcessor.update_specimen_meta(test_pmo, adding_meta)
+
+    def test_update_specimen_meta_no_partial_update(self):
+        test_pmo = {
+            "specimen_info": [
+                {"specimen_name": "spec1"},
+                {"specimen_name": "spec2", "collection_country": "Uganda"},
+            ],
+        }
+        adding_meta = pd.DataFrame(
+            {
+                "specimen_name": ["spec1", "spec2"],
+                "collection_country": ["Kenya", "Tanzania"],
+            }
+        )
+
+        with self.assertRaises(ValueError):
+            PMOProcessor.update_specimen_meta(test_pmo, adding_meta)
+        # spec1 comes first in the table but must stay untouched because spec2 was rejected
+        test_out_pmo = {
+            "specimen_info": [
+                {"specimen_name": "spec1"},
+                {"specimen_name": "spec2", "collection_country": "Uganda"},
+            ],
+        }
+        self.assertEqual(test_pmo, test_out_pmo)
+
+    def test_update_specimen_meta_non_str_specimen_names(self):
+        test_pmo = {
+            "specimen_info": [{"specimen_name": "101"}, {"specimen_name": "102"}],
+        }
+        adding_meta = pd.DataFrame(
+            {
+                "specimen_name": [101, 102],
+                "collection_country": ["Kenya", "Uganda"],
+            }
+        )
+        PMOProcessor.update_specimen_meta(test_pmo, adding_meta)
+        test_out_pmo = {
+            "specimen_info": [
+                {"specimen_name": "101", "collection_country": "Kenya"},
+                {"specimen_name": "102", "collection_country": "Uganda"},
+            ]
+        }
+        self.assertEqual(test_pmo, test_out_pmo)
+
 
 if __name__ == "__main__":
     unittest.main()