src/pmotools/pmo_engine/pmo_processor.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -1481,3 +1481,117 @@ def extract_panels_insert_bed_loc( @@
                         )
                 bed_loc_out[panel_id] = bed_loc_out_per_panel
             return bed_loc_out
+        @staticmethod
+        def update_specimen_meta(
+            pmo,
+            adding_meta: pd.DataFrame,
+            specimen_name_col: str = "specimen_name",
+            replace_current_meta: bool = False,
+            meta_types: dict[str, type] | None = None,
+        ):
+            """
+            Update specimen_info with more information about specimens
+            :param pmo: the pmo to update, this will be modified by this function so if an original copy of the PMO is desired deepcopy it before passing the pmo to change to this function
+            :param adding_meta: a pandas dataframe with additional meta information, needs to have a column with the specimen name
+            :param specimen_name_col: the column name for the specimen name
+            :param replace_current_meta: whether to replace the current meta information if it already exists
+            :param meta_types: an optional dictionary of desired output types for the new meta to properly set it in the pmo
+            :return: the updated pmo
+            """
+            # check if the adding dataframe is empty
+            if adding_meta is None or adding_meta.empty:
+                raise ValueError(
+                    "adding_meta is empty — cannot update specimen metadata from an empty table."
+                )
+            # check to see if the number of column is less than 2
+            if len(adding_meta.columns) <= 1:
+                raise ValueError(
+                    f"adding_meta contains only {len(adding_meta.columns)} column(s). "
+                    f"A valid metadata table must have at least '{specimen_name_col}' plus ≥1 metadata column."
+                )
+            # check to see if the column for the specimen_name is there
+            if specimen_name_col not in adding_meta.columns:
+                raise ValueError(
+                    f"Required specimen name column '{specimen_name_col}' is missing from metadata. "
+                    f"Columns present: {list(adding_meta.columns)}"
+                )
+            specimen_names_in_pmo = set(PMOProcessor.get_specimen_names(pmo))
+            specimen_names_in_adding_meta = set(
+                adding_meta[specimen_name_col].astype(str).tolist()
+            )
+            # check to see if provided new metadata info for a specimen that cannot be found in PMO
+            missing_meta_specs = specimen_names_in_adding_meta - specimen_names_in_pmo
+            if missing_meta_specs:
+                raise ValueError(
+                    f"Provided traveler info for the following specimens but they are missing from the PMO: {sorted(missing_meta_specs)}"
+                )
+            spec_indexs = PMOProcessor.get_index_key_of_specimen_names(pmo)
+            adding_meta_records = adding_meta.to_dict(orient="records")
+            # optional type mapping for columns
+            if meta_types is None:
+                meta_types = {}
+            allowed_types = {str, int, float, bool}
+            for col, t in meta_types.items():
+                if t not in allowed_types:
+                    raise ValueError(
+                        f"Invalid type {t!r} for column '{col}'. "
+                        f"Allowed types are: {allowed_types}"
+                    )
+                if col not in adding_meta.columns:
+                    raise ValueError(
+                        f"type is beting set for column '{col}' but is missing from metadata. "
+                    )
+            for rec in adding_meta_records:
+                specimen_name = rec[specimen_name_col]
+                rec.pop(specimen_name_col, None)
+                for field, raw_value in rec.items():
+                    # determine desired type for this field (default is str)
+                    desired_type = meta_types.get(field, str)
+                    # handle NA / missing
+                    if pd.isna(raw_value):
+                        cast_value = None
+                    else:
+                        # cast according to desired_type
+                        try:
+                            if desired_type is bool:
+                                # when casting to bools, handle bool types coming as various different strings
+                                if isinstance(raw_value, str):
+                                    v = raw_value.strip().lower()
+                                    if v in ("true", "1", "yes"):
+                                        cast_value = True
+                                    elif v in ("false", "0", "no"):
+                                        cast_value = False
+                                    else:
+                                        raise ValueError(
+                                            f"Cannot interpret '{raw_value}' as bool "
+                                            f"for field '{field}' in specimen '{specimen_name}'"
+                                        )
+                                else:
+                                    cast_value = bool(raw_value)
+                            else:
+                                cast_value = desired_type(raw_value)
+                        except Exception as e:
+                            raise ValueError(
+                                f"Failed to cast value '{raw_value}' for field '{field}' "
+                                f"in specimen '{specimen_name}' to type {desired_type.__name__}"
+                            ) from e
+                    # check for existing field and replace policy
+                    # NOTE: using 'specimen_info' here to match your earlier PMO structure
+                    if (
+                        field in pmo["specimen_info"][spec_indexs[specimen_name]]
+                        and not replace_current_meta
+                    ):
+                        raise ValueError(
+                            f"Already have field '{field}' for specimen '{specimen_name}'. "
+                            f"Set replace_current_meta=True to replace."
+                        )
+                    pmo["specimen_info"][spec_indexs[specimen_name]][field] = cast_value
+            return pmo

tests/test_pmo_engine/test_pmo_processor.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -873,6 +873,115 @@ def test_get_panel_names(self): @@
             names = PMOProcessor.get_panel_names(pmo_data_combined)
             self.assertEqual(["heomev1"], names)
+        def test_update_specimen_meta(self):
+            test_pmo = {
+                "specimen_info": [{"specimen_name": "spec1"}, {"specimen_name": "spec2"}],
+            }
+            adding_meta = pd.DataFrame(
+                {
+                    "specimen_name": ["spec1", "spec2"],
+                    "collection_country": ["Kenya", "Uganda"],
+                    "collection_date": ["2023-10-13", "2024-11-14"],
+                }
+            )
+            PMOProcessor.update_specimen_meta(test_pmo, adding_meta)
+            test_out_pmo = {
+                "specimen_info": [
+                    {
+                        "specimen_name": "spec1",
+                        "collection_country": "Kenya",
+                        "collection_date": "2023-10-13",
+                    },
+                    {
+                        "specimen_name": "spec2",
+                        "collection_country": "Uganda",
+                        "collection_date": "2024-11-14",
+                    },
+                ]
+            }
+            self.assertEqual(test_pmo, test_out_pmo)
+        def test_update_specimen_meta_set_type(self):
+            test_pmo = {
+                "specimen_info": [{"specimen_name": "spec1"}, {"specimen_name": "spec2"}],
+            }
+            adding_meta = pd.DataFrame(
+                {
+                    "specimen_name": ["spec1", "spec2"],
+                    "collection_country": ["Kenya", "Uganda"],
+                    "collection_date": ["2023-10-13", "2024-11-14"],
+                    "host_age": [10.5, 20],
+                }
+            )
+            PMOProcessor.update_specimen_meta(
+                test_pmo, adding_meta, meta_types={"host_age": float}
+            )
+            test_out_pmo = {
+                "specimen_info": [
+                    {
+                        "specimen_name": "spec1",
+                        "collection_country": "Kenya",
+                        "collection_date": "2023-10-13",
+                        "host_age": 10.5,
+                    },
+                    {
+                        "specimen_name": "spec2",
+                        "collection_country": "Uganda",
+                        "collection_date": "2024-11-14",
+                        "host_age": 20.0,
+                    },
+                ]
+            }
+            self.assertEqual(test_pmo, test_out_pmo)
+        def test_update_specimen_meta_replace(self):
+            test_pmo = {
+                "specimen_info": [
+                    {"specimen_name": "spec1", "collection_date": "2023"},
+                    {"specimen_name": "spec2"},
+                ],
+            }
+            adding_meta = pd.DataFrame(
+                {
+                    "specimen_name": ["spec1", "spec2"],
+                    "collection_country": ["Kenya", "Uganda"],
+                    "collection_date": ["2023-10-13", "2024-11-14"],
+                }
+            )
+            PMOProcessor.update_specimen_meta(
+                test_pmo, adding_meta, replace_current_meta=True
+            )
+            test_out_pmo = {
+                "specimen_info": [
+                    {
+                        "specimen_name": "spec1",
+                        "collection_country": "Kenya",
+                        "collection_date": "2023-10-13",
+                    },
+                    {
+                        "specimen_name": "spec2",
+                        "collection_country": "Uganda",
+                        "collection_date": "2024-11-14",
+                    },
+                ]
+            }
+            self.assertEqual(test_pmo, test_out_pmo)
+        def test_update_specimen_meta_raise_no_specimen(self):
+            test_pmo = {
+                "specimen_info": [{"specimen_name": "spec1"}, {"specimen_name": "spec2"}],
+            }
+            adding_meta = pd.DataFrame(
+                {
+                    "specimen_name": ["spec1", "spec3"],
+                    "collection_country": ["Kenya", "Uganda"],
+                    "collection_date": ["2023-10-13", "2024-11-14"],
+                }
+            )
+            with self.assertRaises(ValueError):
+                PMOProcessor.update_specimen_meta(test_pmo, adding_meta)
     # run the unittest test runner when this file is executed directly
     if __name__ == "__main__":
         unittest.main()

added function to be able to update already present specimens; #60

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft

nickjhathaway wants to merge 1 commit into develop from feature/add_update_specimen_meta

-Original file line number
+Diff line change
@@ Expand Up / @@ -1481,3 +1481,117 @@ def extract_panels_insert_bed_loc( @@
                         )
                 bed_loc_out[panel_id] = bed_loc_out_per_panel
             return bed_loc_out
+        @staticmethod
+        def update_specimen_meta(
+            pmo,
+            adding_meta: pd.DataFrame,
+            specimen_name_col: str = "specimen_name",
+            replace_current_meta: bool = False,
+            meta_types: dict[str, type] | None = None,
+        ):
+            """
+            Update specimen_info with more information about specimens
+            :param pmo: the pmo to update, this will be modified by this function so if an original copy of the PMO is desired deepcopy it before passing the pmo to change to this function
+            :param adding_meta: a pandas dataframe with additional meta information, needs to have a column with the specimen name
+            :param specimen_name_col: the column name for the specimen name
+            :param replace_current_meta: whether to replace the current meta information if it already exists
+            :param meta_types: an optional dictionary of desired output types for the new meta to properly set it in the pmo
+            :return: the updated pmo
+            """
+            # check if the adding dataframe is empty
+            if adding_meta is None or adding_meta.empty:
+                raise ValueError(
+                    "adding_meta is empty — cannot update specimen metadata from an empty table."
+                )
+            # check to see if the number of column is less than 2
+            if len(adding_meta.columns) <= 1:
+                raise ValueError(
+                    f"adding_meta contains only {len(adding_meta.columns)} column(s). "
+                    f"A valid metadata table must have at least '{specimen_name_col}' plus ≥1 metadata column."
+                )
+            # check to see if the column for the specimen_name is there
+            if specimen_name_col not in adding_meta.columns:
+                raise ValueError(
+                    f"Required specimen name column '{specimen_name_col}' is missing from metadata. "
+                    f"Columns present: {list(adding_meta.columns)}"
+                )
+            specimen_names_in_pmo = set(PMOProcessor.get_specimen_names(pmo))
+            specimen_names_in_adding_meta = set(
+                adding_meta[specimen_name_col].astype(str).tolist()
+            )
+            # check to see if provided new metadata info for a specimen that cannot be found in PMO
+            missing_meta_specs = specimen_names_in_adding_meta - specimen_names_in_pmo
+            if missing_meta_specs:
+                raise ValueError(
+                    f"Provided traveler info for the following specimens but they are missing from the PMO: {sorted(missing_meta_specs)}"
+                )
+            spec_indexs = PMOProcessor.get_index_key_of_specimen_names(pmo)
+            adding_meta_records = adding_meta.to_dict(orient="records")
+            # optional type mapping for columns
+            if meta_types is None:
+                meta_types = {}
+            allowed_types = {str, int, float, bool}
+            for col, t in meta_types.items():
+                if t not in allowed_types:
+                    raise ValueError(
+                        f"Invalid type {t!r} for column '{col}'. "
+                        f"Allowed types are: {allowed_types}"
+                    )
+                if col not in adding_meta.columns:
+                    raise ValueError(
+                        f"type is beting set for column '{col}' but is missing from metadata. "
+                    )
+            for rec in adding_meta_records:
+                specimen_name = rec[specimen_name_col]
+                rec.pop(specimen_name_col, None)
+                for field, raw_value in rec.items():
+                    # determine desired type for this field (default is str)
+                    desired_type = meta_types.get(field, str)
+                    # handle NA / missing
+                    if pd.isna(raw_value):
+                        cast_value = None
+                    else:
+                        # cast according to desired_type
+                        try:
+                            if desired_type is bool:
+                                # when casting to bools, handle bool types coming as various different strings
+                                if isinstance(raw_value, str):
+                                    v = raw_value.strip().lower()
+                                    if v in ("true", "1", "yes"):
+                                        cast_value = True
+                                    elif v in ("false", "0", "no"):
+                                        cast_value = False
+                                    else:
+                                        raise ValueError(
+                                            f"Cannot interpret '{raw_value}' as bool "
+                                            f"for field '{field}' in specimen '{specimen_name}'"
+                                        )
+                                else:
+                                    cast_value = bool(raw_value)
+                            else:
+                                cast_value = desired_type(raw_value)
+                        except Exception as e:
+                            raise ValueError(
+                                f"Failed to cast value '{raw_value}' for field '{field}' "
+                                f"in specimen '{specimen_name}' to type {desired_type.__name__}"
+                            ) from e
+                    # check for existing field and replace policy
+                    # NOTE: using 'specimen_info' here to match your earlier PMO structure
+                    if (
+                        field in pmo["specimen_info"][spec_indexs[specimen_name]]
+                        and not replace_current_meta
+                    ):
+                        raise ValueError(
+                            f"Already have field '{field}' for specimen '{specimen_name}'. "
+                            f"Set replace_current_meta=True to replace."
+                        )
+                    pmo["specimen_info"][spec_indexs[specimen_name]][field] = cast_value
+            return pmo

-Original file line number
+Diff line change
@@ Expand Up / @@ -873,6 +873,115 @@ def test_get_panel_names(self): @@
             names = PMOProcessor.get_panel_names(pmo_data_combined)
             self.assertEqual(["heomev1"], names)
+        def test_update_specimen_meta(self):
+            test_pmo = {
+                "specimen_info": [{"specimen_name": "spec1"}, {"specimen_name": "spec2"}],
+            }
+            adding_meta = pd.DataFrame(
+                {
+                    "specimen_name": ["spec1", "spec2"],
+                    "collection_country": ["Kenya", "Uganda"],
+                    "collection_date": ["2023-10-13", "2024-11-14"],
+                }
+            )
+            PMOProcessor.update_specimen_meta(test_pmo, adding_meta)
+            test_out_pmo = {
+                "specimen_info": [
+                    {
+                        "specimen_name": "spec1",
+                        "collection_country": "Kenya",
+                        "collection_date": "2023-10-13",
+                    },
+                    {
+                        "specimen_name": "spec2",
+                        "collection_country": "Uganda",
+                        "collection_date": "2024-11-14",
+                    },
+                ]
+            }
+            self.assertEqual(test_pmo, test_out_pmo)
+        def test_update_specimen_meta_set_type(self):
+            test_pmo = {
+                "specimen_info": [{"specimen_name": "spec1"}, {"specimen_name": "spec2"}],
+            }
+            adding_meta = pd.DataFrame(
+                {
+                    "specimen_name": ["spec1", "spec2"],
+                    "collection_country": ["Kenya", "Uganda"],
+                    "collection_date": ["2023-10-13", "2024-11-14"],
+                    "host_age": [10.5, 20],
+                }
+            )
+            PMOProcessor.update_specimen_meta(
+                test_pmo, adding_meta, meta_types={"host_age": float}
+            )
+            test_out_pmo = {
+                "specimen_info": [
+                    {
+                        "specimen_name": "spec1",
+                        "collection_country": "Kenya",
+                        "collection_date": "2023-10-13",
+                        "host_age": 10.5,
+                    },
+                    {
+                        "specimen_name": "spec2",
+                        "collection_country": "Uganda",
+                        "collection_date": "2024-11-14",
+                        "host_age": 20.0,
+                    },
+                ]
+            }
+            self.assertEqual(test_pmo, test_out_pmo)
+        def test_update_specimen_meta_replace(self):
+            test_pmo = {
+                "specimen_info": [
+                    {"specimen_name": "spec1", "collection_date": "2023"},
+                    {"specimen_name": "spec2"},
+                ],
+            }
+            adding_meta = pd.DataFrame(
+                {
+                    "specimen_name": ["spec1", "spec2"],
+                    "collection_country": ["Kenya", "Uganda"],
+                    "collection_date": ["2023-10-13", "2024-11-14"],
+                }
+            )
+            PMOProcessor.update_specimen_meta(
+                test_pmo, adding_meta, replace_current_meta=True
+            )
+            test_out_pmo = {
+                "specimen_info": [
+                    {
+                        "specimen_name": "spec1",
+                        "collection_country": "Kenya",
+                        "collection_date": "2023-10-13",
+                    },
+                    {
+                        "specimen_name": "spec2",
+                        "collection_country": "Uganda",
+                        "collection_date": "2024-11-14",
+                    },
+                ]
+            }
+            self.assertEqual(test_pmo, test_out_pmo)
+        def test_update_specimen_meta_raise_no_specimen(self):
+            test_pmo = {
+                "specimen_info": [{"specimen_name": "spec1"}, {"specimen_name": "spec2"}],
+            }
+            adding_meta = pd.DataFrame(
+                {
+                    "specimen_name": ["spec1", "spec3"],
+                    "collection_country": ["Kenya", "Uganda"],
+                    "collection_date": ["2023-10-13", "2024-11-14"],
+                }
+            )
+            with self.assertRaises(ValueError):
+                PMOProcessor.update_specimen_meta(test_pmo, adding_meta)
     # run the unittest test runner when this file is executed directly
     if __name__ == "__main__":
         unittest.main()

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

added function to be able to update already present specimens; #60

Uh oh!

Diff view

Diff view

There are no files selected for viewing

added function to be able to update already present specimens; #60

Are you sure you want to change the base?

Uh oh!

added function to be able to update already present specimens; #60

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing