Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 2 additions & 91 deletions src/pmotools/pmo_engine/pmo_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@
import os
from typing import NamedTuple
import copy
import re


import pandas
import pandas as pd
from collections import defaultdict
from pmotools.pmo_engine.pmo_checker import PMOChecker
Expand Down Expand Up @@ -409,7 +407,7 @@ def list_library_sample_names_per_specimen_name(
pmodata,
select_specimen_ids: list[int] = None,
select_specimen_names: list[str] = None,
) -> pd.DataFrame:
) -> pandas.DataFrame:
"""
List all the library_sample_names per specimen_name
:param pmodata: the PMO
Expand Down Expand Up @@ -1483,90 +1481,3 @@ def extract_panels_insert_bed_loc(
)
bed_loc_out[panel_id] = bed_loc_out_per_panel
return bed_loc_out

@staticmethod
def update_specimen_meta_with_traveler_info(
    pmo,
    traveler_info: pd.DataFrame,
    specimen_name_col: str = "specimen_name",
    travel_country_col: str = "travel_country",
    travel_start_col: str = "travel_start_date",
    travel_end_col: str = "travel_end_date",
    bed_net_usage_col: str = None,
    geo_admin1_col: str = None,
    geo_admin2_col: str = None,
    geo_admin3_col: str = None,
    lat_lon_col: str = None,
    replace_current_traveler_info: bool = False,
):
    """
    Add travel records from a table to the specimens of a PMO, in place.

    Each row of traveler_info becomes one dict, keyed by the table's own
    column names and holding every selected column except the specimen name
    column. That dict is appended to the "travel_out_six_month" list of the
    matching entry in pmo["specimen_info"]. All validation runs before the
    PMO is modified, so a call that raises leaves the PMO untouched.

    :param pmo: the PMO to update, modified in place
    :param traveler_info: the table with one row per travel record
    :param specimen_name_col: the column holding the specimen names
    :param travel_country_col: the column holding the travel country
    :param travel_start_col: the column holding the travel start date, formatted YYYY-MM or YYYY-MM-DD
    :param travel_end_col: the column holding the travel end date, formatted YYYY-MM or YYYY-MM-DD
    :param bed_net_usage_col: optional extra column to copy into each travel record
    :param geo_admin1_col: optional extra column to copy into each travel record
    :param geo_admin2_col: optional extra column to copy into each travel record
    :param geo_admin3_col: optional extra column to copy into each travel record
    :param lat_lon_col: optional extra column to copy into each travel record
    :param replace_current_traveler_info: if True, travel records already stored for the specimens named in traveler_info are dropped first; otherwise the new records are appended to them
    :return: None, the PMO is updated in place
    :raises ValueError: if a selected column is missing from traveler_info, if a specimen in traveler_info is not in the PMO, or if a start/end date is missing or badly formatted
    """
    required_cols = [
        specimen_name_col,
        travel_country_col,
        travel_start_col,
        travel_end_col,
    ]
    optional_cols = (
        bed_net_usage_col,
        geo_admin1_col,
        geo_admin2_col,
        geo_admin3_col,
        lat_lon_col,
    )
    required_cols.extend(col for col in optional_cols if col is not None)

    # Report only the columns that are actually absent, in a single message string.
    # ValueError is a subclass of Exception, so callers catching Exception still work.
    missing_cols = [col for col in required_cols if col not in traveler_info.columns]
    if missing_cols:
        raise ValueError(
            "missing traveler_info columns: "
            + ",".join(map(str, missing_cols))
            + "; columns in table: "
            + ",".join(map(str, traveler_info.columns))
        )

    specimen_names_in_pmo = set(PMOProcessor.get_specimen_names(pmo))
    specimen_names_in_traveler_info = set(
        traveler_info[specimen_name_col].astype(str).tolist()
    )

    # check to see if provided traveler info for a specimen that cannot be found in PMO
    missing_traveler_specs = specimen_names_in_traveler_info - specimen_names_in_pmo
    if missing_traveler_specs:
        raise ValueError(
            f"Provided traveler info for the following specimens but they are missing from the PMO: {sorted(missing_traveler_specs)}"
        )

    # Matches YYYY-MM or YYYY-MM-DD
    date_regex = re.compile(r"^\d{4}-\d{2}(-\d{2})?$")
    traveler_info_records = traveler_info[required_cols].to_dict(orient="records")

    # Validate every record up front so a bad row cannot leave the PMO half updated
    for travel_rec in traveler_info_records:
        specimen_name = str(travel_rec[specimen_name_col])
        for date_col in (travel_start_col, travel_end_col):
            val = travel_rec[date_col]
            if pd.isna(val):
                raise ValueError(
                    f"Missing required date value in column '{date_col}' for specimen '{specimen_name}'"
                )
            val_str = str(val)
            # fullmatch, because "$" alone would also accept a trailing newline
            if not date_regex.fullmatch(val_str):
                raise ValueError(
                    f"Invalid date format in '{date_col}' for specimen '{specimen_name}': '{val_str}'. "
                    f"Expected YYYY-MM or YYYY-MM-DD"
                )

    # used below as specimen name -> position within pmo["specimen_info"]
    spec_indexs = PMOProcessor.get_index_key_of_specimen_names(pmo)

    # prep traveler info lists, clear the list if we are replacing or start an empty list to append to if none exist already
    for specimen_name in specimen_names_in_traveler_info:
        specimen_meta = pmo["specimen_info"][spec_indexs[specimen_name]]
        if (
            replace_current_traveler_info
            or "travel_out_six_month" not in specimen_meta
        ):
            specimen_meta["travel_out_six_month"] = []

    # add in the travel records, minus the specimen name which is implied by where they are stored
    for travel_rec in traveler_info_records:
        specimen_name = str(travel_rec[specimen_name_col])
        travel_rec.pop(specimen_name_col, None)
        pmo["specimen_info"][spec_indexs[specimen_name]][
            "travel_out_six_month"
        ].append(travel_rec)
207 changes: 0 additions & 207 deletions tests/test_pmo_engine/test_pmo_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -873,213 +873,6 @@ def test_get_panel_names(self):
names = PMOProcessor.get_panel_names(pmo_data_combined)
self.assertEqual(["heomev1"], names)

def test_update_specimen_meta_with_traveler_info(self):
    # Two specimens without any travel history; spec1 gets two trips and
    # spec2 one, using only the default (required) column names.
    kenya_early = {
        "travel_country": "Kenya",
        "travel_start_date": "2024-01",
        "travel_end_date": "2024-02",
    }
    kenya_late = {
        "travel_country": "Kenya",
        "travel_start_date": "2024-04",
        "travel_end_date": "2024-06",
    }
    tanzania = {
        "travel_country": "Tanzania",
        "travel_start_date": "2024-02-15",
        "travel_end_date": "2024-02-27",
    }

    test_pmo = {
        "specimen_info": [
            {"specimen_name": name} for name in ("spec1", "spec2")
        ],
    }
    # one table row per trip, specimen name first followed by the trip fields
    traveler_info = pd.DataFrame(
        [
            {"specimen_name": "spec1", **kenya_early},
            {"specimen_name": "spec1", **kenya_late},
            {"specimen_name": "spec2", **tanzania},
        ]
    )

    PMOProcessor.update_specimen_meta_with_traveler_info(test_pmo, traveler_info)

    # the PMO is updated in place: each specimen carries its trips, in table order
    test_out_pmo = {
        "specimen_info": [
            {
                "specimen_name": "spec1",
                "travel_out_six_month": [kenya_early, kenya_late],
            },
            {
                "specimen_name": "spec2",
                "travel_out_six_month": [tanzania],
            },
        ]
    }
    self.assertEqual(test_out_pmo, test_pmo)

def test_update_specimen_meta_with_traveler_info_raises(self):
    # A start date with a two-digit year ("24-01") must be rejected with ValueError.
    test_pmo = {
        "specimen_info": [
            {"specimen_name": name} for name in ("spec1", "spec2")
        ],
    }
    rows = [
        {
            "specimen_name": "spec1",
            "travel_country": "Kenya",
            "travel_start_date": "24-01",  # BAD: "24-01"
            "travel_end_date": "2024-02-05",
        },
        {
            "specimen_name": "spec2",
            "travel_country": "Tanzania",
            "travel_start_date": "2024-02",
            "travel_end_date": "2024-03",
        },
    ]
    traveler_info = pd.DataFrame(rows)

    self.assertRaises(
        ValueError,
        PMOProcessor.update_specimen_meta_with_traveler_info,
        test_pmo,
        traveler_info,
    )

def test_update_specimen_meta_with_traveler_info_with_optional(self):
    # All five optional columns are selected, so every travel record should
    # carry them alongside the required fields, keyed by the table's column names.
    optional_col_args = {
        "bed_net_usage_col": "bed_net",
        "geo_admin1_col": "admin1",
        "geo_admin2_col": "admin2",
        "geo_admin3_col": "admin3",
        "lat_lon_col": "latlon",
    }
    test_pmo = {
        "specimen_info": [
            {"specimen_name": name} for name in ("spec1", "spec2")
        ],
    }
    rows = [
        {
            "specimen_name": "spec1",
            "travel_country": "Kenya",
            "travel_start_date": "2024-01",
            "travel_end_date": "2024-01-20",
            "bed_net": 0.50,
            "admin1": "Nairobi",
            "admin2": "SubCounty1",
            "admin3": "Ward1",
            "latlon": "-1.2921,36.8219",
        },
        {
            "specimen_name": "spec2",
            "travel_country": "Tanzania",
            "travel_start_date": "2024-02",
            "travel_end_date": "2024-02-15",
            "bed_net": 0.0,
            "admin1": "Dar es Salaam",
            "admin2": "SubCounty2",
            "admin3": "Ward2",
            "latlon": "-6.7924,39.2083",
        },
    ]
    traveler_info = pd.DataFrame(rows)

    PMOProcessor.update_specimen_meta_with_traveler_info(
        test_pmo, traveler_info, **optional_col_args
    )

    spec1_travel = {
        "travel_country": "Kenya",
        "travel_start_date": "2024-01",
        "travel_end_date": "2024-01-20",
        "bed_net": 0.5,
        "admin1": "Nairobi",
        "admin2": "SubCounty1",
        "admin3": "Ward1",
        "latlon": "-1.2921,36.8219",
    }
    spec2_travel = {
        "travel_country": "Tanzania",
        "travel_start_date": "2024-02",
        "travel_end_date": "2024-02-15",
        "bed_net": 0.0,
        "admin1": "Dar es Salaam",
        "admin2": "SubCounty2",
        "admin3": "Ward2",
        "latlon": "-6.7924,39.2083",
    }
    test_out_pmo = {
        "specimen_info": [
            {"specimen_name": "spec1", "travel_out_six_month": [spec1_travel]},
            {"specimen_name": "spec2", "travel_out_six_month": [spec2_travel]},
        ]
    }
    self.assertEqual(test_out_pmo, test_pmo)

def test_update_specimen_meta_with_traveler_info_with_optional_replace_old(self):
    # spec1 starts with two stored trips; with replace_current_traveler_info=True
    # they must be discarded and only the records from the table kept.
    previous_spec1_travel = [
        {
            "travel_country": "Kenya",
            "travel_start_date": "2024-01",
            "travel_end_date": "2024-02",
        },
        {
            "travel_country": "Kenya",
            "travel_start_date": "2024-04",
            "travel_end_date": "2024-06",
        },
    ]
    test_pmo = {
        "specimen_info": [
            {
                "specimen_name": "spec1",
                "travel_out_six_month": previous_spec1_travel,
            },
            {"specimen_name": "spec2"},
        ],
    }
    rows = [
        {
            "specimen_name": "spec1",
            "travel_country": "Kenya",
            "travel_start_date": "2024-01",
            "travel_end_date": "2024-01-20",
            "bed_net": 0.50,
            "admin1": "Nairobi",
            "admin2": "SubCounty1",
            "admin3": "Ward1",
            "latlon": "-1.2921,36.8219",
        },
        {
            "specimen_name": "spec2",
            "travel_country": "Tanzania",
            "travel_start_date": "2024-02",
            "travel_end_date": "2024-02-15",
            "bed_net": 0.0,
            "admin1": "Dar es Salaam",
            "admin2": "SubCounty2",
            "admin3": "Ward2",
            "latlon": "-6.7924,39.2083",
        },
    ]
    traveler_info = pd.DataFrame(rows)

    call_options = {
        "bed_net_usage_col": "bed_net",
        "geo_admin1_col": "admin1",
        "geo_admin2_col": "admin2",
        "geo_admin3_col": "admin3",
        "lat_lon_col": "latlon",
        "replace_current_traveler_info": True,
    }
    PMOProcessor.update_specimen_meta_with_traveler_info(
        test_pmo, traveler_info, **call_options
    )

    spec1_travel = {
        "travel_country": "Kenya",
        "travel_start_date": "2024-01",
        "travel_end_date": "2024-01-20",
        "bed_net": 0.5,
        "admin1": "Nairobi",
        "admin2": "SubCounty1",
        "admin3": "Ward1",
        "latlon": "-1.2921,36.8219",
    }
    spec2_travel = {
        "travel_country": "Tanzania",
        "travel_start_date": "2024-02",
        "travel_end_date": "2024-02-15",
        "bed_net": 0.0,
        "admin1": "Dar es Salaam",
        "admin2": "SubCounty2",
        "admin3": "Ward2",
        "latlon": "-6.7924,39.2083",
    }
    test_out_pmo = {
        "specimen_info": [
            {"specimen_name": "spec1", "travel_out_six_month": [spec1_travel]},
            {"specimen_name": "spec2", "travel_out_six_month": [spec2_travel]},
        ]
    }
    self.assertEqual(test_out_pmo, test_pmo)


# Run the unittest runner only when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()