From 64b507b411c73861c5bedd15388c5388bcc47935 Mon Sep 17 00:00:00 2001
From: Marcel Levstek <62072754+marcellevstek@users.noreply.github.com>
Date: Tue, 15 Apr 2025 10:24:05 +0200
Subject: [PATCH] Fix parsing of one-to-one metadata for samples with no data
 objects of certain type

---
 src/resdk/tables/base.py | 36 ++++++++++++++++++++++++++----------
 test_meta.py             | 19 +++++++++++++++++++
 2 files changed, 45 insertions(+), 10 deletions(-)
 create mode 100644 test_meta.py

diff --git a/src/resdk/tables/base.py b/src/resdk/tables/base.py
index 2eb4c251..d880d181 100644
--- a/src/resdk/tables/base.py
+++ b/src/resdk/tables/base.py
@@ -351,7 +351,7 @@ def _get_relations(self) -> pd.DataFrame:
             for partition in relation.partitions:
                 value = ""
                 if partition["label"] and partition["position"]:
-                    value = f'{partition["label"]} / {partition["position"]}'
+                    value = f"{partition['label']} / {partition['position']}"
                 elif partition["label"]:
                     value = partition["label"]
                 elif partition["position"]:
@@ -371,6 +371,19 @@ def _get_orange_object(self) -> Data:
         )
 
     def _get_orange_data(self) -> pd.DataFrame:
+        def map_and_filter_samples(
+            df: pd.DataFrame, column_name: str, mapping: dict
+        ) -> pd.DataFrame:
+            """Map ``column_name`` values to sample ids through ``mapping``.
+
+            Store the ids in ``sample_id`` and drop ``column_name``. Rows whose value
+            is not in ``mapping`` are omitted (no data of the defined process type).
+            """
+            df = df[df[column_name].isin(mapping.keys())].copy()  # copy, not a view
+            df["sample_id"] = df[column_name].map(mapping)
+            df = df.drop(columns=[column_name])
+            return df
+
         try:
             orange_meta = self._get_orange_object()
         except LookupError:
@@ -402,29 +415,32 @@ def _get_orange_data(self) -> pd.DataFrame:
             df = df.rename(columns={"mS#Sample ID": "sample_id"})
         elif "Sample slug" in df.columns:
             mapping = {s.slug: s.id for s in self._samples}
-            df["sample_id"] = [mapping[value] for value in df["Sample slug"]]
-            df = df.drop(columns=["Sample slug"])
+            df = map_and_filter_samples(
+                df=df, column_name="Sample slug", mapping=mapping
+            )
         elif "mS#Sample slug" in df.columns:
             mapping = {s.slug: s.id for s in self._samples}
-            df["sample_id"] = [mapping[value] for value in df["mS#Sample slug"]]
-            df = df.drop(columns=["mS#Sample slug"])
+            df = map_and_filter_samples(
+                df=df, column_name="mS#Sample slug", mapping=mapping
+            )
         elif "Sample name" in df.columns or "Sample name" in df.columns:
             mapping = {s.name: s.id for s in self._samples}
             if len(mapping) != len(self._samples):
                 raise ValueError(
                     "Duplicate sample names. Cannot map orange table data to other metadata"
                 )
-            df["sample_id"] = [mapping[value] for value in df["Sample name"]]
-            df = df.drop(columns=["Sample name"])
+            df = map_and_filter_samples(
+                df=df, column_name="Sample name", mapping=mapping
+            )
         elif "mS#Sample name" in df.columns:
             mapping = {s.name: s.id for s in self._samples}
             if len(mapping) != len(self._samples):
                 raise ValueError(
                     "Duplicate sample names. Cannot map orange table data to other metadata"
                 )
-            df["sample_id"] = [mapping[value] for value in df["mS#Sample name"]]
-            df = df.drop(columns=["mS#Sample name"])
-
+            df = map_and_filter_samples(
+                df=df, column_name="mS#Sample name", mapping=mapping
+            )
         return df.set_index("sample_id")
 
     def _download_metadata(self) -> pd.DataFrame:
diff --git a/test_meta.py b/test_meta.py
new file mode 100644
index 00000000..bb8f371f
--- /dev/null
+++ b/test_meta.py
@@ -0,0 +1,19 @@
+"""Manual check of ``QCTables.meta`` on the QA server (needs network and login)."""
+
+import resdk
+from resdk.tables import QCTables
+
+
+def main() -> None:
+    """Log in and read ``meta`` for the ``validation-run-208`` collection."""
+    res = resdk.Resolwe(url="https://qa.genialis.io")
+    res.login()
+
+    collection = res.collection.get("validation-run-208")
+    qt = QCTables(collection=collection)
+
+    qt.meta  # result is discarded; presumably the access triggers the fetch
+
+
+if __name__ == "__main__":
+    main()