Skip to content
Draft
1 change: 1 addition & 0 deletions relecov_tools/conf/laboratory_address.json
Original file line number Diff line number Diff line change
Expand Up @@ -19909,4 +19909,5 @@
"collecting_institution_phone": "975206000",
"collecting_institution": "Gerencia de Salud de Area de Soria"
}

}
26 changes: 9 additions & 17 deletions relecov_tools/institution_scripts/ISCIII.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
#!/usr/bin/env python
import sys

import logging
import rich.console
import relecov_tools.utils
Expand All @@ -26,7 +24,6 @@ def replace_originating_lab(metadata, f_data, mapped_fields, heading):
except KeyError as e:
log.error("Value %s does not exist ", e)
stderr.print(f"[red] Value {e} does not exist")
sys.exit(1)
return metadata


Expand All @@ -41,7 +38,7 @@ def added_seq_inst_model(metadata, f_data, mapped_fields, heading):
except KeyError as e:
log.error("Value %s does not exist ", e)
stderr.print(f"[red] Value {e} does not exist")
sys.exit(1)
continue
if "nextseq" in run_name:
row[m_idx] = "Illumina NextSeq 500"
elif "next_seq" in run_name:
Expand All @@ -55,7 +52,6 @@ def added_seq_inst_model(metadata, f_data, mapped_fields, heading):
else:
log.error("Value %s is not defined in the mapping ", run_name)
stderr.print(f"[red] Value {run_name} is not defined in the mapping")
sys.exit(1)
return metadata


Expand All @@ -70,20 +66,19 @@ def translate_gender_to_english(metadata, f_data, mapped_fields, heading):
"unknown": "Not Provided",
}
for row in metadata[1:]:
for key, val in mapped_fields.items():
for key, _ in mapped_fields.items():
m_idx = heading.index(key)
if row[m_idx] is None or row[m_idx] == "":
row[m_idx] = "Not Provided"
continue
item = row[m_idx].lower()
item = str(row[m_idx]).lower()
if item in map_dict:
row[m_idx] = map_dict[item]
else:
log.error("The '%s' is not a valid data for translation", row[m_idx])
log.error("The %s is not a valid data for translation", row[m_idx])
stderr.print(
"f[red] The '{row[m_idx]}' is not a valid data for translation"
f"[red] The '{row[m_idx]}' is not a valid data for translation"
)
sys.exit(1)
return metadata


Expand All @@ -93,7 +88,7 @@ def translate_specimen_source(metadata, f_data, mapped_fields, heading):
for key, val in mapped_fields.items():
m_idx = heading.index(key)
if row[m_idx] is None:
row[m_idx] = "not provided"
row[m_idx] = "Not Provided"
elif "ASPIRADO NASOFARÍNGEO" in row[m_idx].upper():
row[m_idx] = "Nasopharynx Aspiration"
elif "ASPIRADO BRONQUIAL" in row[m_idx].upper():
Expand All @@ -103,19 +98,18 @@ def translate_specimen_source(metadata, f_data, mapped_fields, heading):
elif "EXTRACTO" in row[m_idx].upper():
row[m_idx] = "Scraping"
elif "EXUDADO FARÍNGEO" in row[m_idx].upper():
row[m_idx] = "Pharynx Swabbing"
row[m_idx] = "Pharynx Swab"
elif "EXUDADO NASOFARÍNGEO" in row[m_idx].upper():
row[m_idx] = "Nasopharynx Swabbing"
row[m_idx] = "Nasopharynx swab"
elif "EXUDADO OROFARINGEO" in row[m_idx].upper():
row[m_idx] = "Oropharynx Swabbing"
row[m_idx] = "Oropharynx Swab"
elif "PLACENTA" in row[m_idx].upper():
row[m_idx] = "Placenta"
elif "SALIVA" in row[m_idx].upper():
row[m_idx] = "Saliva"
else:
log.error("The field is not correctly written or is not filled")
stderr.print("The field is not correctly written or not filled")
sys.exit(1)
return metadata


Expand Down Expand Up @@ -157,7 +151,6 @@ def translate_purpose_seq_to_english(metadata, f_data, mapped_fields, heading):
stderr.print(
"f[red] The {row[m_idx]} is not a valid data for translation"
)
sys.exit(1)
return metadata


Expand Down Expand Up @@ -195,5 +188,4 @@ def findout_library_layout(metadata, f_data, mapped_fields, heading):
stderr.print(
f"[red] {e} is not defined in function findout_library_layout"
)
sys.exit(1)
return metadata
104 changes: 47 additions & 57 deletions relecov_tools/metadata_homogeneizer.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
#!/usr/bin/env python
import os
import sys
import logging
import importlib
import rich.console

import relecov_tools.utils
from relecov_tools.config_json import ConfigJson
from relecov_tools.base_module import BaseModule

log = logging.getLogger(__name__)
stderr = rich.console.Console(
stderr=True,
style="dim",
Expand All @@ -16,17 +15,20 @@
)


class MetadataHomogeneizer:
class MetadataHomogeneizer(BaseModule):
"""MetadataHomogeneizer object"""

def __init__(self, institution=None, directory=None, output_dir=None):
super().__init__(output_directory=output_dir, called_module=__name__)
# open config
self.config_json = ConfigJson()
# read heading from config
self.heading = self.config_json.get_topic_data(
"read_lab_metadata", "metadata_lab_heading"
)

self.metadata_processing = self.config_json.get_topic_data(
"sftp_handle", "metadata_processing"
)
# handle institution
if institution is None:
self.institution = relecov_tools.utils.prompt_selection(
Expand All @@ -50,27 +52,26 @@ def __init__(self, institution=None, directory=None, output_dir=None):
msg="Select the directory which contains additional files for metadata"
)
if not os.path.exists(directory):
log.error("Folder for additional files %s does not exist ", directory)
stderr.print(
"[red] Folder for additional files " + directory + " does not exist"
)
sys.exit(1)
errtxt = f"Folder for additional files {directory} does not exist"
self.log.error(errtxt)
stderr.print(f"[red]{errtxt}")
raise FileNotFoundError({errtxt})

try:
lab_metadata = self.mapping_json_data["required_files"]["metadata_file"][
"file_name"
]
except KeyError:
log.error("Metadata File is not defined in schema")
self.log.error("Metadata File is not defined in schema")
stderr.print("[red] Metadata File is not defined in schema")
sys.exit(1)
raise ValueError("Metadata file is not defined in schema")

metadata_path = os.path.join(directory, lab_metadata)

if not os.path.isfile(metadata_path):
log.error("Metadata File %s does not exists", metadata_path)
self.log.error("Metadata File %s does not exists", metadata_path)
stderr.print("[red] Metadata File " + metadata_path + "does not exists")
sys.exit(1)
raise FileNotFoundError(f"Metadata File {metadata_path} does not exists")
self.lab_metadata = self.mapping_json_data["required_files"]["metadata_file"]
self.lab_metadata["file_name"] = metadata_path

Expand All @@ -85,9 +86,9 @@ def __init__(self, institution=None, directory=None, output_dir=None):
continue
f_path = os.path.join(directory, values["file_name"])
if not os.path.isfile(f_path):
log.error("Additional file %s does not exist ", f_path)
self.log.error("Additional file %s does not exist ", f_path)
stderr.print("[red] Additional file " + f_path + " does not exist")
sys.exit(1)
raise FileNotFoundError(f"Additional file {f_path} does not exist ")
values["file_name"] = f_path
self.additional_files.append(values)

Expand All @@ -101,13 +102,10 @@ def __init__(self, institution=None, directory=None, output_dir=None):
os.path.dirname(__file__), "institution_scripts", function_file
)
if not os.path.isfile(self.function_file):
log.error("File with functions %s does not exist ", self.function_file)
stderr.print(
"[red] File with functions "
+ self.function_file
+ " does not exist"
)
sys.exit(1)
errtxt = f"File with functions {self.function_file} does not exist"
self.log.error(errtxt)
stderr.print(f"[red]{errtxt}")
raise FileNotFoundError(errtxt)
if output_dir is None:
self.output_dir = relecov_tools.utils.prompt_path(
msg="Select the output folder"
Expand Down Expand Up @@ -162,16 +160,15 @@ def handling_files(self, file_data, data_to_add):
elif f_name.endswith(".csv"):
data = relecov_tools.utils.read_csv_file_return_dict(f_name, ",")
elif f_name.endswith(".xlsx"):
header_flag = self.metadata_processing.get("header_flag")
data = relecov_tools.utils.read_excel_file(
f_name, "Sheet", header_flag, leave_empty=True
excel_sheet = self.metadata_processing.get("excel_sheet")
data, _ = relecov_tools.utils.read_excel_file(
f_name, excel_sheet, "ID CNM", leave_empty=True
)
else:
log.error("Additional file extension %s is not supported ", f_name)
stderr.print(
"[red] Additional file extension " + f_name + " is not supported"
)
sys.exit(1)
errtxt = f"Additional file extension {f_name} is not supported"
self.log.error(errtxt)
stderr.print(f"[red]{errtxt}")
raise ValueError(errtxt)
else:
data = ""
if not self.processed_metadata:
Expand All @@ -186,44 +183,37 @@ def handling_files(self, file_data, data_to_add):
try:
item_data = data[s_value]
except KeyError:
log.info(
"Additional file %s does not have the information for %s ",
f_name,
s_value,
)
stderr.print(
"[yellow] Additional file "
+ f_name
+ " does not have information for "
+ str(s_value)
errtxt = (
f"Additional file {f_name} does not have the information for {s_value} ",
)
self.log.info(errtxt)
stderr.print(f"[yellow]{errtxt}")
continue
# sys.exit(1)

for m_field, f_field in file_data["mapped_fields"].items():
try:
meta_idx = self.heading.index(m_field)
except ValueError as e:
log.error("Field %s does not exist in Metadata ", e)
self.log.error(
"Field %s does not exist in Metadata heading, check config",
e,
)
stderr.print(f"[red] Field {e} does not exist")
sys.exit(1)
continue
row[meta_idx] = item_data[f_field]

else:
if data == {"ERROR": "not valid format"}:
raise ValueError(
f"Unknown error during processing of {file_data['file_name']}"
)
func_name = file_data["function"]
stderr.print("[yellow] Start processing function " + func_name)
exec(
"from relecov_tools.institution_scripts."
+ self.institution
+ " import "
+ func_name
)
# somehow this overrides additional_data working as a pointer
eval(
func_name
+ "(data_to_add, data, file_data['mapped_fields'], self.heading)"
)

stderr.print("[green] Succesful processing of additional file ")
import_statement = f"relecov_tools.institution_scripts.{self.institution}"
module = importlib.import_module(import_statement)
func_obj = getattr(module, func_name)
data = func_obj(data_to_add, data, file_data["mapped_fields"], self.heading)
stderr.print("[green]Succesful processing of additional file")
return data_to_add

def converting_metadata(self):
Expand Down
10 changes: 1 addition & 9 deletions relecov_tools/schema/institution_schemas/ISCIII.json
Original file line number Diff line number Diff line change
Expand Up @@ -88,14 +88,6 @@
"mapped_key": "Runid",
"function": "None"
},
"read_length": {
"file_name": "samples_run_services_length.tsv",
"mapped_fields": {
"Read Length" : "read1_cycles"
},
"mapped_key": "Sample ID given for sequencing",
"function": "None"
},
"samples_in_run": {
"file_name": "run_and_num_of_samples.csv",
"mapped_fields": {
Expand All @@ -114,7 +106,7 @@
},
"purpose_of_sequencing": {
"file_name": "",
"mapped_fields" : {"Purpose of Sequencing" : "" },
"mapped_fields" : {"Purpose of sampling" : "" },
"mapped_key" : "",
"function": "translate_purpose_seq_to_english"
},
Expand Down
23 changes: 23 additions & 0 deletions relecov_tools/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,29 @@ def read_json_file(j_file):
return data


def write_to_excel_file(data, f_name, sheet_name, post_process=None):
    """Write rows to a new Excel workbook and save it to disk.

    Args:
        data: Sized iterable of rows; each row is appended to the sheet
            in order. ``len(data)`` is used for the row numbering below.
        f_name: Path the workbook is saved to.
        sheet_name: Title given to the (single) active sheet.
        post_process: Optional mapping of extra layout steps. Supported
            keys:

            * ``"insert_cols"``: column index passed to
              ``sheet.insert_cols``; afterwards cell ``A1`` is set to
              ``"CAMPO"`` and every row after the first gets a running
              number (1, 2, ...) in column ``A``.
            * ``"insert_rows"``: number of empty rows inserted at the
              top of the sheet.

            ``None`` (the default) means no post-processing.

    Returns:
        None
    """
    # The default used to crash with "argument of type 'NoneType' is not
    # iterable" on the membership tests below; treat None as "no options".
    if post_process is None:
        post_process = {}

    book = openpyxl.Workbook()
    sheet = book.active
    for row in data:
        sheet.append(row)

    # Adding one column with the row number.
    if "insert_cols" in post_process:
        sheet.insert_cols(post_process["insert_cols"])
        sheet["A1"] = "CAMPO"
        # Row 1 holds the header label, so numbering starts on row 2.
        for row_number in range(1, len(data)):
            sheet[f"A{row_number + 1}"] = row_number

    # Adding the requested number of empty rows at the top.
    if "insert_rows" in post_process:
        for _ in range(post_process["insert_rows"]):
            sheet.insert_rows(1)

    sheet.title = sheet_name
    book.save(f_name)


def read_excel_file(f_name, sheet_name, header_flag, leave_empty=True):
"""Read the input excel file and return the data as a list of dictionaries.
If openpyxl fails, fall back to pandas but return in the same format.
Expand Down
Loading