From e93e5e30fa39f7da86cecf610b0820ecf4bc4f42 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 27 Feb 2026 11:47:25 +0000 Subject: [PATCH 1/5] Initial plan From fbe8558aa396391334831fe0c88f20112a696a0a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 27 Feb 2026 11:57:03 +0000 Subject: [PATCH 2/5] Filter NA/empty values from metadata when reading tabular and MSP files Co-authored-by: hechth <12066490+hechth@users.noreply.github.com> --- MSMetaEnhancer/libs/data/DataFrame.py | 4 ++- MSMetaEnhancer/libs/data/Spectra.py | 4 ++- MSMetaEnhancer/libs/utils/Generic.py | 17 ++++++++++++ tests/test_data/sample_metadata_with_na.csv | 4 +++ tests/test_data/sample_with_na.msp | 30 +++++++++++++++++++++ tests/test_io.py | 19 +++++++++++++ 6 files changed, 76 insertions(+), 2 deletions(-) create mode 100644 tests/test_data/sample_metadata_with_na.csv create mode 100644 tests/test_data/sample_with_na.msp diff --git a/MSMetaEnhancer/libs/data/DataFrame.py b/MSMetaEnhancer/libs/data/DataFrame.py index a6c06aa..f68fb5e 100644 --- a/MSMetaEnhancer/libs/data/DataFrame.py +++ b/MSMetaEnhancer/libs/data/DataFrame.py @@ -2,6 +2,7 @@ from MSMetaEnhancer.libs.data.Data import Data from MSMetaEnhancer.libs.utils.Errors import UnknownFileFormat +from MSMetaEnhancer.libs.utils.Generic import is_na_value class DataFrame(Data): @@ -45,7 +46,8 @@ def save_data(self, filename: str, file_format: str): raise UnknownFileFormat(f'Format {file_format} not supported.') def get_metadata(self): - return self.df.to_dict('records') + records = self.df.to_dict('records') + return [{k: v for k, v in record.items() if not is_na_value(v)} for record in records] def fuse_metadata(self, metadata_list): self.df = pandas.DataFrame.from_dict(metadata_list) diff --git a/MSMetaEnhancer/libs/data/Spectra.py b/MSMetaEnhancer/libs/data/Spectra.py index 94bf1e4..6c7a152 100644 --- a/MSMetaEnhancer/libs/data/Spectra.py +++ b/MSMetaEnhancer/libs/data/Spectra.py @@ -5,6 +5,7 @@ from MSMetaEnhancer.libs.data.Data import Data from MSMetaEnhancer.libs.utils.Errors import UnknownFileFormat +from MSMetaEnhancer.libs.utils.Generic import is_na_value class Spectra(Data): @@ -48,7 +49,8 @@ def save_data(self, filename: str, file_format: str): raise UnknownFileFormat(f'Format {file_format} not supported.') def get_metadata(self): - return [spectra.metadata for spectra in self.spectrums] + return [{k: v for k, v in spectra.metadata.items() if not is_na_value(v)} + for spectra in self.spectrums] def fuse_metadata(self, metadata): for i in range(len(metadata)): diff --git a/MSMetaEnhancer/libs/utils/Generic.py b/MSMetaEnhancer/libs/utils/Generic.py index f59e4bf..c6fd0aa 100644 --- a/MSMetaEnhancer/libs/utils/Generic.py +++ b/MSMetaEnhancer/libs/utils/Generic.py @@ -1,3 +1,20 @@ +import math + + +NA_STRING_VALUES = {'na', 'n/a', 'nan', 'none', ''} + + +def is_na_value(value) -> bool: + """Check if a value should be treated as NA/missing (e.g. empty, None, NaN, 'NA').""" + if value is None: + return True + if isinstance(value, float) and math.isnan(value): + return True + if isinstance(value, str) and value.strip().lower() in NA_STRING_VALUES: + return True + return False + + def escape_single_quotes(f): async def wrapper(self, arg): return await f(self, arg.replace("'", "\\'")) diff --git a/tests/test_data/sample_metadata_with_na.csv b/tests/test_data/sample_metadata_with_na.csv new file mode 100644 index 0000000..41d4e3a --- /dev/null +++ b/tests/test_data/sample_metadata_with_na.csv @@ -0,0 +1,4 @@ +formula,mw,casno,inchikey,smiles +H2,2,1333740,NA, +D2,4,7782390,nan,None +CH4,16,74828,N/A,n/a diff --git a/tests/test_data/sample_with_na.msp b/tests/test_data/sample_with_na.msp new file mode 100644 index 0000000..9f3d6a3 --- /dev/null +++ b/tests/test_data/sample_with_na.msp @@ -0,0 +1,30 @@ +NAME: Hydrogen +FORMULA: H2 +MW: 2 +INCHIKEY: NA +SMILES: n/a +NUM PEAKS: 2 +1.0 20.98 +2.0 999.0 + +NAME: Deuterium +FORMULA: D2 +MW: 4 +INCHIKEY: nan +SMILES: None +NUM PEAKS: 2 +2.0 14.99 +4.0 999.0 + +NAME: Methane +FORMULA: CH4 +MW: 16 +INCHIKEY: N/A +SMILES: +NUM PEAKS: 6 +12.0 37.97 +13.0 105.9 +14.0 203.82 +15.0 886.2 +16.0 999.0 +17.0 15.99 diff --git a/tests/test_io.py b/tests/test_io.py index a2a1063..fcfb424 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -83,3 +83,22 @@ def test_tabular_data(): assert meta_item[key] == data_item[key], ( f"Value mismatch for key '{key}' at index {i}: {meta_item[key]} != {data_item[key]}" ) + + +@pytest.mark.parametrize('backend, file_type, filename, absent_keys', [ + [DataFrame(), 'csv', 'tests/test_data/sample_metadata_with_na.csv', ['inchikey', 'smiles']], + [Spectra(), 'msp', 'tests/test_data/sample_with_na.msp', ['inchikey', 'smiles']], +]) +def test_na_values_filtered_from_metadata(backend, file_type, filename, absent_keys): + """NA and empty values in data files should be excluded from metadata dicts.""" + backend.load_data(filename, file_type) + metadata = backend.get_metadata() + + assert len(metadata) == 3 + + for i, meta_item in enumerate(metadata): + # Keys that had NA values must be absent + for key in absent_keys: + assert key not in meta_item, ( + f"NA key '{key}' should not be present at index {i}, got {meta_item.get(key)}" + ) From 66942cb150c5f519550ac21a139db65bad5e56c1 Mon Sep 17 00:00:00 2001 From: hechth Date: Mon, 9 Mar 2026 13:03:42 +0100 Subject: [PATCH 3/5] updated test for application with None or NA input data --- MSMetaEnhancer/libs/utils/ConverterBuilder.py | 2 ++ pyproject.toml | 2 +- tests/test_application.py | 11 +++++++++++ tests/test_data/sparse.tsv | 4 ++++ 4 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 tests/test_data/sparse.tsv diff --git a/MSMetaEnhancer/libs/utils/ConverterBuilder.py b/MSMetaEnhancer/libs/utils/ConverterBuilder.py index 9f28d5f..818c531 100644 --- a/MSMetaEnhancer/libs/utils/ConverterBuilder.py +++ b/MSMetaEnhancer/libs/utils/ConverterBuilder.py @@ -1,5 +1,7 @@ from MSMetaEnhancer.libs.converters.web import __all__ as web_converters +from MSMetaEnhancer.libs.converters.web import * from MSMetaEnhancer.libs.converters.compute import __all__ as compute_converters +from MSMetaEnhancer.libs.converters.compute import * from MSMetaEnhancer.libs.utils.Errors import UnknownConverter diff --git a/pyproject.toml b/pyproject.toml index 12b00bf..51d6d29 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ packages = [ ] [tool.poetry.dependencies] -python = ">=3.10,<3.13" +python = ">=3.10,<3.14" matchms = ">=0.30.0" pandas = "^2.2.1" scipy = "^1.12.0" diff --git a/tests/test_application.py b/tests/test_application.py index e044467..8bcf00c 100644 --- a/tests/test_application.py +++ b/tests/test_application.py @@ -3,6 +3,7 @@ from MSMetaEnhancer import Application from tests.utils import FakeMonitor, FakeAnnotator +from MSMetaEnhancer.libs.utils.Generic import is_na_value def test_annotate_spectra_monitor_stops(): @@ -27,3 +28,13 @@ def test_annotate_spectra_monitor_stops_after_exception(): asyncio.run(app.annotate_spectra([], monitor=monitor, annotator=annotator)) assert monitor.stop_request.is_set() + + +def test_application_sparse(): + app = Application() + app.load_data('tests/test_data/sparse.tsv', file_format='tabular') + asyncio.run(app.annotate_spectra(['PubChem', 'IDSM'])) + + actual = [x.get('canonical_smiles') for x in app.data.get_metadata()] + assert any([is_na_value(x) for x in actual]) == False + diff --git a/tests/test_data/sparse.tsv b/tests/test_data/sparse.tsv new file mode 100644 index 0000000..47c41ba --- /dev/null +++ b/tests/test_data/sparse.tsv @@ -0,0 +1,4 @@ +compound_name rel.effect rel.relationship publication.id chemical_normalized heading canonical_smiles inchi inchikey iupac_name +(-)-epicatechin gap junction intercellular communication no inhibition 18828601 (-)-epicatechin epicatechin InChI=1S/C15H14O6/c16-8-4-11(18)9-6-13(20)15(21-14(9)5-8)7-1-2-10(17)12(19)3-7/h1-5,13,15-20H,6H2/t13-,15-/m1/s1 PFTAWBLQPZVEMU-UKRRQHHQSA-N (2R,3R)-2-(3,4-dihydroxyphenyl)-3,4-dihydro-2H-chromene-3,5,7-triol +1-chloroanthracene gap junction intercellular communication inhibition 10416268 1-chloroanthracene 1-chloroanthracene InChI=1S/C14H9Cl/c15-14-7-3-6-12-8-10-4-1-2-5-11(10)9-13(12)14/h1-9H SRIHSAFSOOUEGL-UHFFFAOYSA-N 1-chloroanthracene +1-methyl-fluorene gap junction intercellular communication inhibition 7835547 1-methyl-fluorene 1-methylfluorene CC1=C2CC3=CC=CC=C3C2=CC=C1 InChI=1S/C14H12/c1-10-5-4-8-13-12-7-3-2-6-11(12)9-14(10)13/h2-8H,9H2,1H3 GKEUODMJRFDLJY-UHFFFAOYSA-N 1-methyl-9H-fluorene \ No newline at end of file From ddd8fc3d1703f3f2a1230b88ce47833ca0b7627e Mon Sep 17 00:00:00 2001 From: hechth Date: Mon, 9 Mar 2026 13:06:11 +0100 Subject: [PATCH 4/5] added other NA types to table --- tests/test_data/sparse.tsv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_data/sparse.tsv b/tests/test_data/sparse.tsv index 47c41ba..31f3239 100644 --- a/tests/test_data/sparse.tsv +++ b/tests/test_data/sparse.tsv @@ -1,4 +1,4 @@ compound_name rel.effect rel.relationship publication.id chemical_normalized heading canonical_smiles inchi inchikey iupac_name -(-)-epicatechin gap junction intercellular communication no inhibition 18828601 (-)-epicatechin epicatechin InChI=1S/C15H14O6/c16-8-4-11(18)9-6-13(20)15(21-14(9)5-8)7-1-2-10(17)12(19)3-7/h1-5,13,15-20H,6H2/t13-,15-/m1/s1 PFTAWBLQPZVEMU-UKRRQHHQSA-N (2R,3R)-2-(3,4-dihydroxyphenyl)-3,4-dihydro-2H-chromene-3,5,7-triol -1-chloroanthracene gap junction intercellular communication inhibition 10416268 1-chloroanthracene 1-chloroanthracene InChI=1S/C14H9Cl/c15-14-7-3-6-12-8-10-4-1-2-5-11(10)9-13(12)14/h1-9H SRIHSAFSOOUEGL-UHFFFAOYSA-N 1-chloroanthracene +(-)-epicatechin gap junction intercellular communication no inhibition 18828601 (-)-epicatechin epicatechin n/a InChI=1S/C15H14O6/c16-8-4-11(18)9-6-13(20)15(21-14(9)5-8)7-1-2-10(17)12(19)3-7/h1-5,13,15-20H,6H2/t13-,15-/m1/s1 PFTAWBLQPZVEMU-UKRRQHHQSA-N (2R,3R)-2-(3,4-dihydroxyphenyl)-3,4-dihydro-2H-chromene-3,5,7-triol +1-chloroanthracene gap junction intercellular communication inhibition 10416268 1-chloroanthracene 1-chloroanthracene NA InChI=1S/C14H9Cl/c15-14-7-3-6-12-8-10-4-1-2-5-11(10)9-13(12)14/h1-9H SRIHSAFSOOUEGL-UHFFFAOYSA-N 1-chloroanthracene 1-methyl-fluorene gap junction intercellular communication inhibition 7835547 1-methyl-fluorene 1-methylfluorene CC1=C2CC3=CC=CC=C3C2=CC=C1 InChI=1S/C14H12/c1-10-5-4-8-13-12-7-3-2-6-11(12)9-14(10)13/h2-8H,9H2,1H3 GKEUODMJRFDLJY-UHFFFAOYSA-N 1-methyl-9H-fluorene \ No newline at end of file From 954b0b5511948197349adefdb547168102e874db Mon Sep 17 00:00:00 2001 From: hechth Date: Tue, 10 Mar 2026 10:40:45 +0100 Subject: [PATCH 5/5] formatting and linting --- MSMetaEnhancer/app.py | 38 +++-- MSMetaEnhancer/libs/Annotator.py | 31 +++- MSMetaEnhancer/libs/Converter.py | 14 +- MSMetaEnhancer/libs/Curator.py | 33 ++-- .../converters/compute/ComputeConverter.py | 3 +- .../libs/converters/compute/RDKit.py | 31 ++-- .../libs/converters/compute/__init__.py | 2 +- .../libs/converters/web/BridgeDb.py | 106 ++++++------ MSMetaEnhancer/libs/converters/web/CIR.py | 47 +++--- MSMetaEnhancer/libs/converters/web/CTS.py | 70 ++++---- MSMetaEnhancer/libs/converters/web/IDSM.py | 55 +++--- MSMetaEnhancer/libs/converters/web/PubChem.py | 156 ++++++++++-------- .../libs/converters/web/WebConverter.py | 47 ++++-- .../libs/converters/web/__init__.py | 2 +- MSMetaEnhancer/libs/data/Data.py | 1 + MSMetaEnhancer/libs/data/DataFrame.py | 27 +-- MSMetaEnhancer/libs/data/Spectra.py | 30 +++- MSMetaEnhancer/libs/data/__init__.py | 2 +- MSMetaEnhancer/libs/utils/ConverterBuilder.py | 37 +++-- MSMetaEnhancer/libs/utils/Errors.py | 3 +- MSMetaEnhancer/libs/utils/Generic.py | 3 +- MSMetaEnhancer/libs/utils/Job.py | 20 ++- MSMetaEnhancer/libs/utils/LogRecord.py | 12 +- MSMetaEnhancer/libs/utils/Logger.py | 14 +- MSMetaEnhancer/libs/utils/Metrics.py | 20 ++- MSMetaEnhancer/libs/utils/Monitor.py | 7 +- MSMetaEnhancer/libs/utils/Throttler.py | 1 + docs/source/conf.py | 25 +-- galaxy/generate_options.py | 12 +- tests/__init__.py | 2 +- tests/test_BridgeDB.py | 16 +- tests/test_CIR.py | 8 +- tests/test_CTS.py | 15 +- tests/test_IDSM.py | 28 +++- tests/test_PubChem.py | 65 +++++--- tests/test_annotator.py | 84 +++++++--- tests/test_application.py | 18 +- tests/test_converter.py | 77 +++++---- tests/test_curator.py | 17 +- tests/test_io.py | 98 ++++++++--- tests/test_rdkit.py | 31 ++-- tests/utils.py | 2 + 42 files changed, 802 insertions(+), 508 deletions(-) diff --git a/MSMetaEnhancer/app.py b/MSMetaEnhancer/app.py index d8e0a47..db2c226 100644 --- a/MSMetaEnhancer/app.py +++ b/MSMetaEnhancer/app.py @@ -14,7 +14,7 @@ class Application: - def __init__(self, log_level='info', log_file=None): + def __init__(self, log_level="info", log_file=None): self.data = None logger.setup(log_level, log_file) @@ -25,12 +25,12 @@ def load_data(self, filename, file_format): :param filename: path to source spectra file :param file_format: format of spectra """ - if file_format in ['msp', 'mgf', 'json']: + if file_format in ["msp", "mgf", "json"]: self.data = Spectra() - elif file_format in ['csv', 'tsv', 'tabular', 'xlsx']: + elif file_format in ["csv", "tsv", "tabular", "xlsx"]: self.data = DataFrame() else: - raise UnknownFileFormat(f'Format {file_format} not supported.') + raise UnknownFileFormat(f"Format {file_format} not supported.") self.data.load_data(filename, file_format) def save_data(self, filename, file_format): @@ -51,12 +51,14 @@ def curate_metadata(self): curated_metadata = Curator().curate_metadata(self.data.get_metadata()) self.data.fuse_metadata(curated_metadata) - async def annotate_spectra(self, - converters, - jobs=None, - repeat: bool = False, - monitor: Monitor = Monitor(), - annotator: Annotator = Annotator()): + async def annotate_spectra( + self, + converters, + jobs=None, + repeat: bool = False, + monitor: Monitor = Monitor(), + annotator: Annotator = Annotator(), + ): """ Annotates current Spectra data by specified jobs. @@ -72,9 +74,11 @@ async def annotate_spectra(self, async with aiohttp.ClientSession() as session: builder = ConverterBuilder() builder.validate_converters(converters) - converters, web_converters = builder.build_converters(session, converters) + compute_converters, web_converters = builder.build_converters( + session, converters + ) - annotator.set_converters(converters) + annotator.set_converters(compute_converters | web_converters) monitor.set_converters(web_converters) # start converters status checker and wait for first status @@ -86,7 +90,7 @@ async def annotate_spectra(self, if not jobs: jobs = [] converter: Converter - for converter in converters.values(): + for converter in annotator.converters.values(): jobs += converter.get_conversion_functions() jobs = convert_to_jobs(jobs) @@ -94,8 +98,12 @@ async def annotate_spectra(self, logger.set_target_attributes(jobs, len(metadata_list)) - results = await asyncio.gather(*[annotator.annotate(metadata, jobs, repeat) - for metadata in metadata_list]) + results = await asyncio.gather( + *[ + annotator.annotate(metadata, jobs, repeat) + for metadata in metadata_list + ] + ) finally: monitor.join() diff --git a/MSMetaEnhancer/libs/Annotator.py b/MSMetaEnhancer/libs/Annotator.py index b1541ef..6d6cf83 100644 --- a/MSMetaEnhancer/libs/Annotator.py +++ b/MSMetaEnhancer/libs/Annotator.py @@ -2,8 +2,13 @@ from MSMetaEnhancer.libs.Curator import Curator from MSMetaEnhancer.libs.utils import logger -from MSMetaEnhancer.libs.utils.Errors import TargetAttributeNotRetrieved, SourceAttributeNotAvailable, \ - ServiceNotAvailable, UnknownResponse, DataAlreadyPresent +from MSMetaEnhancer.libs.utils.Errors import ( + TargetAttributeNotRetrieved, + SourceAttributeNotAvailable, + ServiceNotAvailable, + UnknownResponse, + DataAlreadyPresent, +) from MSMetaEnhancer.libs.utils.Logger import LogRecord @@ -11,6 +16,7 @@ class Annotator: """ Annotator is responsible for annotation process of single spectra. """ + def __init__(self): self.converters = dict() self.curator = Curator() @@ -41,17 +47,28 @@ async def annotate(self, metadata, jobs, repeat=False): for job in jobs: if job.target not in metadata: try: - metadata, cache = await self.execute_job_with_cache(job, metadata, cache, log) + metadata, cache = await self.execute_job_with_cache( + job, metadata, cache, log + ) if repeat: added_metadata = True - except (SourceAttributeNotAvailable, TargetAttributeNotRetrieved) as exc: + except ( + SourceAttributeNotAvailable, + TargetAttributeNotRetrieved, + ) as exc: log.update(exc, job, level=3) except (ServiceNotAvailable, UnknownResponse) as exc: log.update(exc, job, level=2) except Exception: log.update(Exception(traceback.format_exc()), job, level=1) else: - log.update(DataAlreadyPresent(f'Requested attribute {job.target} already present.'), job, level=2) + log.update( + DataAlreadyPresent( + f"Requested attribute {job.target} already present." + ), + job, + level=2, + ) logger.add_logs(log) logger.add_coverage_after(metadata.keys()) @@ -85,7 +102,7 @@ async def execute_job_with_cache(self, job, metadata, cache, warning): if job.target in cache[job.converter]: metadata[job.target] = cache[job.converter][job.target] else: - raise TargetAttributeNotRetrieved('No data retrieved.') + raise TargetAttributeNotRetrieved("No data retrieved.") else: - raise ServiceNotAvailable(f'Service {job.converter} not available.') + raise ServiceNotAvailable(f"Service {job.converter} not available.") return metadata, cache diff --git a/MSMetaEnhancer/libs/Converter.py b/MSMetaEnhancer/libs/Converter.py index 4e70cb9..961271f 100644 --- a/MSMetaEnhancer/libs/Converter.py +++ b/MSMetaEnhancer/libs/Converter.py @@ -5,6 +5,7 @@ class Converter(ABC): """ General class for conversions. """ + def __init__(self): self.is_available = True @@ -46,13 +47,15 @@ def get_conversion_functions(self) -> list: :return: a list of available conversion functions """ available_conversions = [] - methods = [method_name for method_name in dir(self) if '_to_' in method_name] + methods = [method_name for method_name in dir(self) if "_to_" in method_name] for method in methods: - available_conversions.append((*method.split('_to_'), self.converter_name)) + available_conversions.append((*method.split("_to_"), self.converter_name)) return available_conversions -def create_top_level_method(obj: Converter, source: str, target: str, method: str, asynch: bool = True): +def create_top_level_method( + obj: Converter, source: str, target: str, method: str, asynch: bool = True +): """ Assign a new method to {obj} called {source}_to_{target} which calls {method}. @@ -62,14 +65,15 @@ def create_top_level_method(obj: Converter, source: str, target: str, method: st :param method: method which is called in the object with single argument :param asynch: whether to create asynchronous methods """ + async def async_conversion(key): return await getattr(obj, str(method))(key) def sync_conversion(key): return getattr(obj, str(method))(key) - doc = f'Convert {source} to {target} using {obj.__class__.__name__} converter' - name = f'{source}_to_{target}' + doc = f"Convert {source} to {target} using {obj.__class__.__name__} converter" + name = f"{source}_to_{target}" if asynch: async_conversion.__doc__ = doc diff --git a/MSMetaEnhancer/libs/Curator.py b/MSMetaEnhancer/libs/Curator.py index 7c833f8..358f1ac 100644 --- a/MSMetaEnhancer/libs/Curator.py +++ b/MSMetaEnhancer/libs/Curator.py @@ -1,5 +1,7 @@ from matchms.filtering.filter_utils.smile_inchi_inchikey_conversions import ( - is_valid_smiles, is_valid_inchi, is_valid_inchikey + is_valid_smiles, + is_valid_inchi, + is_valid_inchikey, ) from MSMetaEnhancer.libs.utils.Errors import InvalidAttributeFormat @@ -10,7 +12,7 @@ inchikey = "VNWKTOKETHGBQD-UHFFFAOYSA-N" print(is_valid_smiles(smiles)) # True if valid SMILES -print(is_valid_inchi(inchi)) # True if valid InChI +print(is_valid_inchi(inchi)) # True if valid InChI print(is_valid_inchikey(inchikey)) # True if valid InChIKey @@ -21,6 +23,7 @@ class Curator: Additionally, it supports metadata validation to make sure the produced data are correct. """ + def curate_metadata(self, metadata_list): """ Iterates over given metadata and curates individual entries. @@ -40,8 +43,8 @@ def curate_casno(self, metadata): :param metadata: given metadata :return: curated metadata """ - if 'casno' in metadata: - metadata['casno'] = self.fix_cas_number(metadata['casno']) + if "casno" in metadata: + metadata["casno"] = self.fix_cas_number(metadata["casno"]) return metadata @staticmethod @@ -54,7 +57,7 @@ def fix_cas_number(cas_number): """ if isinstance(cas_number, str): if "-" not in cas_number: - return f'{cas_number[:-3]}-{cas_number[-3:-1]}-{cas_number[-1]}' + return f"{cas_number[:-3]}-{cas_number[-3:-1]}-{cas_number[-1]}" return cas_number @staticmethod @@ -68,20 +71,26 @@ def filter_invalid_metadata(metadata, log, job): :return: only valid metadata """ filters = { - 'smiles': is_valid_smiles, - 'canonical_smiles': is_valid_smiles, - 'isomeric_smiles': is_valid_smiles, - 'inchi': is_valid_inchi, - 'inchikey': is_valid_inchikey + "smiles": is_valid_smiles, + "canonical_smiles": is_valid_smiles, + "isomeric_smiles": is_valid_smiles, + "inchi": is_valid_inchi, + "inchikey": is_valid_inchikey, } valid_metadata = {} - for (attribute, value) in metadata.items(): + for attribute, value in metadata.items(): if attribute in filters.keys(): if filters[attribute](value): valid_metadata[attribute] = value else: - log.update(InvalidAttributeFormat(f'Obtained {attribute} in invalid format: {value}'), job, level=2) + log.update( + InvalidAttributeFormat( + f"Obtained {attribute} in invalid format: {value}" + ), + job, + level=2, + ) else: valid_metadata[attribute] = value return valid_metadata diff --git a/MSMetaEnhancer/libs/converters/compute/ComputeConverter.py b/MSMetaEnhancer/libs/converters/compute/ComputeConverter.py index fb15c22..9df0c2e 100644 --- a/MSMetaEnhancer/libs/converters/compute/ComputeConverter.py +++ b/MSMetaEnhancer/libs/converters/compute/ComputeConverter.py @@ -5,5 +5,6 @@ class ComputeConverter(Converter): """ General class for computation conversion. """ + async def convert(self, source, target, data): - return getattr(self, f'{source}_to_{target}')(data) + return getattr(self, f"{source}_to_{target}")(data) diff --git a/MSMetaEnhancer/libs/converters/compute/RDKit.py b/MSMetaEnhancer/libs/converters/compute/RDKit.py index 1762a02..e91bd50 100644 --- a/MSMetaEnhancer/libs/converters/compute/RDKit.py +++ b/MSMetaEnhancer/libs/converters/compute/RDKit.py @@ -13,12 +13,15 @@ class RDKit(ComputeConverter): """ RDKit is a collection of chemo-informatics and machine-learning software. """ + def __init__(self): super().__init__() # generate top level methods defining allowed conversions - conversions = [('smiles', 'mw', 'from_smiles'), - ('canonical_smiles', 'mw', 'from_smiles'), - ('isomeric_smiles', 'mw', 'from_smiles')] + conversions = [ + ("smiles", "mw", "from_smiles"), + ("canonical_smiles", "mw", "from_smiles"), + ("isomeric_smiles", "mw", "from_smiles"), + ] self.create_top_level_conversion_methods(conversions, asynch=False) def from_smiles(self, smiles): @@ -29,7 +32,7 @@ def from_smiles(self, smiles): :return: computed molecular weight """ weight = ExactMolWt(MolFromSmiles(smiles)) - return {'mw': weight} + return {"mw": weight} def inchi_to_canonical_smiles(self, inchi): """ @@ -39,7 +42,7 @@ def inchi_to_canonical_smiles(self, inchi): :return: computed canonical SMILES """ smiles = MolToSmiles(MolFromInchi(inchi), isomericSmiles=False) - return {'canonical_smiles': smiles} + return {"canonical_smiles": smiles} def inchi_to_isomeric_smiles(self, inchi): """ @@ -49,7 +52,7 @@ def inchi_to_isomeric_smiles(self, inchi): :return: computed isomeric SMILES """ smiles = MolToSmiles(MolFromInchi(inchi)) - return {'isomeric_smiles': smiles} + return {"isomeric_smiles": smiles} def formula_to_mw(self, formula): """ @@ -66,9 +69,13 @@ def formula_to_mw(self, formula): continue atom = Atom(parts[index]) - multiplier = int(parts[index + 1]) if len(parts) > index + 1 and parts[index + 1].isnumeric() else 1 + multiplier = ( + int(parts[index + 1]) + if len(parts) > index + 1 and parts[index + 1].isnumeric() + else 1 + ) mass += atom.GetMass() * multiplier - return {'mw': mass} + return {"mw": mass} def smiles_to_formula(self, smiles: str) -> dict: """ @@ -79,11 +86,11 @@ def smiles_to_formula(self, smiles: str) -> dict: """ mol = MolFromSmiles(smiles) if mol is None: - return {'formula': ''} + return {"formula": ""} formula = CalcMolFormula(mol) - return {'formula': formula} + return {"formula": formula} def inchi_to_formula(self, inchi: str) -> dict: """ @@ -94,6 +101,6 @@ def inchi_to_formula(self, inchi: str) -> dict: """ mol = MolFromInchi(inchi) if mol is None: - return {'formula': ''} + return {"formula": ""} formula = CalcMolFormula(mol) - return {'formula': formula} + return {"formula": formula} diff --git a/MSMetaEnhancer/libs/converters/compute/__init__.py b/MSMetaEnhancer/libs/converters/compute/__init__.py index defa371..acc60d5 100644 --- a/MSMetaEnhancer/libs/converters/compute/__init__.py +++ b/MSMetaEnhancer/libs/converters/compute/__init__.py @@ -1,3 +1,3 @@ from MSMetaEnhancer.libs.converters.compute.RDKit import RDKit -__all__ = ['RDKit'] +__all__ = ["RDKit"] diff --git a/MSMetaEnhancer/libs/converters/web/BridgeDb.py b/MSMetaEnhancer/libs/converters/web/BridgeDb.py index 6126917..e0e9b68 100644 --- a/MSMetaEnhancer/libs/converters/web/BridgeDb.py +++ b/MSMetaEnhancer/libs/converters/web/BridgeDb.py @@ -8,48 +8,62 @@ class BridgeDb(WebConverter): More info about the available conversions: https://bridgedb.github.io/ """ + def __init__(self, session): super().__init__(session) # service URLs - self.endpoints = {'BridgeDb': 'https://webservice.bridgedb.org/Human/xrefs/'} - - self.codes = {'hmdbid': 'Ch', 'pubchemid': 'Cpc', 'chemspiderid': 'Cs', 'wikidataid': 'Wd', 'chebiid': 'Ce', - 'keggid': 'Ck'} - self.identifiers = {'PubChem-compound': 'pubchemid', 'Chemspider': 'chemspiderid', 'ChEBI': 'chebiid', - 'HMDB': 'hmdbid', 'Wikidata': 'wikidataid', 'KEGG Compound': 'keggid'} + self.endpoints = {"BridgeDb": "https://webservice.bridgedb.org/Human/xrefs/"} + + self.codes = { + "hmdbid": "Ch", + "pubchemid": "Cpc", + "chemspiderid": "Cs", + "wikidataid": "Wd", + "chebiid": "Ce", + "keggid": "Ck", + } + self.identifiers = { + "PubChem-compound": "pubchemid", + "Chemspider": "chemspiderid", + "ChEBI": "chebiid", + "HMDB": "hmdbid", + "Wikidata": "wikidataid", + "KEGG Compound": "keggid", + } # generate top level methods defining allowed conversions - conversions = [('hmdbid', 'pubchemid', 'from_hmdbid'), - ('hmdbid', 'chemspiderid', 'from_hmdbid'), - ('hmdbid', 'wikidataid', 'from_hmdbid'), - ('hmdbid', 'chebiid', 'from_hmdbid'), - ('hmdbid', 'keggid', 'from_hmdbid'), - ('pubchemid', 'hmdbid', 'from_pubchemid'), - ('pubchemid', 'chemspiderid', 'from_pubchemid'), - ('pubchemid', 'wikidataid', 'from_pubchemid'), - ('pubchemid', 'chebiid', 'from_pubchemid'), - ('pubchemid', 'keggid', 'from_pubchemid'), - ('chemspiderid', 'hmdbid', 'from_chemspiderid'), - ('chemspiderid', 'pubchemid', 'from_chemspiderid'), - ('chemspiderid', 'wikidataid', 'from_chemspiderid'), - ('chemspiderid', 'chebiid', 'from_chemspiderid'), - ('chemspiderid', 'keggid', 'from_chemspiderid'), - ('wikidataid', 'hmdbid', 'from_wikidataid'), - ('wikidataid', 'pubchemid', 'from_wikidataid'), - ('wikidataid', 'chemspiderid', 'from_wikidataid'), - ('wikidataid', 'chebiid', 'from_wikidataid'), - ('wikidataid', 'keggid', 'from_wikidataid'), - ('chebiid', 'hmdbid', 'from_chebiid'), - ('chebiid', 'pubchemid', 'from_chebiid'), - ('chebiid', 'chemspiderid', 'from_chebiid'), - ('chebiid', 'wikidataid', 'from_chebiid'), - ('chebiid', 'keggid', 'from_chebiid'), - ('keggid', 'hmdbid', 'from_keggid'), - ('keggid', 'pubchemid', 'from_keggid'), - ('keggid', 'chemspiderid', 'from_keggid'), - ('keggid', 'wikidataid', 'from_keggid'), - ('keggid', 'chebiid', 'from_keggid'), - ] + conversions = [ + ("hmdbid", "pubchemid", "from_hmdbid"), + ("hmdbid", "chemspiderid", "from_hmdbid"), + ("hmdbid", "wikidataid", "from_hmdbid"), + ("hmdbid", "chebiid", "from_hmdbid"), + ("hmdbid", "keggid", "from_hmdbid"), + ("pubchemid", "hmdbid", "from_pubchemid"), + ("pubchemid", "chemspiderid", "from_pubchemid"), + ("pubchemid", "wikidataid", "from_pubchemid"), + ("pubchemid", "chebiid", "from_pubchemid"), + ("pubchemid", "keggid", "from_pubchemid"), + ("chemspiderid", "hmdbid", "from_chemspiderid"), + ("chemspiderid", "pubchemid", "from_chemspiderid"), + ("chemspiderid", "wikidataid", "from_chemspiderid"), + ("chemspiderid", "chebiid", "from_chemspiderid"), + ("chemspiderid", "keggid", "from_chemspiderid"), + ("wikidataid", "hmdbid", "from_wikidataid"), + ("wikidataid", "pubchemid", "from_wikidataid"), + ("wikidataid", "chemspiderid", "from_wikidataid"), + ("wikidataid", "chebiid", "from_wikidataid"), + ("wikidataid", "keggid", "from_wikidataid"), + ("chebiid", "hmdbid", "from_chebiid"), + ("chebiid", "pubchemid", "from_chebiid"), + ("chebiid", "chemspiderid", "from_chebiid"), + ("chebiid", "wikidataid", "from_chebiid"), + ("chebiid", "keggid", "from_chebiid"), + ("keggid", "hmdbid", "from_keggid"), + ("keggid", "pubchemid", "from_keggid"), + ("keggid", "chemspiderid", "from_keggid"), + ("keggid", "wikidataid", "from_keggid"), + ("keggid", "chebiid", "from_keggid"), + ] self.create_top_level_conversion_methods(conversions) async def from_hmdbid(self, hmdbid): @@ -59,7 +73,7 @@ async def from_hmdbid(self, hmdbid): :param hmdbid: given HMDB ID number :return: obtained IDs """ - args = f'{self.codes["hmdbid"]}/{hmdbid}' + args = f"{self.codes['hmdbid']}/{hmdbid}" return await self.call_service(args) async def from_pubchemid(self, pubchemid): @@ -69,7 +83,7 @@ async def from_pubchemid(self, pubchemid): :param pubchemid: given PubChem ID number :return: obtained IDs """ - args = f'{self.codes["pubchemid"]}/{pubchemid}' + args = f"{self.codes['pubchemid']}/{pubchemid}" return await self.call_service(args) async def from_chemspiderid(self, chemspiderid): @@ -79,7 +93,7 @@ async def from_chemspiderid(self, chemspiderid): :param chemspiderid: given ChemSpider ID number :return: obtained IDs """ - args = f'{self.codes["chemspiderid"]}/{chemspiderid}' + args = f"{self.codes['chemspiderid']}/{chemspiderid}" return await self.call_service(args) async def from_wikidataid(self, wikidataid): @@ -89,7 +103,7 @@ async def from_wikidataid(self, wikidataid): :param wikidataid: given WikiData ID number :return: obtained IDs """ - args = f'{self.codes["wikidataid"]}/{wikidataid}' + args = f"{self.codes['wikidataid']}/{wikidataid}" return await self.call_service(args) async def from_chebiid(self, chebiid): @@ -99,7 +113,7 @@ async def from_chebiid(self, chebiid): :param chebiid: given ChEBI ID number :return: obtained IDs """ - args = f'{self.codes["chebiid"]}/{chebiid}' + args = f"{self.codes['chebiid']}/{chebiid}" return await self.call_service(args) async def from_keggid(self, keggid): @@ -109,11 +123,11 @@ async def from_keggid(self, keggid): :param keggid: given KEGG ID number :return: obtained IDs """ - args = f'{self.codes["keggid"]}/{keggid}' + args = f"{self.codes['keggid']}/{keggid}" return await self.call_service(args) async def call_service(self, args): - response = await self.query_the_service('BridgeDb', args) + response = await self.query_the_service("BridgeDb", args) if response: return self.parse_attributes(response) @@ -126,10 +140,10 @@ def parse_attributes(self, response): """ result = dict() - lines = response.split('\n') + lines = response.split("\n") for line in lines: if line: - value, identifier = line.split('\t') + value, identifier = line.split("\t") if identifier in self.identifiers.keys(): result[self.identifiers[identifier]] = value return result diff --git a/MSMetaEnhancer/libs/converters/web/CIR.py b/MSMetaEnhancer/libs/converters/web/CIR.py index b898217..414dd02 100644 --- a/MSMetaEnhancer/libs/converters/web/CIR.py +++ b/MSMetaEnhancer/libs/converters/web/CIR.py @@ -10,10 +10,11 @@ class CIR(WebConverter): More info about the available conversions: https://cactus.nci.nih.gov/chemical/structure_documentation """ + def __init__(self, session): super().__init__(session) # service URLs - self.endpoints = {'CIR': 'https://cactus.nci.nih.gov/chemical/structure/'} + self.endpoints = {"CIR": "https://cactus.nci.nih.gov/chemical/structure/"} async def casno_to_smiles(self, cas_number): """ @@ -22,10 +23,10 @@ async def casno_to_smiles(self, cas_number): :param cas_number: given CAS number :return: obtained SMILES """ - args = f'{cas_number}/smiles?resolver=cas_number' - response = await self.query_the_service('CIR', args) + args = f"{cas_number}/smiles?resolver=cas_number" + response = await self.query_the_service("CIR", args) if response: - return {'smiles': self.retrieve_first(response)} + return {"smiles": self.retrieve_first(response)} async def inchikey_to_smiles(self, inchikey): """ @@ -34,10 +35,10 @@ async def inchikey_to_smiles(self, inchikey): :param inchikey: given InChiKey :return: obtained SMILES """ - args = f'{inchikey}/smiles' - response = await self.query_the_service('CIR', args) + args = f"{inchikey}/smiles" + response = await self.query_the_service("CIR", args) if response: - return {'smiles': self.retrieve_first(response)} + return {"smiles": self.retrieve_first(response)} async def inchikey_to_inchi(self, inchikey): """ @@ -46,10 +47,10 @@ async def inchikey_to_inchi(self, inchikey): :param inchikey: given InChiKey :return: obtained InCHi """ - args = f'{inchikey}/stdinchi' - response = await self.query_the_service('CIR', args) + args = f"{inchikey}/stdinchi" + response = await self.query_the_service("CIR", args) if response: - return {'inchi': self.retrieve_first(response)} + return {"inchi": self.retrieve_first(response)} async def inchikey_to_casno(self, inchikey): """ @@ -58,10 +59,10 @@ async def inchikey_to_casno(self, inchikey): :param inchikey: given InChiKey :return: obtained CAS number """ - args = f'{inchikey}/cas' - response = await self.query_the_service('CIR', args) + args = f"{inchikey}/cas" + response = await self.query_the_service("CIR", args) if response: - return {'casno': self.retrieve_first(response)} + return {"casno": self.retrieve_first(response)} async def inchikey_to_formula(self, inchikey): """ @@ -70,10 +71,10 @@ async def inchikey_to_formula(self, inchikey): :param inchikey: given InChiKey :return: obtained chemical formula """ - args = f'{inchikey}/formula' - response = await self.query_the_service('CIR', args) + args = f"{inchikey}/formula" + response = await self.query_the_service("CIR", args) if response: - return {'formula': self.retrieve_first(response)} + return {"formula": self.retrieve_first(response)} async def smiles_to_inchikey(self, smiles): """ @@ -82,10 +83,10 @@ async def smiles_to_inchikey(self, smiles): :param smiles: given SMILES :return: obtained InChiKey """ - args = f'{smiles}/stdinchikey' - response = await self.query_the_service('CIR', args) + args = f"{smiles}/stdinchikey" + response = await self.query_the_service("CIR", args) if response: - return {'inchikey': self.retrieve_first(response)[9:]} + return {"inchikey": self.retrieve_first(response)[9:]} async def inchi_to_smiles(self, inchi): """ @@ -94,10 +95,10 @@ async def inchi_to_smiles(self, inchi): :param inchi: given InChi :return: obtained SMILES """ - args = f'{inchi}/smiles' - response = await self.query_the_service('CIR', args) + args = f"{inchi}/smiles" + response = await self.query_the_service("CIR", args) if response: - return {'smiles': self.retrieve_first(response)} + return {"smiles": self.retrieve_first(response)} @staticmethod def retrieve_first(response): @@ -108,4 +109,4 @@ def retrieve_first(response): :param response: given response from CIR :return: only first hit """ - return response.split('\n')[0] + return response.split("\n")[0] diff --git a/MSMetaEnhancer/libs/converters/web/CTS.py b/MSMetaEnhancer/libs/converters/web/CTS.py index a83b167..f691db5 100644 --- a/MSMetaEnhancer/libs/converters/web/CTS.py +++ b/MSMetaEnhancer/libs/converters/web/CTS.py @@ -11,17 +11,21 @@ class CTS(WebConverter): More info about the available conversions: http://cts.fiehnlab.ucdavis.edu/services """ + def __init__(self, session): super().__init__(session) # service URLs - self.endpoints = {'CTS': 'https://cts.fiehnlab.ucdavis.edu/rest/convert/', - 'CTS_compound': 'http://cts.fiehnlab.ucdavis.edu/service/compound/' - } + self.endpoints = { + "CTS": "https://cts.fiehnlab.ucdavis.edu/rest/convert/", + "CTS_compound": "http://cts.fiehnlab.ucdavis.edu/service/compound/", + } # generate top level methods defining allowed conversions - conversions = [('inchikey', 'inchi', 'from_inchikey'), - ('inchikey', 'compound_name', 'from_inchikey'), - ('inchikey', 'iupac_name', 'from_inchikey')] + conversions = [ + ("inchikey", "inchi", "from_inchikey"), + ("inchikey", "compound_name", "from_inchikey"), + ("inchikey", "iupac_name", "from_inchikey"), + ] self.create_top_level_conversion_methods(conversions) # top level methods defining allowed conversions @@ -33,10 +37,10 @@ async def hmdbid_to_inchi(self, hmdbid): :param hmdbid: given HMDB ID :return: obtained InChi """ - args = f'Human%20Metabolome%20Database/InChI%20Code/{hmdbid}' - response = await self.query_the_service('CTS', args) + args = f"Human%20Metabolome%20Database/InChI%20Code/{hmdbid}" + response = await self.query_the_service("CTS", args) if response: - return self.parse_single_response(response, 'inchi') + return self.parse_single_response(response, "inchi") async def casno_to_inchi(self, cas_number): """ @@ -45,10 +49,10 @@ async def casno_to_inchi(self, cas_number): :param cas_number: given CAS number :return: obtained InChi """ - args = f'CAS/InChI%20Code/{cas_number}' - response = await self.query_the_service('CTS', args) + args = f"CAS/InChI%20Code/{cas_number}" + response = await self.query_the_service("CTS", args) if response: - return self.parse_single_response(response, 'inchi') + return self.parse_single_response(response, "inchi") async def casno_to_inchikey(self, cas_number): """ @@ -59,10 +63,10 @@ async def casno_to_inchikey(self, cas_number): :param cas_number: given CAS number :return: obtained InChiKey """ - args = f'CAS/InChIKey/{cas_number}' - response = await self.query_the_service('CTS', args) + args = f"CAS/InChIKey/{cas_number}" + response = await self.query_the_service("CTS", args) if response: - return self.parse_single_response(response, 'inchikey') + return self.parse_single_response(response, "inchikey") async def compound_name_to_inchikey(self, name): """ @@ -71,10 +75,10 @@ async def compound_name_to_inchikey(self, name): :param name: given Chemical name :return: obtained InChiKey """ - args = f'Chemical%20Name/InChIKey/{name}' - response = await self.query_the_service('CTS', args) + args = f"Chemical%20Name/InChIKey/{name}" + response = await self.query_the_service("CTS", args) if response: - return self.parse_single_response(response, 'inchikey') + return self.parse_single_response(response, "inchikey") async def from_inchikey(self, inchikey): """ @@ -84,7 +88,7 @@ async def from_inchikey(self, inchikey): :return: all found data """ args = inchikey - response = await self.query_the_service('CTS_compound', args) + response = await self.query_the_service("CTS_compound", args) if response: return self.parse_attributes(response) @@ -97,8 +101,8 @@ def parse_single_response(self, response, attribute): :return: parsed InChiKey """ response_json = json.loads(response) - if len(response_json[0]['results']) != 0: - return {attribute: response_json[0]['results'][0]} + if len(response_json[0]["results"]) != 0: + return {attribute: response_json[0]["results"][0]} def parse_attributes(self, response): """ @@ -110,20 +114,24 @@ def parse_attributes(self, response): response_json = json.loads(response) result = dict() - if 'inchicode' in response_json: - result['inchi'] = response_json['inchicode'] + if "inchicode" in response_json: + result["inchi"] = response_json["inchicode"] - if 'formula' in response_json: - result['formula'] = response_json['formula'] + if "formula" in response_json: + result["formula"] = response_json["formula"] - if 'synonyms' in response_json: - synonyms = response_json['synonyms'] + if "synonyms" in response_json: + synonyms = response_json["synonyms"] - names = [item['name'] for item in synonyms if item['type'] == 'Synonym'] + names = [item["name"] for item in synonyms if item["type"] == "Synonym"] if names: - result['compound_name'] = names[0] + result["compound_name"] = names[0] - names = [item['name'] for item in synonyms if item['type'] == 'IUPAC Name (Preferred)'] + names = [ + item["name"] + for item in synonyms + if item["type"] == "IUPAC Name (Preferred)" + ] if names: - result['iupac_name'] = names[0] + result["iupac_name"] = names[0] return result diff --git a/MSMetaEnhancer/libs/converters/web/IDSM.py b/MSMetaEnhancer/libs/converters/web/IDSM.py index 950608b..90d471d 100644 --- a/MSMetaEnhancer/libs/converters/web/IDSM.py +++ b/MSMetaEnhancer/libs/converters/web/IDSM.py @@ -14,31 +14,36 @@ class IDSM(WebConverter): IDSM service: https://idsm.elixir-czech.cz/ """ + def __init__(self, session): super().__init__(session) # service URLs - self.endpoints = {'IDSM': 'https://idsm.elixir-czech.cz/sparql/endpoint/idsm'} + self.endpoints = {"IDSM": "https://idsm.elixir-czech.cz/sparql/endpoint/idsm"} self.header = frozendict({"Accept": "application/sparql-results+json"}) - self.attributes = [{'code': 'inchi', 'label': 'CHEMINF_000396'}, - {'code': 'iupac_name', 'label': 'CHEMINF_000382'}, - {'code': 'inchikey', 'label': 'CHEMINF_000399'}, - {'code': 'formula', 'label': 'CHEMINF_000335'}, - {'code': 'canonical_smiles', 'label': 'CHEMINF_000376'}, - {'code': 'isomeric_smiles', 'label': 'CHEMINF_000379'}] + self.attributes = [ + {"code": "inchi", "label": "CHEMINF_000396"}, + {"code": "iupac_name", "label": "CHEMINF_000382"}, + {"code": "inchikey", "label": "CHEMINF_000399"}, + {"code": "formula", "label": "CHEMINF_000335"}, + {"code": "canonical_smiles", "label": "CHEMINF_000376"}, + {"code": "isomeric_smiles", "label": "CHEMINF_000379"}, + ] # generate top level methods defining allowed conversions - conversions = [('compound_name', 'inchi', 'from_name'), - ('compound_name', 'iupac_name', 'from_name'), - ('compound_name', 'inchikey', 'from_name'), - ('compound_name', 'formula', 'from_name'), - ('compound_name', 'canonical_smiles', 'from_name'), - ('compound_name', 'isomeric_smiles', 'from_name'), - ('inchi', 'iupac_name', 'from_inchi'), - ('inchi', 'inchikey', 'from_inchi'), - ('inchi', 'formula', 'from_inchi'), - ('inchi', 'canonical_smiles', 'from_inchi'), - ('inchi', 'isomeric_smiles', 'from_inchi')] + conversions = [ + ("compound_name", "inchi", "from_name"), + ("compound_name", "iupac_name", "from_name"), + ("compound_name", "inchikey", "from_name"), + ("compound_name", "formula", "from_name"), + ("compound_name", "canonical_smiles", "from_name"), + ("compound_name", "isomeric_smiles", "from_name"), + ("inchi", "iupac_name", "from_inchi"), + ("inchi", "inchikey", "from_inchi"), + ("inchi", "formula", "from_inchi"), + ("inchi", "canonical_smiles", "from_inchi"), + ("inchi", "isomeric_smiles", "from_inchi"), + ] self.create_top_level_conversion_methods(conversions) # used to limit the maximal number of simultaneous requests being processed @@ -137,7 +142,9 @@ async def call_service(self, query): """ data = frozendict({"query": query}) async with self.semaphore: - response = await self.query_the_service('IDSM', '', method='POST', data=data, headers=self.header) + response = await self.query_the_service( + "IDSM", "", method="POST", data=data, headers=self.header + ) if response: return self.parse_attributes(response) @@ -153,10 +160,10 @@ def parse_attributes(self, response): response_json = eval(response) result = dict() - for prop in response_json['results']['bindings']: - identifier = prop['type']['value'].rsplit('/', 1)[-1] - value = prop['value']['value'] + for prop in response_json["results"]["bindings"]: + identifier = prop["type"]["value"].rsplit("/", 1)[-1] + value = prop["value"]["value"] for att in self.attributes: - if identifier == att['label']: - result[att['code']] = value + if identifier == att["label"]: + result[att["code"]] = value return result diff --git a/MSMetaEnhancer/libs/converters/web/PubChem.py b/MSMetaEnhancer/libs/converters/web/PubChem.py index 52371ff..9af2016 100644 --- a/MSMetaEnhancer/libs/converters/web/PubChem.py +++ b/MSMetaEnhancer/libs/converters/web/PubChem.py @@ -14,44 +14,49 @@ class PubChem(WebConverter): PubChem service: https://pubchem.ncbi.nlm.nih.gov/ """ + def __init__(self, session): super().__init__(session) # service URLs - self.endpoints = {'PubChem': 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/'} - - self.attributes = [{'code': 'inchi', 'label': 'InChI', 'extra': None}, - {'code': 'inchikey', 'label': 'InChIKey', 'extra': None}, - {'code': 'iupac_name', 'label': 'IUPAC Name', 'extra': 'Preferred'}, - {'code': 'formula', 'label': 'Molecular Formula', 'extra': None}, - {'code': 'canonical_smiles', 'label': 'SMILES', 'extra': 'Canonical'}, - {'code': 'isomeric_smiles', 'label': 'SMILES', 'extra': 'Isomeric'}] + self.endpoints = { + "PubChem": "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/" + } + + self.attributes = [ + {"code": "inchi", "label": "InChI", "extra": None}, + {"code": "inchikey", "label": "InChIKey", "extra": None}, + {"code": "iupac_name", "label": "IUPAC Name", "extra": "Preferred"}, + {"code": "formula", "label": "Molecular Formula", "extra": None}, + {"code": "canonical_smiles", "label": "SMILES", "extra": "Canonical"}, + {"code": "isomeric_smiles", "label": "SMILES", "extra": "Isomeric"}, + ] # generate top level methods defining allowed conversions conversions = [ - ('compound_name', 'inchi', 'from_name'), - ('compound_name', 'inchikey', 'from_name'), - ('compound_name', 'iupac_name', 'from_name'), - ('compound_name', 'formula', 'from_name'), - ('compound_name', 'canonical_smiles', 'from_name'), - ('compound_name', 'isomeric_smiles', 'from_name'), - ('inchi', 'inchikey', 'from_inchi'), - ('inchi', 'iupac_name', 'from_inchi'), - ('inchi', 'formula', 'from_inchi'), - ('inchi', 'canonical_smiles', 'from_inchi'), - ('inchi', 'isomeric_smiles', 'from_inchi'), - ('inchi', 'pubchemid', 'from_inchi'), - ('inchikey', 'inchi', 'from_inchikey'), - ('inchikey', 'iupac_name', 'from_inchikey'), - ('inchikey', 'formula', 'from_inchikey'), - ('inchikey', 'canonical_smiles', 'from_inchikey'), - ('inchikey', 'isomeric_smiles', 'from_inchikey'), - ('inchikey', 'pubchemid', 'from_inchikey'), - ('pubchemid', 'inchi', 'from_pubchemid'), - ('pubchemid', 'iupac_name', 'from_pubchemid'), - ('pubchemid', 'formula', 'from_pubchemid'), - ('pubchemid', 'canonical_smiles', 'from_pubchemid'), - ('pubchemid', 'isomeric_smiles', 'from_pubchemid'), - ('pubchemid', 'inchikey', 'from_pubchemid') + ("compound_name", "inchi", "from_name"), + ("compound_name", "inchikey", "from_name"), + ("compound_name", "iupac_name", "from_name"), + ("compound_name", "formula", "from_name"), + ("compound_name", "canonical_smiles", "from_name"), + ("compound_name", "isomeric_smiles", "from_name"), + ("inchi", "inchikey", "from_inchi"), + ("inchi", "iupac_name", "from_inchi"), + ("inchi", "formula", "from_inchi"), + ("inchi", "canonical_smiles", "from_inchi"), + ("inchi", "isomeric_smiles", "from_inchi"), + ("inchi", "pubchemid", "from_inchi"), + ("inchikey", "inchi", "from_inchikey"), + ("inchikey", "iupac_name", "from_inchikey"), + ("inchikey", "formula", "from_inchikey"), + ("inchikey", "canonical_smiles", "from_inchikey"), + ("inchikey", "isomeric_smiles", "from_inchikey"), + ("inchikey", "pubchemid", "from_inchikey"), + ("pubchemid", "inchi", "from_pubchemid"), + ("pubchemid", "iupac_name", "from_pubchemid"), + ("pubchemid", "formula", "from_pubchemid"), + ("pubchemid", "canonical_smiles", "from_pubchemid"), + ("pubchemid", "isomeric_smiles", "from_pubchemid"), + ("pubchemid", "inchikey", "from_pubchemid"), ] self.create_top_level_conversion_methods(conversions) @@ -65,16 +70,16 @@ async def pubchemid_to_hmdbid(self, pubchemid): :param pubchemid: given Chemical name :return: all found data """ - args = f'cid/{pubchemid}/xrefs/RegistryID/JSON' + args = f"cid/{pubchemid}/xrefs/RegistryID/JSON" async with self.throttler: - response = await self.query_the_service('PubChem', args) + response = await self.query_the_service("PubChem", args) response_json = json.loads(response) - registry_ids = response_json['InformationList']['Information'][0]['RegistryID'] - hmdbids = [item for item in registry_ids if item.startswith('HMDB')] + registry_ids = response_json["InformationList"]["Information"][0]["RegistryID"] + hmdbids = [item for item in registry_ids if item.startswith("HMDB")] if len(hmdbids) != 0: - return {'hmdbid': hmdbids[0]} + return {"hmdbid": hmdbids[0]} return dict() async def from_pubchemid(self, pubchemid): @@ -85,8 +90,8 @@ async def from_pubchemid(self, pubchemid): :param pubchemid: given Chemical name :return: all found data """ - args = f'cid/{pubchemid}/JSON' - return await self.call_service(args, 'GET', None) + args = f"cid/{pubchemid}/JSON" + return await self.call_service(args, "GET", None) async def from_name(self, name): """ @@ -96,8 +101,8 @@ async def from_name(self, name): :param name: given Chemical name :return: all found data """ - args = f'name/{name}/JSON' - return await self.call_service(args, 'GET', None) + args = f"name/{name}/JSON" + return await self.call_service(args, "GET", None) async def from_inchi(self, inchi): """ @@ -108,7 +113,7 @@ async def from_inchi(self, inchi): :return: all found data """ args = "inchi/JSON" - return await self.call_service(args, 'POST', frozendict({'inchi': inchi})) + return await self.call_service(args, "POST", frozendict({"inchi": inchi})) async def from_inchikey(self, inchikey): """ @@ -119,7 +124,7 @@ async def from_inchikey(self, inchikey): :return: all found data """ args = "inchikey/JSON" - return await self.call_service(args, 'POST', frozendict({'inchikey': inchikey})) + return await self.call_service(args, "POST", frozendict({"inchikey": inchikey})) async def call_service(self, args, method, data): """ @@ -136,7 +141,9 @@ async def call_service(self, args, method, data): :return: obtained attributes """ async with self.throttler: - response = await self.query_the_service('PubChem', args, method=method, data=data) + response = await self.query_the_service( + "PubChem", args, method=method, data=data + ) if response: return self.parse_attributes(response) @@ -150,14 +157,18 @@ async def process_request(self, response, url, method): :return: processed response """ result = await response.text() - if 'X-Throttling-Control' in response.headers: - sleep_time = self.adjust_throttling(response.headers['X-Throttling-Control']) + if "X-Throttling-Control" in response.headers: + sleep_time = self.adjust_throttling( + response.headers["X-Throttling-Control"] + ) if sleep_time: await asyncio.sleep(sleep_time) if response.ok: return result else: - raise UnknownResponse(f'Unknown response {response.status}:{result} for {method} request on {url}.') + raise UnknownResponse( + f"Unknown response {response.status}:{result} for {method} request on {url}." + ) def adjust_throttling(self, throttling_header): """ @@ -167,9 +178,10 @@ def adjust_throttling(self, throttling_header): :param throttling_header: header containing current service load info """ + def parse_status(part): - value = part.split(': ')[1] - return int(value.split(' (')[1][:-2]) + value = part.split(": ")[1] + return int(value.split(" (")[1][:-2]) def parse_pubchem_info(header): """ @@ -181,24 +193,26 @@ def parse_pubchem_info(header): :param header: given PubChem header with Throttling info :return: most critical indicator value (maximum of three) with possible complete blacklist indicator """ - indicators = header.split(',') + indicators = header.split(",") blocked = False sleep_time = 0 - if 'too many requests per second or blacklisted' in indicators[-1]: + if "too many requests per second or blacklisted" in indicators[-1]: blocked = True - if 'Remaining blocking time' in indicators[-1]: - sleep_time = string_to_seconds(indicators[-1].split(': ')[1]) + if "Remaining blocking time" in indicators[-1]: + sleep_time = string_to_seconds(indicators[-1].split(": ")[1]) blocked = True - return {'load': max([parse_status(indicator) for indicator in indicators[:3]]), - 'blocked': blocked, - 'sleep_time': sleep_time} + return { + "load": max([parse_status(indicator) for indicator in indicators[:3]]), + "blocked": blocked, + "sleep_time": sleep_time, + } status = parse_pubchem_info(throttling_header) - if status['blocked'] or status['load'] > 75: + if status["blocked"] or status["load"] > 75: self.throttler.decrease_limit() - elif status['load'] < 25: + elif status["load"] < 25: self.throttler.increase_limit() - return status['sleep_time'] + return status["sleep_time"] def parse_attributes(self, response): """ @@ -212,21 +226,21 @@ def parse_attributes(self, response): response_json = json.loads(response) result = dict() - if 'PC_Compounds' in response_json: - if len(response_json['PC_Compounds']) > 0: - first_hit = response_json['PC_Compounds'][0] + if "PC_Compounds" in response_json: + if len(response_json["PC_Compounds"]) > 0: + first_hit = response_json["PC_Compounds"][0] - pubchemid = first_hit.get('id', {}).get('id', {}).get('cid', None) + pubchemid = first_hit.get("id", {}).get("id", {}).get("cid", None) if pubchemid: - result['pubchemid'] = pubchemid + result["pubchemid"] = pubchemid - for prop in first_hit.get('props', {}): - label = prop['urn']['label'] + for prop in first_hit.get("props", {}): + label = prop["urn"]["label"] for att in self.attributes: - if label == att['label']: - if att['extra']: - if prop['urn']['name'] == att['extra']: - result[att['code']] = prop['value']['sval'] + if label == att["label"]: + if att["extra"]: + if prop["urn"]["name"] == att["extra"]: + result[att["code"]] = prop["value"]["sval"] else: - result[att['code']] = prop['value']['sval'] + result[att["code"]] = prop["value"]["sval"] return result diff --git a/MSMetaEnhancer/libs/converters/web/WebConverter.py b/MSMetaEnhancer/libs/converters/web/WebConverter.py index d5736e2..5e04eb0 100644 --- a/MSMetaEnhancer/libs/converters/web/WebConverter.py +++ b/MSMetaEnhancer/libs/converters/web/WebConverter.py @@ -8,13 +8,18 @@ from aiocircuitbreaker import circuit from MSMetaEnhancer.libs.Converter import Converter -from MSMetaEnhancer.libs.utils.Errors import ServiceNotAvailable, UnknownResponse, TargetAttributeNotRetrieved +from MSMetaEnhancer.libs.utils.Errors import ( + ServiceNotAvailable, + UnknownResponse, + TargetAttributeNotRetrieved, +) class WebConverter(Converter): """ General class for web conversions. """ + FAILURE_THRESHOLD: int = 10 """Number of consecutive failures before circuit breaker is opened.""" @@ -42,14 +47,16 @@ async def convert(self, source: str, target: str, data: Union[str, int, float]): Returns: _type_: Data retrieved from the service. """ - result = await getattr(self, f'{source}_to_{target}')(data) + result = await getattr(self, f"{source}_to_{target}")(data) if result: return result else: - raise TargetAttributeNotRetrieved('No data retrieved.') + raise TargetAttributeNotRetrieved("No data retrieved.") @lru_cache - async def query_the_service(self, service: str, args: str, method: str = 'GET', data=None, headers=None) -> str: + async def query_the_service( + self, service: str, args: str, method: str = "GET", data=None, headers=None + ) -> str: """ Make get request to given converter with arguments. Raises ConnectionError if converter is not available. @@ -62,14 +69,20 @@ async def query_the_service(self, service: str, args: str, method: str = 'GET', :return: obtained response """ try: - result = await self.loop_request(self.endpoints[service] + args, method, data, headers) + result = await self.loop_request( + self.endpoints[service] + args, method, data, headers + ) return result except TypeError: - raise TypeError(f'Incorrect argument {args} for converter {service}.') - - @circuit(failure_threshold=FAILURE_THRESHOLD, - expected_exception=Union[TimeoutError, ServerDisconnectedError, ClientConnectorError].__args__, - fallback_function=ServiceNotAvailable.raise_circuitbreaker) + raise TypeError(f"Incorrect argument {args} for converter {service}.") + + @circuit( + failure_threshold=FAILURE_THRESHOLD, + expected_exception=Union[ + TimeoutError, ServerDisconnectedError, ClientConnectorError + ].__args__, + fallback_function=ServiceNotAvailable.raise_circuitbreaker, + ) async def make_request(self, url, method, data, headers): """ Enter a circuit breaker loop and execute request with type depending on specified method. @@ -82,7 +95,7 @@ async def make_request(self, url, method, data, headers): """ if headers is None: headers = {} - if method == 'GET': + if method == "GET": async with self.session.get(url, headers=headers) as response: return await self.process_request(response, url, method) else: @@ -90,7 +103,9 @@ async def make_request(self, url, method, data, headers): async with self.session.post(url, data=data, headers=headers) as response: return await self.process_request(response, url, method) - async def loop_request(self, url: str, method: str, data: Any, headers: dict) -> str: + async def loop_request( + self, url: str, method: str, data: Any, headers: dict + ) -> str: """ Execute request in a circuit breaker loop. If the request fails multiple times in a row, the circuit breaker is opened and ServiceNotAvailable exception is raised. @@ -106,7 +121,9 @@ async def loop_request(self, url: str, method: str, data: Any, headers: dict) -> except (ServerDisconnectedError, ClientConnectorError, TimeoutError): return await self.loop_request(url, method, data, headers) - async def process_request(self, response: aiohttp.ClientResponse, url: str, method: str) -> str: + async def process_request( + self, response: aiohttp.ClientResponse, url: str, method: str + ) -> str: """ Method to wrap response handling (same for POST and GET requests). @@ -119,4 +136,6 @@ async def process_request(self, response: aiohttp.ClientResponse, url: str, meth if response.ok: return result else: - raise UnknownResponse(f'Unknown response {response.status}:{result} for {method} request on {url}.') + raise UnknownResponse( + f"Unknown response {response.status}:{result} for {method} request on {url}." + ) diff --git a/MSMetaEnhancer/libs/converters/web/__init__.py b/MSMetaEnhancer/libs/converters/web/__init__.py index 30ea43a..76e1e31 100644 --- a/MSMetaEnhancer/libs/converters/web/__init__.py +++ b/MSMetaEnhancer/libs/converters/web/__init__.py @@ -4,4 +4,4 @@ from MSMetaEnhancer.libs.converters.web.PubChem import PubChem from MSMetaEnhancer.libs.converters.web.BridgeDb import BridgeDb -__all__ = ['IDSM', 'CTS', 'CIR', 'PubChem', 'BridgeDb'] +__all__ = ["IDSM", "CTS", "CIR", "PubChem", "BridgeDb"] diff --git a/MSMetaEnhancer/libs/data/Data.py b/MSMetaEnhancer/libs/data/Data.py index eff3671..8983287 100644 --- a/MSMetaEnhancer/libs/data/Data.py +++ b/MSMetaEnhancer/libs/data/Data.py @@ -6,6 +6,7 @@ class Data(ABC): """ General class for data. """ + @abstractmethod def get_metadata(self) -> List[Dict]: """ diff --git a/MSMetaEnhancer/libs/data/DataFrame.py b/MSMetaEnhancer/libs/data/DataFrame.py index f68fb5e..998e06f 100644 --- a/MSMetaEnhancer/libs/data/DataFrame.py +++ b/MSMetaEnhancer/libs/data/DataFrame.py @@ -18,14 +18,14 @@ def load_data(self, filename: str, file_format: str): :param filename: given file :param file_format: format of the input file """ - if file_format == 'csv': + if file_format == "csv": self.df = pandas.read_csv(filename, dtype=str) - elif file_format in ['tsv', 'tabular']: - self.df = pandas.read_csv(filename, dtype=str, sep='\t') - elif file_format == 'xlsx': + elif file_format in ["tsv", "tabular"]: + self.df = pandas.read_csv(filename, dtype=str, sep="\t") + elif file_format == "xlsx": self.df = pandas.read_excel(filename, dtype=str) else: - raise UnknownFileFormat(f'Format {file_format} not supported.') + raise UnknownFileFormat(f"Format {file_format} not supported.") def save_data(self, filename: str, file_format: str): """ @@ -36,18 +36,21 @@ def save_data(self, filename: str, file_format: str): :param filename: target file :param file_format: format of the output file """ - if file_format == 'csv': + if file_format == "csv": self.df.to_csv(filename, index=False) - elif file_format in ['tsv', 'tabular']: - self.df.to_csv(filename, index=False, sep='\t') - elif file_format == 'xlsx': + elif file_format in ["tsv", "tabular"]: + self.df.to_csv(filename, index=False, sep="\t") + elif file_format == "xlsx": self.df.to_excel(filename) else: - raise UnknownFileFormat(f'Format {file_format} not supported.') + raise UnknownFileFormat(f"Format {file_format} not supported.") def get_metadata(self): - records = self.df.to_dict('records') - return [{k: v for k, v in record.items() if not is_na_value(v)} for record in records] + records = self.df.to_dict("records") + return [ + {k: v for k, v in record.items() if not is_na_value(v)} + for record in records + ] def fuse_metadata(self, metadata_list): self.df = pandas.DataFrame.from_dict(metadata_list) diff --git a/MSMetaEnhancer/libs/data/Spectra.py b/MSMetaEnhancer/libs/data/Spectra.py index 6c7a152..856ce64 100644 --- a/MSMetaEnhancer/libs/data/Spectra.py +++ b/MSMetaEnhancer/libs/data/Spectra.py @@ -13,12 +13,18 @@ class Spectra(Data): Spectra class represents a single spectra dataset as a list. It is using `matchms` package to load and save MSP files. """ + def __init__(self): self.spectrums: List[Spectrum] = [] def __eq__(self, other): if len(self.spectrums) == len(other.spectrums): - return all([spectra_eq(self.spectrums[i], other.spectrums[i]) for i in range(len(self.spectrums))]) + return all( + [ + spectra_eq(self.spectrums[i], other.spectrums[i]) + for i in range(len(self.spectrums)) + ] + ) else: return False @@ -31,7 +37,9 @@ def load_data(self, filename: str, file_format: str): :param filename: given file :param file_format: format of the input file """ - self.spectrums = list(getattr(matchms.importing, f'load_from_{file_format}')(filename)) + self.spectrums = list( + getattr(matchms.importing, f"load_from_{file_format}")(filename) + ) def save_data(self, filename: str, file_format: str): """ @@ -44,13 +52,17 @@ def save_data(self, filename: str, file_format: str): :param file_format: format of the output file """ try: - getattr(matchms.exporting, f'save_as_{file_format}')(self.spectrums, filename) + getattr(matchms.exporting, f"save_as_{file_format}")( + self.spectrums, filename + ) except Exception: - raise UnknownFileFormat(f'Format {file_format} not supported.') + raise UnknownFileFormat(f"Format {file_format} not supported.") def get_metadata(self): - return [{k: v for k, v in spectra.metadata.items() if not is_na_value(v)} - for spectra in self.spectrums] + return [ + {k: v for k, v in spectra.metadata.items() if not is_na_value(v)} + for spectra in self.spectrums + ] def fuse_metadata(self, metadata): for i in range(len(metadata)): @@ -65,4 +77,8 @@ def spectra_eq(first: Spectrum, second: Spectrum): :param first: spectra object :param second: spectra object """ - return first.peaks == second.peaks and first.losses == second.losses and first.metadata == second.metadata + return ( + first.peaks == second.peaks + and first.losses == second.losses + and first.metadata == second.metadata + ) diff --git a/MSMetaEnhancer/libs/data/__init__.py b/MSMetaEnhancer/libs/data/__init__.py index 185a037..7dc89b8 100644 --- a/MSMetaEnhancer/libs/data/__init__.py +++ b/MSMetaEnhancer/libs/data/__init__.py @@ -1,4 +1,4 @@ from MSMetaEnhancer.libs.data.Spectra import Spectra from MSMetaEnhancer.libs.data.DataFrame import DataFrame -__all__ = ['Spectra', 'DataFrame'] +__all__ = ["Spectra", "DataFrame"] diff --git a/MSMetaEnhancer/libs/utils/ConverterBuilder.py b/MSMetaEnhancer/libs/utils/ConverterBuilder.py index 818c531..9871ed2 100644 --- a/MSMetaEnhancer/libs/utils/ConverterBuilder.py +++ b/MSMetaEnhancer/libs/utils/ConverterBuilder.py @@ -1,11 +1,16 @@ -from MSMetaEnhancer.libs.converters.web import __all__ as web_converters -from MSMetaEnhancer.libs.converters.web import * -from MSMetaEnhancer.libs.converters.compute import __all__ as compute_converters -from MSMetaEnhancer.libs.converters.compute import * +from MSMetaEnhancer.libs.converters.web.WebConverter import WebConverter +from MSMetaEnhancer.libs.converters.compute.ComputeConverter import ComputeConverter from MSMetaEnhancer.libs.utils.Errors import UnknownConverter class ConverterBuilder: + converters: dict[str, type] = {} + + @staticmethod + def register(converters: list[type]): + for converter in converters: + ConverterBuilder.converters[converter.__name__] = converter + @staticmethod def validate_converters(converters): """ @@ -15,14 +20,11 @@ def validate_converters(converters): :param converters: given list of converters names """ for converter in converters: - try: - eval(converter) - - except NameError: - raise UnknownConverter(f'Converter {converter} unknown.') + if ConverterBuilder.converters.get(converter) is None: + raise UnknownConverter(f"Converter {converter} unknown.") @staticmethod - def build_converters(session, converters: list): + def build_converters(session, converters: list[str]): """ Create provided converters. @@ -30,11 +32,12 @@ def build_converters(session, converters: list): :param converters: list of converters to be built :return: built converters """ - built_web_converters, built_converters = {}, {} + web_converters, compute_converters = {}, {} for converter in converters: - if converter in web_converters: - built_web_converters[converter] = eval(converter)(session) - elif converter in compute_converters: - built_converters[converter] = eval(converter)() - built_converters.update(built_web_converters) - return built_converters, built_web_converters + if issubclass(ConverterBuilder.converters[converter], WebConverter): + web_converters[converter] = ConverterBuilder.converters[converter]( + session + ) + elif issubclass(ConverterBuilder.converters[converter], ComputeConverter): + compute_converters[converter] = ConverterBuilder.converters[converter]() + return compute_converters, web_converters diff --git a/MSMetaEnhancer/libs/utils/Errors.py b/MSMetaEnhancer/libs/utils/Errors.py index ba18d79..57672fe 100644 --- a/MSMetaEnhancer/libs/utils/Errors.py +++ b/MSMetaEnhancer/libs/utils/Errors.py @@ -12,6 +12,7 @@ class UnknownConverter(Exception): class UnknownFileFormat(Exception): """Format not supported.""" + pass @@ -23,7 +24,7 @@ class ServiceNotAvailable(Exception): @staticmethod async def raise_circuitbreaker(*args): converter_name = args[0].converter_name - raise ServiceNotAvailable(f'Service {converter_name} not available.') + raise ServiceNotAvailable(f"Service {converter_name} not available.") class UnknownResponse(Exception): diff --git a/MSMetaEnhancer/libs/utils/Generic.py b/MSMetaEnhancer/libs/utils/Generic.py index c6fd0aa..0b177bf 100644 --- a/MSMetaEnhancer/libs/utils/Generic.py +++ b/MSMetaEnhancer/libs/utils/Generic.py @@ -1,7 +1,7 @@ import math -NA_STRING_VALUES = {'na', 'n/a', 'nan', 'none', ''} +NA_STRING_VALUES = {"na", "n/a", "nan", "none", ""} def is_na_value(value) -> bool: @@ -18,6 +18,7 @@ def is_na_value(value) -> bool: def escape_single_quotes(f): async def wrapper(self, arg): return await f(self, arg.replace("'", "\\'")) + return wrapper diff --git a/MSMetaEnhancer/libs/utils/Job.py b/MSMetaEnhancer/libs/utils/Job.py index fc96b5d..3838b87 100644 --- a/MSMetaEnhancer/libs/utils/Job.py +++ b/MSMetaEnhancer/libs/utils/Job.py @@ -2,8 +2,10 @@ from matchms import Metadata from MSMetaEnhancer.libs.Converter import Converter -from MSMetaEnhancer.libs.utils.Errors import (ConversionNotSupported, - SourceAttributeNotAvailable) +from MSMetaEnhancer.libs.utils.Errors import ( + ConversionNotSupported, + SourceAttributeNotAvailable, +) class Job: @@ -11,10 +13,10 @@ def __init__(self, data: Tuple[str, str, str]): self.source, self.target, self.converter = data def __str__(self): - return f'{self.converter}: {self.source} -> {self.target}' + return f"{self.converter}: {self.source} -> {self.target}" def __repr__(self): - return f'Job(({self.source}, {self.target}, {self.converter}))' + return f"Job(({self.source}, {self.target}, {self.converter}))" def validate(self, converters: dict, metadata: Metadata) -> Tuple[Converter, Any]: """ @@ -28,10 +30,14 @@ def validate(self, converters: dict, metadata: Metadata) -> Tuple[Converter, Any data = metadata.get(self.source, None) if converter is None: - raise ConversionNotSupported(f'Conversion ({self.converter}) {self.source} -> {self.target}: ' - f'is not supported') + raise ConversionNotSupported( + f"Conversion ({self.converter}) {self.source} -> {self.target}: " + f"is not supported" + ) elif data is None: - raise SourceAttributeNotAvailable(f'{self}:\n Attribute {self.source} missing in given metadata.') + raise SourceAttributeNotAvailable( + f"{self}:\n Attribute {self.source} missing in given metadata." + ) else: return converter, data diff --git a/MSMetaEnhancer/libs/utils/LogRecord.py b/MSMetaEnhancer/libs/utils/LogRecord.py index dcecd55..1808e9a 100644 --- a/MSMetaEnhancer/libs/utils/LogRecord.py +++ b/MSMetaEnhancer/libs/utils/LogRecord.py @@ -15,14 +15,14 @@ def format_log(self, level: str) -> str: Returns: str: Formatted log message """ - message = f'Issues related to metadata:\n\n{self.metadata}\n\n' - filtered_logs = [log['msg'] for log in self.logs if level >= log['level']] + message = f"Issues related to metadata:\n\n{self.metadata}\n\n" + filtered_logs = [log["msg"] for log in self.logs if level >= log["level"]] if filtered_logs: for log in filtered_logs: - message += f'{log}\n' + message += f"{log}\n" else: return None - return f'{message}\n' + return f"{message}\n" def update(self, exc: Exception, job: Job, level: str): """ @@ -32,4 +32,6 @@ def update(self, exc: Exception, job: Job, level: str): :param job: related job :param level: log level """ - self.logs.append({'level': level, 'msg': f'-> {type(exc).__name__} - {job}:\n{exc}'}) + self.logs.append( + {"level": level, "msg": f"-> {type(exc).__name__} - {job}:\n{exc}"} + ) diff --git a/MSMetaEnhancer/libs/utils/Logger.py b/MSMetaEnhancer/libs/utils/Logger.py index d34004d..00928bb 100644 --- a/MSMetaEnhancer/libs/utils/Logger.py +++ b/MSMetaEnhancer/libs/utils/Logger.py @@ -9,13 +9,13 @@ class Logger: def __init__(self): - self.logger = logging.getLogger('log') - self.logger.setLevel('INFO') + self.logger = logging.getLogger("log") + self.logger.setLevel("INFO") # statistical values self.metrics = Metrics() - self.LEVELS = {'error': 1, 'warning': 2, 'info': 3} + self.LEVELS = {"error": 1, "warning": 2, "info": 3} self.log_level = 3 @@ -36,12 +36,12 @@ def add_filehandler(self, file_name: str = None): file_name (str, optional): Log filename. Defaults to None. """ if file_name is None: - file_name = datetime.now().strftime('MSMetaEnhancer_%Y%m%d%H%M%S.log') + file_name = datetime.now().strftime("MSMetaEnhancer_%Y%m%d%H%M%S.log") - filehandler_dbg = logging.FileHandler(file_name, mode='w') - filehandler_dbg.setLevel('DEBUG') + filehandler_dbg = logging.FileHandler(file_name, mode="w") + filehandler_dbg.setLevel("DEBUG") - streamformatter = logging.Formatter(fmt='%(levelname)s: %(message)s') + streamformatter = logging.Formatter(fmt="%(levelname)s: %(message)s") # Apply formatters to handlers filehandler_dbg.setFormatter(streamformatter) diff --git a/MSMetaEnhancer/libs/utils/Metrics.py b/MSMetaEnhancer/libs/utils/Metrics.py index e5b4714..45b2bed 100644 --- a/MSMetaEnhancer/libs/utils/Metrics.py +++ b/MSMetaEnhancer/libs/utils/Metrics.py @@ -39,10 +39,16 @@ def update_after_annotation(self, metadata_keys): self.coverage_after_annotation[key] += 1 def __str__(self): - table = tabulate([[key, - f'{(self.coverage_before_annotation[key]/self.max_spectra)*100:.2f}%', - f'{(self.coverage_after_annotation[key]/self.max_spectra)*100:.2f}%'] - for key in self.coverage_before_annotation], - headers=['Target\nattribute', 'Coverage\nbefore', 'Coverage\nafter']) - - return f'\nAttribute discovery rates:\n\n{table}\n' + '='*50 + '\n' + table = tabulate( + [ + [ + key, + f"{(self.coverage_before_annotation[key] / self.max_spectra) * 100:.2f}%", + f"{(self.coverage_after_annotation[key] / self.max_spectra) * 100:.2f}%", + ] + for key in self.coverage_before_annotation + ], + headers=["Target\nattribute", "Coverage\nbefore", "Coverage\nafter"], + ) + + return f"\nAttribute discovery rates:\n\n{table}\n" + "=" * 50 + "\n" diff --git a/MSMetaEnhancer/libs/utils/Monitor.py b/MSMetaEnhancer/libs/utils/Monitor.py index 9431ca8..d4c5ec0 100644 --- a/MSMetaEnhancer/libs/utils/Monitor.py +++ b/MSMetaEnhancer/libs/utils/Monitor.py @@ -8,6 +8,7 @@ class Monitor(Thread): """ Class to periodically monitor status of used web. """ + def __init__(self): super(Monitor, self).__init__() self.converters = dict() @@ -39,7 +40,11 @@ def check_service(url): try: result = requests.get(url, timeout=5) return result.status_code == 200 - except (requests.exceptions.ConnectionError, TimeoutError, requests.exceptions.ReadTimeout): + except ( + requests.exceptions.ConnectionError, + TimeoutError, + requests.exceptions.ReadTimeout, + ): return False def run(self): diff --git a/MSMetaEnhancer/libs/utils/Throttler.py b/MSMetaEnhancer/libs/utils/Throttler.py index a611285..3b06bf6 100644 --- a/MSMetaEnhancer/libs/utils/Throttler.py +++ b/MSMetaEnhancer/libs/utils/Throttler.py @@ -8,6 +8,7 @@ class Throttler: """ Class to limit number of parallel requests by a rate (number per period of time). """ + def __init__(self, rate_limit=10, period=1, retry_interval=0.01): self.rate = rate_limit self.rate_limit = rate_limit diff --git a/docs/source/conf.py b/docs/source/conf.py index 8014eb3..3c53040 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -13,31 +13,32 @@ import os import sys -sys.path.insert(0, os.path.abspath('../../')) +sys.path.insert(0, os.path.abspath("../../")) from shutil import copyfile -copyfile('../../README.md', 'readme.md') -copyfile('../../CHANGELOG.md', 'CHANGELOG.md') -copyfile('../../CONTRIBUTING.md', 'CONTRIBUTING.md') + +copyfile("../../README.md", "readme.md") +copyfile("../../CHANGELOG.md", "CHANGELOG.md") +copyfile("../../CONTRIBUTING.md", "CONTRIBUTING.md") # -- Project information ----------------------------------------------------- -project = 'MSMetaEnhancer' -copyright = '2021, RECETOX' -author = 'RECETOX' +project = "MSMetaEnhancer" +copyright = "2021, RECETOX" +author = "RECETOX" # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = ['sphinx.ext.autodoc', 'myst_parser'] +extensions = ["sphinx.ext.autodoc", "myst_parser"] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -49,9 +50,9 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] diff --git a/galaxy/generate_options.py b/galaxy/generate_options.py index d9553a9..fe8d2ca 100644 --- a/galaxy/generate_options.py +++ b/galaxy/generate_options.py @@ -12,14 +12,18 @@ def generate_options(): jobs = [] converters = web_converters + compute_converters - built_converters, built_web_converters = ConverterBuilder().build_converters(None, converters) + built_converters, built_web_converters = ConverterBuilder().build_converters( + None, converters + ) for converter in built_converters: - jobs += (built_converters[converter].get_conversion_functions()) + jobs += built_converters[converter].get_conversion_functions() for job in jobs: - print(f'') + print( + f'' + ) -if __name__ == '__main__': +if __name__ == "__main__": generate_options() diff --git a/tests/__init__.py b/tests/__init__.py index ac023ca..d4b517c 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,7 +1,7 @@ import asyncio import sys -if sys.platform == 'win32': +if sys.platform == "win32": # Set the policy to prevent "Event loop is closed" error on Windows - https://github.com/encode/httpx/issues/914 # See https://stackoverflow.com/questions/63860576/asyncio-event-loop-is-closed-when-using-asyncio-run asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) diff --git a/tests/test_BridgeDB.py b/tests/test_BridgeDB.py index 565ddae..a6c2c69 100644 --- a/tests/test_BridgeDB.py +++ b/tests/test_BridgeDB.py @@ -5,25 +5,27 @@ from tests.utils import wrap_with_session -HMDBID = 'HMDB0000001' +HMDBID = "HMDB0000001" @pytest.mark.dependency() def test_service_available(): - asyncio.run(wrap_with_session(BridgeDb, 'hmdbid_to_pubchemid', ['HMDB0000001'])) + asyncio.run(wrap_with_session(BridgeDb, "hmdbid_to_pubchemid", ["HMDB0000001"])) @pytest.mark.dependency(depends=["test_service_available"]) def test_format(): - args = f'Ch/{HMDBID}' - response = asyncio.run(wrap_with_session(BridgeDb, 'query_the_service', ['BridgeDb', args])) + args = f"Ch/{HMDBID}" + response = asyncio.run( + wrap_with_session(BridgeDb, "query_the_service", ["BridgeDb", args]) + ) assert isinstance(response, str) - lines = response.split('\n') + lines = response.split("\n") assert len(lines) != 0 - assert '\t' in response + assert "\t" in response def test_get_conversions(): jobs = BridgeDb(None).get_conversion_functions() - assert ('wikidataid', 'pubchemid', 'BridgeDb') in jobs + assert ("wikidataid", "pubchemid", "BridgeDb") in jobs diff --git a/tests/test_CIR.py b/tests/test_CIR.py index 564769d..63138bc 100644 --- a/tests/test_CIR.py +++ b/tests/test_CIR.py @@ -7,14 +7,14 @@ @pytest.mark.dependency() def test_service_available(): - asyncio.run(wrap_with_session(CIR, 'casno_to_smiles', ['7783-89-3'])) + asyncio.run(wrap_with_session(CIR, "casno_to_smiles", ["7783-89-3"])) @pytest.mark.dependency(depends=["test_service_available"]) def test_format(): - casno = '7783-89-3' - args = '{}/smiles?resolver=cas_number'.format(casno) - response = asyncio.run(wrap_with_session(CIR, 'query_the_service', ['CIR', args])) + casno = "7783-89-3" + args = "{}/smiles?resolver=cas_number".format(casno) + response = asyncio.run(wrap_with_session(CIR, "query_the_service", ["CIR", args])) assert isinstance(response, str) diff --git a/tests/test_CTS.py b/tests/test_CTS.py index ceada45..d4d4e07 100644 --- a/tests/test_CTS.py +++ b/tests/test_CTS.py @@ -8,22 +8,19 @@ @pytest.mark.dependency() def test_service_available(): - asyncio.run(wrap_with_session(CTS, 'casno_to_inchikey', ['7783-89-3'])) + asyncio.run(wrap_with_session(CTS, "casno_to_inchikey", ["7783-89-3"])) @pytest.mark.dependency(depends=["test_service_available"]) -@pytest.mark.parametrize('value, size', [ - ['7783-89-3', 1], - ['7783893', 0] -]) +@pytest.mark.parametrize("value, size", [["7783-89-3", 1], ["7783893", 0]]) def test_format(value, size): - args = 'CAS/InChIKey/{}'.format(value) - response = asyncio.run(wrap_with_session(CTS, 'query_the_service', ['CTS', args])) + args = "CAS/InChIKey/{}".format(value) + response = asyncio.run(wrap_with_session(CTS, "query_the_service", ["CTS", args])) response_json = json.loads(response) assert isinstance(response_json, list) assert len(response_json) == 1 - assert 'results' in response_json[0] - assert len(response_json[0]['results']) == size + assert "results" in response_json[0] + assert len(response_json[0]["results"]) == size def test_get_conversions(): diff --git a/tests/test_IDSM.py b/tests/test_IDSM.py index 25c6d95..fab8db3 100644 --- a/tests/test_IDSM.py +++ b/tests/test_IDSM.py @@ -8,12 +8,12 @@ from tests.utils import wrap_with_session -INCHI = 'InChI=1S/C11H8FNO3/c1-13-6-9(10(14)16-11(13)15)7-2-4-8(12)5-3-7/h2-6H,1H3' +INCHI = "InChI=1S/C11H8FNO3/c1-13-6-9(10(14)16-11(13)15)7-2-4-8(12)5-3-7/h2-6H,1H3" @pytest.mark.dependency() def test_service_available(): - asyncio.run(wrap_with_session(IDSM, 'inchi_to_inchikey', [INCHI])) + asyncio.run(wrap_with_session(IDSM, "inchi_to_inchikey", [INCHI])) @pytest.mark.dependency(depends=["test_service_available"]) @@ -33,17 +33,29 @@ def test_format(): """ data = frozendict({"query": query}) - response = asyncio.run(wrap_with_session(IDSM, 'query_the_service', - ['IDSM', '', 'POST', frozendict(data), - frozendict({"Accept": "application/sparql-results+json"})])) + response = asyncio.run( + wrap_with_session( + IDSM, + "query_the_service", + [ + "IDSM", + "", + "POST", + frozendict(data), + frozendict({"Accept": "application/sparql-results+json"}), + ], + ) + ) try: response_json = json.loads(response) # Safely parse JSON except json.JSONDecodeError as e: pytest.fail(f"Failed to decode JSON response: {e}") - assert 'results' in response_json, "Key 'results' not found in response" - assert 'bindings' in response_json['results'], "Key 'bindings' not found in 'results'" - assert len(response_json['results']['bindings']) > 1 + assert "results" in response_json, "Key 'results' not found in response" + assert "bindings" in response_json["results"], ( + "Key 'bindings' not found in 'results'" + ) + assert len(response_json["results"]["bindings"]) > 1 def test_get_conversions(): diff --git a/tests/test_PubChem.py b/tests/test_PubChem.py index f362328..bb9bbd5 100644 --- a/tests/test_PubChem.py +++ b/tests/test_PubChem.py @@ -8,25 +8,32 @@ from tests.utils import wrap_with_session -INCHI = 'InChI=1S/C11H8FNO3/c1-13-6-9(10(14)16-11(13)15)7-2-4-8(12)5-3-7/h2-6H,1H3' +INCHI = "InChI=1S/C11H8FNO3/c1-13-6-9(10(14)16-11(13)15)7-2-4-8(12)5-3-7/h2-6H,1H3" @pytest.mark.dependency() def test_service_available(): - asyncio.run(wrap_with_session(PubChem, 'inchi_to_inchikey', [INCHI])) + asyncio.run(wrap_with_session(PubChem, "inchi_to_inchikey", [INCHI])) @pytest.mark.dependency(depends=["test_service_available"]) def test_format(): - inchi = 'InChI=1S/C9H10O4/c10-7-3-1-6(2-4-7)5-8(11)9(12)13/h1-4,8,10-11H,5H2,(H,12,13)' - data = frozendict({'inchi': inchi}) - - response = asyncio.run(wrap_with_session(PubChem, 'query_the_service', - ['PubChem', 'inchi/JSON', 'POST', frozendict(data)])) + inchi = ( + "InChI=1S/C9H10O4/c10-7-3-1-6(2-4-7)5-8(11)9(12)13/h1-4,8,10-11H,5H2,(H,12,13)" + ) + data = frozendict({"inchi": inchi}) + + response = asyncio.run( + wrap_with_session( + PubChem, + "query_the_service", + ["PubChem", "inchi/JSON", "POST", frozendict(data)], + ) + ) response_json = json.loads(response) - assert 'PC_Compounds' in response_json - assert len(response_json['PC_Compounds']) > 0 - assert 'props' in response_json['PC_Compounds'][0] + assert "PC_Compounds" in response_json + assert len(response_json["PC_Compounds"]) > 0 + assert "props" in response_json["PC_Compounds"][0] def test_get_conversions(): @@ -35,15 +42,31 @@ def test_get_conversions(): jobs = PubChem(None).get_conversion_functions() loop.close() - assert ('inchi', 'iupac_name', 'PubChem') in jobs - - -@pytest.mark.parametrize('response, expected', [ - [{"PC_Compounds": [{"id": {"id": {"cid": "123"}}, - "props": [{"urn": {"label": "InChI"}, "value": {"sval": "random_inchi"}}]}]}, - {"pubchemid": "123", "inchi": "random_inchi"}], - [{"PC_Compounds": [{"id": {}, "props": []}]}, dict()] -]) + assert ("inchi", "iupac_name", "PubChem") in jobs + + +@pytest.mark.parametrize( + "response, expected", + [ + [ + { + "PC_Compounds": [ + { + "id": {"id": {"cid": "123"}}, + "props": [ + { + "urn": {"label": "InChI"}, + "value": {"sval": "random_inchi"}, + } + ], + } + ] + }, + {"pubchemid": "123", "inchi": "random_inchi"}, + ], + [{"PC_Compounds": [{"id": {}, "props": []}]}, dict()], + ], +) def test_parse_attributes(response, expected): actual = PubChem(None).parse_attributes(json.dumps(response)) assert actual == expected @@ -53,5 +76,5 @@ def test_convert_inchikey_to_inchi(): inchikey = "OHCNQFYTLLGNOE-UHFFFAOYSA-N" expected = "InChI=1S/C5H13NSi/c1-7(2,3)6-4-5-6/h4-5H2,1-3H3" - actual = asyncio.run(wrap_with_session(PubChem, 'inchikey_to_inchi', [inchikey])) - assert actual['inchi'] == expected + actual = asyncio.run(wrap_with_session(PubChem, "inchikey_to_inchi", [inchikey])) + assert actual["inchi"] == expected diff --git a/tests/test_annotator.py b/tests/test_annotator.py index 059ba10..1803627 100644 --- a/tests/test_annotator.py +++ b/tests/test_annotator.py @@ -7,16 +7,31 @@ from MSMetaEnhancer.libs.utils.Job import Job -@pytest.mark.parametrize('metadata, expected, repeat, mocked', [ - [{'compound_name': '$NAME'}, {'compound_name': '$NAME', 'inchi': '$InChi'}, False, - [({'compound_name': '$NAME', 'inchi': '$InChi'}, None)]], - [{'compound_name': '$NAME'}, {'compound_name': '$NAME', 'inchi': '$InChi', 'smiles': '$SMILES'}, True, - [({'compound_name': '$NAME', 'inchi': '$InChi'}, None), - ({'compound_name': '$NAME', 'inchi': '$InChi', 'smiles': '$SMILES'}, None)]] -]) +@pytest.mark.parametrize( + "metadata, expected, repeat, mocked", + [ + [ + {"compound_name": "$NAME"}, + {"compound_name": "$NAME", "inchi": "$InChi"}, + False, + [({"compound_name": "$NAME", "inchi": "$InChi"}, None)], + ], + [ + {"compound_name": "$NAME"}, + {"compound_name": "$NAME", "inchi": "$InChi", "smiles": "$SMILES"}, + True, + [ + ({"compound_name": "$NAME", "inchi": "$InChi"}, None), + ( + {"compound_name": "$NAME", "inchi": "$InChi", "smiles": "$SMILES"}, + None, + ), + ], + ], + ], +) def test_annotate(metadata, expected, repeat, mocked): - jobs = [Job(('inchi', 'smiles', 'IDSM')), - Job(('name', 'inchi', 'IDSM'))] + jobs = [Job(("inchi", "smiles", "IDSM")), Job(("name", "inchi", "IDSM"))] annotator = Annotator() annotator.set_converters(dict()) @@ -34,32 +49,36 @@ def test_execute_job_with_cache(): curator.filter_invalid_metadata = mock.MagicMock(side_effect=lambda a, b, c: a) idsm = mock.Mock() - idsm.convert = mock.AsyncMock(return_value={'smiles': '$SMILES'}) + idsm.convert = mock.AsyncMock(return_value={"smiles": "$SMILES"}) - job = Job(('inchi', 'smiles', 'IDSM')) + job = Job(("inchi", "smiles", "IDSM")) job.validate = mock.Mock(return_value=(idsm, None)) annotator = Annotator() - annotator.set_converters({'IDSM': idsm}) + annotator.set_converters({"IDSM": idsm}) annotator.curator = curator - metadata, cache = asyncio.run(annotator.execute_job_with_cache(job, {'inchi': '$InChi'}, dict(), warning)) - assert metadata == {'inchi': '$InChi', 'smiles': '$SMILES'} + metadata, cache = asyncio.run( + annotator.execute_job_with_cache(job, {"inchi": "$InChi"}, dict(), warning) + ) + assert metadata == {"inchi": "$InChi", "smiles": "$SMILES"} # already cached cts = mock.Mock() cts.convert = mock.AsyncMock(return_value=dict()) - job = Job(('smiles', 'formula', 'CTS')) + job = Job(("smiles", "formula", "CTS")) job.validate = mock.Mock(return_value=(cts, None)) - cache = {job.converter: {'formula': '$FORMULA'}} + cache = {job.converter: {"formula": "$FORMULA"}} annotator = Annotator() - annotator.set_converters({'CTS': cts}) + annotator.set_converters({"CTS": cts}) annotator.curator = curator - metadata, cache = asyncio.run(annotator.execute_job_with_cache(job, {'smiles': '$SMILES'}, cache, warning)) - assert metadata == {'smiles': '$SMILES', 'formula': '$FORMULA'} + metadata, cache = asyncio.run( + annotator.execute_job_with_cache(job, {"smiles": "$SMILES"}, cache, warning) + ) + assert metadata == {"smiles": "$SMILES", "formula": "$FORMULA"} # no data retrieved @@ -67,22 +86,33 @@ def test_execute_job_with_cache(): cir.convert = mock.AsyncMock(return_value=dict()) annotator = Annotator() - annotator.set_converters({'CIR': cir}) + annotator.set_converters({"CIR": cir}) annotator.curator = curator with pytest.raises(TargetAttributeNotRetrieved): - metadata, cache = asyncio.run(annotator.execute_job_with_cache(job, {'smiles': '$SMILES'}, dict(), warning)) + metadata, cache = asyncio.run( + annotator.execute_job_with_cache( + job, {"smiles": "$SMILES"}, dict(), warning + ) + ) def test_catch_exception(): - metadata = {'inchi': 'a value', 'compound_name': 'a molecule'} - result_metadata = {'inchi': 'a value', 'compound_name': 'a molecule', 'atr1': 'val1', 'atr2': 'val2'} - jobs = [mock.Mock(target='a target')] * 3 + metadata = {"inchi": "a value", "compound_name": "a molecule"} + result_metadata = { + "inchi": "a value", + "compound_name": "a molecule", + "atr1": "val1", + "atr2": "val2", + } + jobs = [mock.Mock(target="a target")] * 3 annotator = Annotator() annotator.set_converters(dict()) - mocked = [({'inchi': 'a value', 'compound_name': 'a molecule', 'atr1': 'val1'}, dict()), - Exception(), - (result_metadata, dict())] + mocked = [ + ({"inchi": "a value", "compound_name": "a molecule", "atr1": "val1"}, dict()), + Exception(), + (result_metadata, dict()), + ] annotator.execute_job_with_cache = mock.AsyncMock() annotator.execute_job_with_cache.side_effect = mocked diff --git a/tests/test_application.py b/tests/test_application.py index 8bcf00c..16bb78a 100644 --- a/tests/test_application.py +++ b/tests/test_application.py @@ -2,6 +2,8 @@ import pytest from MSMetaEnhancer import Application +from MSMetaEnhancer.libs.converters.web import IDSM, PubChem +from MSMetaEnhancer.libs.utils.ConverterBuilder import ConverterBuilder from tests.utils import FakeMonitor, FakeAnnotator from MSMetaEnhancer.libs.utils.Generic import is_na_value @@ -11,7 +13,7 @@ def test_annotate_spectra_monitor_stops(): monitor = FakeMonitor() annotator = FakeAnnotator() - app.load_data('tests/test_data/sample.msp', file_format='msp') + app.load_data("tests/test_data/sample.msp", file_format="msp") asyncio.run(app.annotate_spectra([], monitor=monitor, annotator=annotator)) assert monitor.stop_request.is_set() @@ -22,19 +24,19 @@ def test_annotate_spectra_monitor_stops_after_exception(): monitor = FakeMonitor() annotator = FakeAnnotator(True) - app.load_data('tests/test_data/sample.msp', file_format='msp') + app.load_data("tests/test_data/sample.msp", file_format="msp") with pytest.raises(Exception): - asyncio.run(app.annotate_spectra([], monitor=monitor, annotator=annotator)) + asyncio.run(app.annotate_spectra({}, monitor=monitor, annotator=annotator)) assert monitor.stop_request.is_set() def test_application_sparse(): + ConverterBuilder.register([PubChem, IDSM]) app = Application() - app.load_data('tests/test_data/sparse.tsv', file_format='tabular') - asyncio.run(app.annotate_spectra(['PubChem', 'IDSM'])) - - actual = [x.get('canonical_smiles') for x in app.data.get_metadata()] - assert any([is_na_value(x) for x in actual]) == False + app.load_data("tests/test_data/sparse.tsv", file_format="tabular") + asyncio.run(app.annotate_spectra(["PubChem", "IDSM"])) + actual = [x.get("canonical_smiles") for x in app.data.get_metadata()] + assert not any([is_na_value(x) for x in actual]) diff --git a/tests/test_converter.py b/tests/test_converter.py index a6a46e7..1f6c581 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -8,45 +8,49 @@ from asyncio.exceptions import TimeoutError from MSMetaEnhancer.libs.converters.web.WebConverter import WebConverter -from MSMetaEnhancer.libs.utils.Errors import TargetAttributeNotRetrieved, UnknownResponse, ServiceNotAvailable +from MSMetaEnhancer.libs.utils.Errors import ( + TargetAttributeNotRetrieved, + UnknownResponse, + ServiceNotAvailable, +) def test_query_the_service(): converter = WebConverter(mock.Mock()) - converter.endpoints = {'CTS': 'what a converter'} - converter.loop_request = mock.AsyncMock(return_value={'smiles': '$SMILES'}) + converter.endpoints = {"CTS": "what a converter"} + converter.loop_request = mock.AsyncMock(return_value={"smiles": "$SMILES"}) - result = asyncio.run(converter.query_the_service('CTS', 'arg')) - assert result == {'smiles': '$SMILES'} + result = asyncio.run(converter.query_the_service("CTS", "arg")) + assert result == {"smiles": "$SMILES"} converter.loop_request.assert_called() # test wrong arg type with pytest.raises(TypeError): - _ = asyncio.run(converter.query_the_service('CTS', 10)) + _ = asyncio.run(converter.query_the_service("CTS", 10)) # test lru_cache converter.executed = False converter.loop_request = mock.AsyncMock() - result = asyncio.run(converter.query_the_service('CTS', 'arg')) - assert result == {'smiles': '$SMILES'} + result = asyncio.run(converter.query_the_service("CTS", "arg")) + assert result == {"smiles": "$SMILES"} converter.loop_request.assert_not_called() async def test_loop_request(aiohttp_client): - response = {'smiles': '$SMILES'} + response = {"smiles": "$SMILES"} async def fake_request(): return web.Response(body=response) app = web.Application() - app.router.add_route('GET', '/', fake_request) + app.router.add_route("GET", "/", fake_request) session = await aiohttp_client(app) converter = WebConverter(session) converter.process_request = mock.AsyncMock(return_value=response) - result = await converter.loop_request('/', 'GET', None, None) + result = await converter.loop_request("/", "GET", None, None) assert result == response @@ -55,16 +59,22 @@ async def fake_request(): raise ServerDisconnectedError() app = web.Application() - app.router.add_route('GET', '/', fake_request) + app.router.add_route("GET", "/", fake_request) session = await aiohttp_client(app) converter = WebConverter(session) with pytest.raises(UnknownResponse): - await converter.loop_request('/', 'GET', None, None) + await converter.loop_request("/", "GET", None, None) -@pytest.fixture(params=[TimeoutError, ServerDisconnectedError, ClientConnectorError(None, OSError())]) +@pytest.fixture( + params=[ + TimeoutError, + ServerDisconnectedError, + ClientConnectorError(None, OSError()), + ] +) def failing_session_mock(request): session = mock.AsyncMock() session.get = mock.Mock(side_effect=request.param) @@ -76,15 +86,15 @@ async def test_loop_request_circuit_breaker_get(failing_session_mock): converter = WebConverter(failing_session_mock) with pytest.raises(ServiceNotAvailable): - await converter.loop_request('/', 'GET', None, None) + await converter.loop_request("/", "GET", None, None) async def test_loop_request_circuit_breaker_post(failing_session_mock): converter = WebConverter(failing_session_mock) - data = {'inchi': 'inchi'} + data = {"inchi": "inchi"} with pytest.raises(ServiceNotAvailable): - await converter.loop_request('/', 'POST', data, None) + await converter.loop_request("/", "POST", data, None) def test_process_request(): @@ -93,44 +103,41 @@ def test_process_request(): response = mock.AsyncMock() response.status = 200 - response.text = mock.AsyncMock(return_value='this is response') + response.text = mock.AsyncMock(return_value="this is response") response.ok = True - result = asyncio.run(converter.process_request(response, '/', 'GET')) - assert result == 'this is response' + result = asyncio.run(converter.process_request(response, "/", "GET")) + assert result == "this is response" -@pytest.mark.parametrize('ok, status', [ - [False, 500], - [False, 503] -]) +@pytest.mark.parametrize("ok, status", [[False, 500], [False, 503]]) def test_process_request_exception(ok, status): converter = WebConverter(mock.Mock()) converter.loop_request = mock.AsyncMock(return_value=None) response = mock.AsyncMock() response.status = status - response.text = mock.AsyncMock(return_value='this is response') + response.text = mock.AsyncMock(return_value="this is response") response.ok = ok with pytest.raises(UnknownResponse): - asyncio.run(converter.process_request(response, '/', 'GET')) + asyncio.run(converter.process_request(response, "/", "GET")) def test_convert(): converter = WebConverter(mock.Mock()) converter.A_to_B = mock.AsyncMock() - converter.A_to_B.side_effect = ['value'] + converter.A_to_B.side_effect = ["value"] - result = asyncio.run(converter.convert('A', 'B', None)) - assert result == 'value' + result = asyncio.run(converter.convert("A", "B", None)) + assert result == "value" converter.A_to_B.side_effect = [None] with pytest.raises(TargetAttributeNotRetrieved): - _ = asyncio.run(converter.convert('A', 'B', None)) + _ = asyncio.run(converter.convert("A", "B", None)) with pytest.raises(AttributeError): - _ = asyncio.run(converter.convert('B', 'C', None)) + _ = asyncio.run(converter.convert("B", "C", None)) async def test_lru_cache(aiohttp_client): @@ -138,16 +145,16 @@ async def test_lru_cache(aiohttp_client): session = await aiohttp_client(app) converter = WebConverter(session) - converter.endpoints = {'/': '/'} + converter.endpoints = {"/": "/"} converter.loop_request = mock.AsyncMock(return_value=(1, 2, 3)) converter.query_the_service.cache_clear() - _ = await converter.query_the_service('/', '') + _ = await converter.query_the_service("/", "") assert converter.query_the_service.cache_info().hits == 0 - _ = await converter.query_the_service('/', '') + _ = await converter.query_the_service("/", "") assert converter.query_the_service.cache_info().hits == 1 - _ = await converter.query_the_service('/', '') + _ = await converter.query_the_service("/", "") assert converter.query_the_service.cache_info().hits == 2 diff --git a/tests/test_curator.py b/tests/test_curator.py index 898dfe5..311f80f 100644 --- a/tests/test_curator.py +++ b/tests/test_curator.py @@ -7,17 +7,20 @@ def test_fix_cas_number(): curator = Curator() - assert curator.fix_cas_number('7783893') == '7783-89-3' - assert curator.fix_cas_number('7783-89-3') == '7783-89-3' + assert curator.fix_cas_number("7783893") == "7783-89-3" + assert curator.fix_cas_number("7783-89-3") == "7783-89-3" -@pytest.mark.parametrize('metadata, validated_metadata, logs_size', [ - [{'inchikey': 'random content'}, {}, 1], - [{'smiles': 'CC(NC(C)=O)C#N'}, {'smiles': 'CC(NC(C)=O)C#N'}, 0] -]) +@pytest.mark.parametrize( + "metadata, validated_metadata, logs_size", + [ + [{"inchikey": "random content"}, {}, 1], + [{"smiles": "CC(NC(C)=O)C#N"}, {"smiles": "CC(NC(C)=O)C#N"}, 0], + ], +) def test_filter_invalid_metadata(metadata, validated_metadata, logs_size): warning = LogRecord(dict()) - job = Job(('smiles', 'inchi', 'converter')) + job = Job(("smiles", "inchi", "converter")) curator = Curator() assert curator.filter_invalid_metadata(metadata, warning, job) == validated_metadata assert len(warning.logs) == logs_size diff --git a/tests/test_io.py b/tests/test_io.py index fcfb424..db0194f 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -4,25 +4,53 @@ from MSMetaEnhancer.libs.data import Spectra, DataFrame -DATA = [{'formula': 'H2', 'mw': '2', 'casno': '1333740', 'id': '1', 'num_peaks': '2', 'compound_name': 'Hydrogen'}, - {'formula': 'D2', 'mw': '4', 'casno': '7782390', 'id': '2', 'num_peaks': '2', 'compound_name': 'Deuterium'}, - {'formula': 'CH4', 'mw': '16', 'casno': '74828', 'id': '3', 'num_peaks': '6', 'compound_name': 'Methane'}] - - -@pytest.mark.parametrize('backend, file_type, filename', [ - [Spectra(), 'msp', 'tests/test_data/sample.msp'], - [Spectra(), 'mgf', 'tests/test_data/sample.mgf'], - [Spectra(), 'json', 'tests/test_data/sample.json'], - [DataFrame(), 'csv', 'tests/test_data/sample_metadata.csv'], - [DataFrame(), 'tsv', 'tests/test_data/sample_metadata.tsv'], - [DataFrame(), 'xlsx', 'tests/test_data/sample_metadata.xlsx'] -]) +DATA = [ + { + "formula": "H2", + "mw": "2", + "casno": "1333740", + "id": "1", + "num_peaks": "2", + "compound_name": "Hydrogen", + }, + { + "formula": "D2", + "mw": "4", + "casno": "7782390", + "id": "2", + "num_peaks": "2", + "compound_name": "Deuterium", + }, + { + "formula": "CH4", + "mw": "16", + "casno": "74828", + "id": "3", + "num_peaks": "6", + "compound_name": "Methane", + }, +] + + +@pytest.mark.parametrize( + "backend, file_type, filename", + [ + [Spectra(), "msp", "tests/test_data/sample.msp"], + [Spectra(), "mgf", "tests/test_data/sample.mgf"], + [Spectra(), "json", "tests/test_data/sample.json"], + [DataFrame(), "csv", "tests/test_data/sample_metadata.csv"], + [DataFrame(), "tsv", "tests/test_data/sample_metadata.tsv"], + [DataFrame(), "xlsx", "tests/test_data/sample_metadata.xlsx"], + ], +) def test_get_metadata(backend, file_type, filename): backend.load_data(filename, file_type) metadata = backend.get_metadata() # Compare lengths - assert len(metadata) == len(DATA), f"Metadata length mismatch: {len(metadata)} != {len(DATA)}" + assert len(metadata) == len(DATA), ( + f"Metadata length mismatch: {len(metadata)} != {len(DATA)}" + ) # Compare values of matching keys for i, (meta_item, data_item) in enumerate(zip(metadata, DATA)): @@ -32,11 +60,14 @@ def test_get_metadata(backend, file_type, filename): f"Value mismatch for key '{key}' at index {i}: {meta_item[key]} != {data_item[key]}" ) + def test_fuse_metadata_dataframe(): df = DataFrame() df.fuse_metadata(DATA) # Compare row by row, ignoring mismatched keys - for i, (fused_row, original_row) in enumerate(zip(df.df.to_dict(orient='records'), DATA)): + for i, (fused_row, original_row) in enumerate( + zip(df.df.to_dict(orient="records"), DATA) + ): for key in original_row.keys(): if key in fused_row: assert fused_row[key] == original_row[key], ( @@ -46,11 +77,15 @@ def test_fuse_metadata_dataframe(): def test_fuse_metadata_spectra(): spectra_fused = Spectra() - spectra_fused.spectrums = [mock.Mock(metadata=dict()), mock.Mock(metadata=dict()), mock.Mock(metadata=dict())] + spectra_fused.spectrums = [ + mock.Mock(metadata=dict()), + mock.Mock(metadata=dict()), + mock.Mock(metadata=dict()), + ] spectra_fused.fuse_metadata(DATA) spectra_loaded = Spectra() - spectra_loaded.load_data('tests/test_data/sample.msp', 'msp') + spectra_loaded.load_data("tests/test_data/sample.msp", "msp") # Compare metadata row by row, ignoring mismatched keys fused_metadata = spectra_fused.get_metadata() @@ -69,13 +104,15 @@ def test_tabular_data(): Test loading and comparing tabular (TSV) data using the DataFrame backend. """ df = DataFrame() - filename = 'tests/test_data/sample_metadata.tsv' - file_type = 'tabular' + filename = "tests/test_data/sample_metadata.tsv" + file_type = "tabular" df.load_data(filename, file_type) metadata = df.get_metadata() # Compare lengths - assert len(metadata) == len(DATA), f"Metadata length mismatch: {len(metadata)} != {len(DATA)}" + assert len(metadata) == len(DATA), ( + f"Metadata length mismatch: {len(metadata)} != {len(DATA)}" + ) # Compare values of matching keys for i, (meta_item, data_item) in enumerate(zip(metadata, DATA)): for key in meta_item.keys(): @@ -85,10 +122,23 @@ def test_tabular_data(): ) -@pytest.mark.parametrize('backend, file_type, filename, absent_keys', [ - [DataFrame(), 'csv', 'tests/test_data/sample_metadata_with_na.csv', ['inchikey', 'smiles']], - [Spectra(), 'msp', 'tests/test_data/sample_with_na.msp', ['inchikey', 'smiles']], -]) +@pytest.mark.parametrize( + "backend, file_type, filename, absent_keys", + [ + [ + DataFrame(), + "csv", + "tests/test_data/sample_metadata_with_na.csv", + ["inchikey", "smiles"], + ], + [ + Spectra(), + "msp", + "tests/test_data/sample_with_na.msp", + ["inchikey", "smiles"], + ], + ], +) def test_na_values_filtered_from_metadata(backend, file_type, filename, absent_keys): """NA and empty values in data files should be excluded from metadata dicts.""" backend.load_data(filename, file_type) diff --git a/tests/test_rdkit.py b/tests/test_rdkit.py index fdc2ffb..c6562b5 100644 --- a/tests/test_rdkit.py +++ b/tests/test_rdkit.py @@ -3,20 +3,27 @@ from MSMetaEnhancer.libs.converters.compute import RDKit -INCHI = 'InChI=1S/C19H28O2/c1-18-9-7-13(20)11-12(18)3-4-14-15-5-6-17(21)19(15,2)10-8-16(14)18/h11,14-17,21H,3-10H2,1-2H3/t14-,15-,16-,17-,18-,19-/m0/s1' -CANONICAL_SMILES = 'CC12CCC(=O)C=C1CCC1C2CCC2(C)C(O)CCC12' +INCHI = "InChI=1S/C19H28O2/c1-18-9-7-13(20)11-12(18)3-4-14-15-5-6-17(21)19(15,2)10-8-16(14)18/h11,14-17,21H,3-10H2,1-2H3/t14-,15-,16-,17-,18-,19-/m0/s1" +CANONICAL_SMILES = "CC12CCC(=O)C=C1CCC1C2CCC2(C)C(O)CCC12" -@pytest.mark.parametrize('method, input, expected', [ - ['inchi_to_canonical_smiles', INCHI, {'canonical_smiles': CANONICAL_SMILES}], - ['inchi_to_isomeric_smiles', INCHI, { - 'isomeric_smiles': 'C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C@@H]1CC[C@@H]2O' - }], - ['from_smiles', CANONICAL_SMILES, {'mw': 288.208930136}], - ["formula_to_mw", "C9H15N4O8P", {'mw': 338.21299999999997}], - ['smiles_to_formula', CANONICAL_SMILES, {'formula': 'C19H28O2'}], - ['inchi_to_formula', INCHI, {'formula': 'C19H28O2'}], -]) +@pytest.mark.parametrize( + "method, input, expected", + [ + ["inchi_to_canonical_smiles", INCHI, {"canonical_smiles": CANONICAL_SMILES}], + [ + "inchi_to_isomeric_smiles", + INCHI, + { + "isomeric_smiles": "C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C@@H]1CC[C@@H]2O" + }, + ], + ["from_smiles", CANONICAL_SMILES, {"mw": 288.208930136}], + ["formula_to_mw", "C9H15N4O8P", {"mw": 338.21299999999997}], + ["smiles_to_formula", CANONICAL_SMILES, {"formula": "C19H28O2"}], + ["inchi_to_formula", INCHI, {"formula": "C19H28O2"}], + ], +) def test_convert_methods(method, input, expected): func = getattr(RDKit(), method) actual = func(input) diff --git a/tests/utils.py b/tests/utils.py index 4452392..726bfe6 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -13,6 +13,7 @@ class FakeMonitor(Thread): """ Fake Monitor to test basic functionality. """ + def __init__(self): super(FakeMonitor, self).__init__() self.converters = None @@ -35,6 +36,7 @@ class FakeAnnotator: """ Fake Annotator to test basic functionality. """ + def __init__(self, raise_exception=False): self.converters = None self.raise_exception = raise_exception