From c1661ee33631c180f47d7b90eb57c922716340fc Mon Sep 17 00:00:00 2001
From: Charles Trenholm
Date: Thu, 5 May 2022 13:47:39 -0700
Subject: [PATCH 1/2] Add list of metadata keys to DFU.save_objects for sample sets

---
 lib/sample_uploader/sample_uploaderImpl.py |  2 --
 lib/sample_uploader/utils/importer.py      | 15 ++++++++++-----
 lib/sample_uploader/utils/mappings.py      |  1 -
 lib/sample_uploader/utils/sample_utils.py  | 10 ----------
 4 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/lib/sample_uploader/sample_uploaderImpl.py b/lib/sample_uploader/sample_uploaderImpl.py
index 19c8f15..a2fd602 100644
--- a/lib/sample_uploader/sample_uploaderImpl.py
+++ b/lib/sample_uploader/sample_uploaderImpl.py
@@ -98,8 +98,6 @@ def import_samples(self, ctx, params):
         # ctx is the context object
         # return variables are: output
         #BEGIN import_samples
-        print(f"Beginning sample import with following parameters:")
-        print(f"params -- {params}")
         sample_set = {"samples": []}
         # Check if we have an existing Sample Set as input
         # if so, download
diff --git a/lib/sample_uploader/utils/importer.py b/lib/sample_uploader/utils/importer.py
index 36eb968..d546696 100644
--- a/lib/sample_uploader/utils/importer.py
+++ b/lib/sample_uploader/utils/importer.py
@@ -34,7 +34,6 @@ def _has_sesar_header(df):
     sesar_headers = ['object type:', 'user code:']
     sesar_headers_count = [i for i in df.columns if i.lower() in sesar_headers]
     if len(unnamed_cols) > 0 or len(sesar_headers_count) > 0:
-        print('Detected extra header line. Setting header_row_index to 1')
         has_sesar_header = True
 
     return has_sesar_header
@@ -63,7 +62,6 @@ def find_header_row(sample_file, file_format):
         # TODO: this function will fail if the extra header line happens to have exactly the same number of columns as the real header line
         non_empty_header = [i for i in first_line.split(inferred_sep) if i not in ['', '\n']]
         if len(non_empty_header) != len(second_line.split(inferred_sep)):
-            print('Detected extra header line. Setting header_row_index to 1')
             header_row_index = 1
 
     elif sample_file.endswith('.csv'):
@@ -278,20 +276,26 @@ def _get_existing_sample(name, kbase_sample_id):
             del existing_sample_names[extra_sample]
 
     user_keys = set()
+    controlled_keys = set()
     for s in samples:
         for n in s['sample']['node_tree']:
             ukeys = set(n['meta_user'].keys())
             ckeys = set(n['meta_controlled'].keys())
             user_keys |= (ukeys - ckeys)
+            controlled_keys |= (ckeys - ukeys)
 
     for key in user_keys:
+
         warnings.warn(SampleContentWarning(
             f"\"{key}\" is a user-defined column. It is of unknown type, will not be automatically validated, and may not be interoperable with other samples during analysis.",
             key=key, severity='warning'
         ))
 
+    # send list of keys with annotated custom fields to save with KBaseSets.SampleSet
+    metadata_keys = list(controlled_keys | {f'custom:{key}' for key in user_keys})
+
     # add the missing samples from existing_sample_names
-    return samples, [existing_sample_names[key] for key in existing_sample_names]
+    return samples, [existing_sample_names[key] for key in existing_sample_names], metadata_keys
 
 
 def _save_samples(samples, acls, sample_url, token, propagate_links):
@@ -449,7 +453,7 @@ def import_samples_from_file(
 
     # query workspace for user permissions.
     acls = get_workspace_user_perms(workspace_url, params.get('workspace_id'), token, username, acls)
-    samples, existing_samples = _produce_samples(
+    samples, existing_samples, metadata_keys = _produce_samples(
         callback_url,
         df,
         column_groups,
@@ -527,5 +531,6 @@ def import_samples_from_file(
 
     return {
         "samples": saved_samples,
-        "description": params.get('description')
+        "description": params.get('description'),
+        "metadata_keys": metadata_keys
     }, has_unignored_errors, errors.get(), sample_data_json
diff --git a/lib/sample_uploader/utils/mappings.py b/lib/sample_uploader/utils/mappings.py
index 6b2c4d1..6dc026b 100644
--- a/lib/sample_uploader/utils/mappings.py
+++ b/lib/sample_uploader/utils/mappings.py
@@ -78,7 +78,6 @@ def _fetch_global_config(config_url, github_release_url, gh_token, file_name):
         else:
             NON_PREFIX_TO_PREFIX[field_name] = [field.lower()]
 
-
 def alias_map(col_config):
     """
     aliases map
diff --git a/lib/sample_uploader/utils/sample_utils.py b/lib/sample_uploader/utils/sample_utils.py
index 2693db2..41c0653 100644
--- a/lib/sample_uploader/utils/sample_utils.py
+++ b/lib/sample_uploader/utils/sample_utils.py
@@ -236,7 +236,6 @@ def compare_samples(s1, s2):
     if s1 is None or s2 is None:
         return False
     else:
-        print('start comparing samples: {} vs {}'.format(s1['name'], s2['name']))
 
         def remove_field(node, field):
             if field in node:
@@ -248,10 +247,6 @@ def remove_field(node, field):
 
         # TODO: worth scrutiny
         comp = s1['name'] == s2['name'] and s1_nt == s2_nt
-        if comp:
-            print('compared samples are the same')
-        else:
-            print('compared samples are different')
 
         return comp
 
@@ -294,7 +289,6 @@ def save_sample(sample, sample_url, token, previous_version=None, propagate_link
     token - workspace token for Authorization
     previous_version - data of previous version of sample
     """
-    print('start saving sample')
    headers = {
        "Authorization": token,
        "Content-Type": "application/json"
@@ -329,10 +323,7 @@ def save_sample(sample, sample_url, token, previous_version=None, propagate_link
 
     sample_id = resp_json['result'][0]['id']
     sample_ver = resp_json['result'][0]['version']
 
-    print('saved sample {} (version: {}'.format(sample_id, sample_ver))
-
     if previous_version and propagate_links:
-        print('start propagating previous data links')
         ss = SampleService(sample_url)
         ss.propagate_data_links({'id': sample_id,
@@ -498,7 +489,6 @@ def expire_data_link(obj_refs, sample_url, token):
         links = get_data_links_from_ss(upa, sample_url, token)
         for link in links:
             upa, dataid = link.get('upa'), link.get('dataid')
-            print('start expiring link {}-{}'.format(upa, dataid))
             ss.expire_data_link({'upa': upa, 'dataid': dataid})
             expired_link_count += 1
 
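A note on the partitioning rule the _produce_samples hunk above introduces: each node's metadata is split into controlled-only and user-only key sets, the user-only keys are prefixed with custom:, and the union becomes the sample set's metadata_keys. The snippet below is a minimal standalone sketch of that logic; the two samples and their node trees are invented, and only the meta_user/meta_controlled shape mirrors the real objects.

# Minimal sketch of the metadata_keys derivation in _produce_samples.
# The node data is invented; only the dict shape mirrors real samples.
samples = [
    {'sample': {'node_tree': [
        {'meta_controlled': {'latitude': {}, 'biome': {}},
         'meta_user': {'jamboree': {}}},
    ]}},
    {'sample': {'node_tree': [
        {'meta_controlled': {'longitude': {}},
         'meta_user': {'jamboree': {}}},
    ]}},
]

user_keys = set()
controlled_keys = set()
for s in samples:
    for n in s['sample']['node_tree']:
        ukeys = set(n['meta_user'].keys())
        ckeys = set(n['meta_controlled'].keys())
        user_keys |= (ukeys - ckeys)        # keys only seen as user metadata
        controlled_keys |= (ckeys - ukeys)  # keys only seen as controlled

metadata_keys = list(controlled_keys | {f'custom:{key}' for key in user_keys})
print(sorted(metadata_keys))
# ['biome', 'custom:jamboree', 'latitude', 'longitude']

Because the set differences are taken per node, a key stored as controlled metadata in one node but as user metadata in another would surface in both forms, as 'key' and 'custom:key'. Note also that list() over a set leaves the ordering unspecified, which is why the tests in the second patch compare with assertCountEqual rather than assertEqual.
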
From 0e4012424c933bbe44f8635771edbb287266c3d4 Mon Sep 17 00:00:00 2001
From: Charles Trenholm
Date: Thu, 5 May 2022 14:56:17 -0700
Subject: [PATCH 2/2] add tests for adding metadata keys to sample set object

---
 lib/sample_uploader/utils/importer.py |  1 +
 test/utils/importer_test.py           | 58 +++++++++++++++++++++++++++
 2 files changed, 59 insertions(+)

diff --git a/lib/sample_uploader/utils/importer.py b/lib/sample_uploader/utils/importer.py
index d546696..2607be2 100644
--- a/lib/sample_uploader/utils/importer.py
+++ b/lib/sample_uploader/utils/importer.py
@@ -522,6 +522,7 @@ def import_samples_from_file(
 
     if has_unignored_errors:
         saved_samples = []
+        metadata_keys = []
     else:
         saved_samples = _save_samples(samples, acls, sample_url, token,
                                       params.get('propagate_links', 0))
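For context on where metadata_keys is headed: the subject of PATCH 1/2 names DFU.save_objects, but the save call itself sits outside this diff. The sketch below is only an assumed illustration of that downstream step, using KBase's standard DataFileUtil client; the save_sample_set function, its parameters, and the premise that the KBaseSets.SampleSet type spec accepts a metadata_keys field are assumptions for the example, not something these patches show.

# Assumed downstream usage, not part of this diff: saving the returned
# sample_set (which now carries 'metadata_keys') as a KBaseSets.SampleSet.
from installed_clients.DataFileUtilClient import DataFileUtil

def save_sample_set(callback_url, workspace_id, sample_set, set_name):
    # sample_set is the dict returned by import_samples_from_file:
    # {'samples': [...], 'description': ..., 'metadata_keys': [...]}
    dfu = DataFileUtil(callback_url)
    return dfu.save_objects({
        'id': workspace_id,
        'objects': [{
            'type': 'KBaseSets.SampleSet',
            'data': sample_set,
            'name': set_name,
        }],
    })

The tests that follow exercise the importer side of this flow: they check that metadata_keys lands in the returned sample_set with the expected contents for each template format.
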
diff --git a/test/utils/importer_test.py b/test/utils/importer_test.py
index c54d3c4..9e499fa 100644
--- a/test/utils/importer_test.py
+++ b/test/utils/importer_test.py
@@ -127,6 +127,17 @@ def test_ENIGMA_format(self):
         expected_sample_name = ['s1', 's2', 's3']
         self.assertCountEqual([sample['name'] for sample in samples], expected_sample_name)
         self.assertEqual(has_unignored_errors, False)
+        self.assertIn('metadata_keys', sample_set)
+        self.assertCountEqual(sample_set['metadata_keys'], [
+            'feature',
+            'longitude',
+            'enigma:aquifer',
+            'custom:jamboree',
+            'material',
+            'latitude',
+            'biome',
+            'sample_template'
+        ])
 
         ori_compare_path = os.path.join(self.test_dir, "data", "fake_samples_ENIGMA.json")
         compare_path = os.path.join(self.test_dir, "data", "updated_fake_samples_ENIGMA.json")
@@ -180,6 +191,17 @@ def test_ENIGMA_format(self):
         samples = sample_set['samples']
         self.assertEqual(len(samples), 3)
         expected_sample_name = ['s1', 's2', 's3']
+        self.assertIn('metadata_keys', sample_set)
+        self.assertCountEqual(sample_set['metadata_keys'], [
+            'feature',
+            'longitude',
+            'enigma:aquifer',
+            'custom:jamboree',
+            'material',
+            'latitude',
+            'biome',
+            'sample_template'
+        ])
         self.assertCountEqual([sample['name'] for sample in samples], expected_sample_name)
         self.assertEqual(has_unignored_errors, False)
 
@@ -362,6 +384,18 @@ def test_import_SESAR_format(self):
         expected_sample_name = ['s1', 's2', 's3']
         self.assertCountEqual([sample['name'] for sample in samples], expected_sample_name)
         self.assertEqual(has_unignored_errors, False)
+        self.assertIn('metadata_keys', sample_set)
+        self.assertCountEqual(sample_set['metadata_keys'], [
+            'custom:user_field',
+            'sesar:material',
+            'country',
+            'biome',
+            'sesar:size',
+            'depth_bgs',
+            'sample_template',
+            'feature',
+            'sesar:elevation_start'
+        ])
 
         compare_path = os.path.join(self.test_dir, "data", "fake_samples.json")
         self._verify_samples(sample_set, compare_path)
@@ -400,6 +434,29 @@ def test_KBASE_format(self):
         expected_sample_name = ['SAMN03166112', 'SAMN04383980']
         self.assertCountEqual([sample['name'] for sample in samples], expected_sample_name)
         self.assertEqual(has_unignored_errors, False)
+        self.assertIn('metadata_keys', sample_set)
+        self.assertCountEqual(sample_set['metadata_keys'], [
+            'custom:source_name',
+            'custom:env_biome',
+            'custom:misc_param',
+            'custom:elev',
+            'sample_template',
+            'custom:env_feature',
+            'sesar:collection_date',
+            'custom:current_archive_contact',
+            'custom:collector',
+            'custom:cell_line',
+            'custom:depth',
+            'custom:env_material',
+            'custom:publication_date',
+            'longitude',
+            'custom:current_archive',
+            'custom:last_update',
+            'latitude',
+            'custom:submission_date',
+            'description',
+            'custom:geo_loc_name'
+        ])
 
     def test_dup_column_file(self):
         ''''''
@@ -433,6 +490,7 @@ def test_dup_column_file(self):
 
         # check errors
         self.assertEqual(len(errors), 1)
+        self.assertEqual(sample_set['metadata_keys'], [])
         self.assertEqual(errors[0].message, 'Duplicate column "some_field". "some_field" would ' +
                          'overwrite a different column "some field". Rename ' +
                          'your columns to be unique alphanumericaly, ' +
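Taken together, the expected lists in these tests encode one rule: columns recognized as controlled fields keep their (possibly namespaced) names, unrecognized columns come back as custom:<name>, and sample_template appears in every set. A hypothetical helper along the following lines could build such expectations for future fixtures; the column and controlled-field names are illustrative, and the helper skips the real template aliasing that renames columns before key derivation.

# Hypothetical test helper capturing the expectation rule above; the
# names are illustrative, not the real template mappings, and template
# aliasing (column renames before key derivation) is deliberately skipped.
def expected_metadata_keys(columns, controlled):
    keys = {'sample_template'}  # present in every expected list above
    for col in columns:
        keys.add(col if col in controlled else 'custom:' + col)
    return sorted(keys)

# e.g. mirroring the ENIGMA fixture, where 'jamboree' is the lone
# user-defined column:
controlled = {'feature', 'longitude', 'enigma:aquifer',
              'material', 'latitude', 'biome'}
assert expected_metadata_keys(controlled | {'jamboree'}, controlled) == sorted(
    controlled | {'custom:jamboree', 'sample_template'}
)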