2 changes: 0 additions & 2 deletions lib/sample_uploader/sample_uploaderImpl.py
@@ -98,8 +98,6 @@ def import_samples(self, ctx, params):
# ctx is the context object
# return variables are: output
#BEGIN import_samples
print(f"Beginning sample import with following parameters:")
print(f"params -- {params}")
sample_set = {"samples": []}
# Check if we have an existing Sample Set as input
# if so, download
16 changes: 11 additions & 5 deletions lib/sample_uploader/utils/importer.py
@@ -34,7 +34,6 @@ def _has_sesar_header(df):
sesar_headers = ['object type:', 'user code:']
sesar_headers_count = [i for i in df.columns if i.lower() in sesar_headers]
if len(unnamed_cols) > 0 or len(sesar_headers_count) > 0:
print('Detected extra header line. Setting header_row_index to 1')
has_sesar_header = True

return has_sesar_header
@@ -63,7 +62,6 @@ def find_header_row(sample_file, file_format):
# TODO: this function will fail if the extra header line happens to have exactly the same number of columns as the real header line
non_empty_header = [i for i in first_line.split(inferred_sep) if i not in ['', '\n']]
if len(non_empty_header) != len(second_line.split(inferred_sep)):
print('Detected extra header line. Setting header_row_index to 1')
header_row_index = 1

elif sample_file.endswith('.csv'):
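For context on the two deletions above: both spots detect a SESAR-style file whose first row is an extra header line, either from SESAR marker cells ('object type:', 'user code:') and unnamed columns, or from a mismatch in column counts between the first two lines; only the print statements are dropped, the detection itself is unchanged. A minimal standalone sketch of the column-count heuristic, using made-up file contents that are not part of this PR:

```python
# Sketch of the extra-header heuristic, assuming a CSV whose first line is a
# short SESAR-style meta row and whose second line holds the real column
# headers (file contents are fabricated for illustration).
sample_text = "Object Type:,Individual Sample\nSample Name,Material,Latitude\ns1,Basalt,33.3\n"

def infer_header_row_index(text, sep=","):
    first_line, second_line = text.splitlines()[:2]
    # Named cells on the first line only
    non_empty_header = [c for c in first_line.split(sep) if c not in ("", "\n")]
    # A mismatch with the second line's column count suggests an extra header
    # line, so the real headers start at row index 1.
    if len(non_empty_header) != len(second_line.split(sep)):
        return 1
    return 0

print(infer_header_row_index(sample_text))  # -> 1
```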
@@ -278,20 +276,26 @@ def _get_existing_sample(name, kbase_sample_id):
del existing_sample_names[extra_sample]

user_keys = set()
controlled_keys = set()
for s in samples:
for n in s['sample']['node_tree']:
ukeys = set(n['meta_user'].keys())
ckeys = set(n['meta_controlled'].keys())
user_keys |= (ukeys - ckeys)
controlled_keys |= (ckeys - ukeys)
for key in user_keys:

warnings.warn(SampleContentWarning(
f"\"{key}\" is a user-defined column. It is of unknown type, will not be automatically validated, and may not be interoperable with other samples during analysis.",
key=key,
severity='warning'
))

# send list of keys with annotated custom fields to save with KBaseSets.SampleSet
metadata_keys = list(controlled_keys | {f'custom:{key}' for key in user_keys})

# add the missing samples from existing_sample_names
return samples, [existing_sample_names[key] for key in existing_sample_names]
return samples, [existing_sample_names[key] for key in existing_sample_names], metadata_keys


def _save_samples(samples, acls, sample_url, token, propagate_links):
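The block above is where the new `metadata_keys` value originates: keys that appear only in a node's `meta_user` are treated as user-defined (and later prefixed with `custom:`), while keys that appear only in `meta_controlled` are kept as-is. A self-contained sketch of that set arithmetic on fabricated node-tree data (the sample values are illustrative, not from the PR):

```python
# Sketch: split node-tree metadata keys into controlled vs user-defined,
# mirroring the logic added in _produce_samples (sample data is made up).
samples = [
    {"sample": {"node_tree": [
        {"meta_controlled": {"latitude": {}, "biome": {}},
         "meta_user": {"jamboree": {}}},
    ]}},
]

user_keys, controlled_keys = set(), set()
for s in samples:
    for n in s["sample"]["node_tree"]:
        ukeys = set(n["meta_user"])
        ckeys = set(n["meta_controlled"])
        user_keys |= ukeys - ckeys          # keys only the user supplied
        controlled_keys |= ckeys - ukeys    # keys recognised by the template

# User-defined columns are tagged with a "custom:" prefix before being saved
# alongside the SampleSet.
metadata_keys = list(controlled_keys | {f"custom:{k}" for k in user_keys})
print(sorted(metadata_keys))  # ['biome', 'custom:jamboree', 'latitude']
```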
@@ -449,7 +453,7 @@ def import_samples_from_file(
# query workspace for user permissions.
acls = get_workspace_user_perms(workspace_url, params.get('workspace_id'), token, username, acls)

samples, existing_samples = _produce_samples(
samples, existing_samples, metadata_keys = _produce_samples(
callback_url,
df,
column_groups,
@@ -518,6 +522,7 @@ def import_samples_from_file(

if has_unignored_errors:
saved_samples = []
metadata_keys = {}
else:
saved_samples = _save_samples(samples, acls, sample_url, token,
params.get('propagate_links', 0))
@@ -527,5 +532,6 @@

return {
"samples": saved_samples,
"description": params.get('description')
"description": params.get('description'),
"metadata_keys": metadata_keys
}, has_unignored_errors, errors.get(), sample_data_json
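With this change the first element of the tuple returned by `import_samples_from_file` carries the annotated key list alongside the saved samples. A rough sketch of the dict shape a caller can now expect; the field values below are fabricated examples, only the keys come from this PR:

```python
# Illustrative shape of the first element returned by import_samples_from_file
# after this PR; values here are made up for demonstration.
sample_set = {
    "samples": [],                 # saved samples, [] when unignored errors occurred
    "description": "my upload",    # passed through from params
    # {} when unignored errors blocked saving, otherwise a list mixing
    # controlled keys and "custom:"-prefixed user-defined keys
    "metadata_keys": ["latitude", "biome", "custom:jamboree"],
}
```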
1 change: 0 additions & 1 deletion lib/sample_uploader/utils/mappings.py
@@ -78,7 +78,6 @@ def _fetch_global_config(config_url, github_release_url, gh_token, file_name):
else:
NON_PREFIX_TO_PREFIX[field_name] = [field.lower()]


def alias_map(col_config):
"""
aliases map
10 changes: 0 additions & 10 deletions lib/sample_uploader/utils/sample_utils.py
@@ -236,7 +236,6 @@ def compare_samples(s1, s2):
if s1 is None or s2 is None:
return False
else:
print('start comparing samples: {} vs {}'.format(s1['name'], s2['name']))

def remove_field(node, field):
if field in node:
@@ -248,10 +247,6 @@ def remove_field(node, field):

# TODO: worth scrutiny
comp = s1['name'] == s2['name'] and s1_nt == s2_nt
if comp:
print('compared samples are the same')
else:
print('compared samples are different')
return comp


@@ -294,7 +289,6 @@ def save_sample(sample, sample_url, token, previous_version=None, propagate_link
token - workspace token for Authorization
previous_version - data of previous version of sample
"""
print('start saving sample')
headers = {
"Authorization": token,
"Content-Type": "application/json"
@@ -329,10 +323,7 @@ def save_sample(sample, sample_url, token, previous_version=None, propagate_link
sample_id = resp_json['result'][0]['id']
sample_ver = resp_json['result'][0]['version']

print('saved sample {} (version: {}'.format(sample_id, sample_ver))

if previous_version and propagate_links:
print('start propagating previous data links')

ss = SampleService(sample_url)
ss.propagate_data_links({'id': sample_id,
@@ -498,7 +489,6 @@ def expire_data_link(obj_refs, sample_url, token):
links = get_data_links_from_ss(upa, sample_url, token)
for link in links:
upa, dataid = link.get('upa'), link.get('dataid')
print('start expiring link {}-{}'.format(upa, dataid))
ss.expire_data_link({'upa': upa, 'dataid': dataid})
expired_link_count += 1

58 changes: 58 additions & 0 deletions test/utils/importer_test.py
@@ -127,6 +127,17 @@ def test_ENIGMA_format(self):
expected_sample_name = ['s1', 's2', 's3']
self.assertCountEqual([sample['name'] for sample in samples], expected_sample_name)
self.assertEqual(has_unignored_errors, False)
self.assertIn('metadata_keys', sample_set)
self.assertCountEqual(sample_set['metadata_keys'], [
'feature',
'longitude',
'enigma:aquifer',
'custom:jamboree',
'material',
'latitude',
'biome',
'sample_template'
])

ori_compare_path = os.path.join(self.test_dir, "data", "fake_samples_ENIGMA.json")
compare_path = os.path.join(self.test_dir, "data", "updated_fake_samples_ENIGMA.json")
@@ -180,6 +191,17 @@ def test_ENIGMA_format(self):
samples = sample_set['samples']
self.assertEqual(len(samples), 3)
expected_sample_name = ['s1', 's2', 's3']
self.assertIn('metadata_keys', sample_set)
self.assertCountEqual(sample_set['metadata_keys'], [
'feature',
'longitude',
'enigma:aquifer',
'custom:jamboree',
'material',
'latitude',
'biome',
'sample_template'
])
self.assertCountEqual([sample['name'] for sample in samples], expected_sample_name)
self.assertEqual(has_unignored_errors, False)

@@ -362,6 +384,18 @@ def test_import_SESAR_format(self):
expected_sample_name = ['s1', 's2', 's3']
self.assertCountEqual([sample['name'] for sample in samples], expected_sample_name)
self.assertEqual(has_unignored_errors, False)
self.assertIn('metadata_keys', sample_set)
self.assertCountEqual(sample_set['metadata_keys'], [
'custom:user_field',
'sesar:material',
'country',
'biome',
'sesar:size',
'depth_bgs',
'sample_template',
'feature',
'sesar:elevation_start'
])

compare_path = os.path.join(self.test_dir, "data", "fake_samples.json")
self._verify_samples(sample_set, compare_path)
@@ -400,6 +434,29 @@ def test_KBASE_format(self):
expected_sample_name = ['SAMN03166112', 'SAMN04383980']
self.assertCountEqual([sample['name'] for sample in samples], expected_sample_name)
self.assertEqual(has_unignored_errors, False)
self.assertIn('metadata_keys', sample_set)
self.assertCountEqual(sample_set['metadata_keys'], [
'custom:source_name',
'custom:env_biome',
'custom:misc_param',
'custom:elev',
'sample_template',
'custom:env_feature',
'sesar:collection_date',
'custom:current_archive_contact',
'custom:collector',
'custom:cell_line',
'custom:depth',
'custom:env_material',
'custom:publication_date',
'longitude',
'custom:current_archive',
'custom:last_update',
'latitude',
'custom:submission_date',
'description',
'custom:geo_loc_name'
])

def test_dup_column_file(self):
''''''
@@ -433,6 +490,7 @@ def test_dup_column_file(self):

# check errors
self.assertEqual(len(errors), 1)
self.assertEqual(sample_set['metadata_keys'], {})
self.assertEqual(errors[0].message, 'Duplicate column "some_field". "some_field" would ' +
'overwrite a different column "some field". Rename ' +
'your columns to be unique alphanumericaly, ' +