Skip to content

Commit d4c113e

Browse files
authored
Merge pull request #160 from icgc-argo-workflows/payload-gen-seq-experiment@0.7.1
[release]
2 parents 0d03441 + 3b7b7d9 commit d4c113e

File tree

7 files changed

+224
-44
lines changed

7 files changed

+224
-44
lines changed

payload-gen-seq-experiment/main.nf

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
/* this block is auto-generated based on info from pkg.json where */
2727
/* changes can be made if needed, do NOT modify this block manually */
2828
nextflow.enable.dsl = 2
29-
version = '0.7.0'
29+
version = '0.7.1'
3030

3131
container = [
3232
'ghcr.io': 'ghcr.io/icgc-argo-workflows/data-processing-utility-tools.payload-gen-seq-experiment'
@@ -51,6 +51,7 @@ params.read_group_info_tsv = "NO_FILE2"
5151
params.file_info_tsv = "NO_FILE3"
5252
params.extra_info_tsv = "NO_FILE4"
5353
params.schema_url="NO_FILE5"
54+
params.metadata_payload_json="NO_FILE6"
5455

5556
process payloadGenSeqExperiment {
5657
container "${params.container ?: container[params.container_registry ?: default_container_registry]}:${params.container_version ?: version}"
@@ -64,6 +65,7 @@ process payloadGenSeqExperiment {
6465
path read_group_info_tsv
6566
path file_info_tsv
6667
path extra_info_tsv
68+
path metadata_payload_json
6769
val schema_url
6870

6971
output:
@@ -74,13 +76,15 @@ process payloadGenSeqExperiment {
7476
args_read_group_info_tsv = !read_group_info_tsv.name.startsWith("NO_FILE") ? "-r ${read_group_info_tsv}" : ""
7577
args_file_info_tsv = !file_info_tsv.name.startsWith("NO_FILE") ? "-f ${file_info_tsv}" : ""
7678
args_extra_info_tsv = !extra_info_tsv.name.startsWith("NO_FILE") ? "-e ${extra_info_tsv}" : ""
79+
args_metadata_payload_json= !metadata_payload_json.name.startsWith("NO_FILE") ? "-m ${metadata_payload_json}" : ""
7780
args_schema_url = !schema_url.startsWith("NO_FILE") ? "-s ${schema_url}" : ""
7881
"""
7982
main.py \
8083
${args_experiment_info_tsv} \
8184
${args_read_group_info_tsv} \
8285
${args_file_info_tsv} \
8386
${args_extra_info_tsv} \
87+
${args_metadata_payload_json} \
8488
${args_schema_url}
8589
"""
8690
}
@@ -94,6 +98,7 @@ workflow {
9498
file(params.read_group_info_tsv),
9599
file(params.file_info_tsv),
96100
file(params.extra_info_tsv),
101+
file(params.metadata_payload_json),
97102
params.schema_url
98103
)
99-
}
104+
}

payload-gen-seq-experiment/main.py

Lines changed: 45 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
Edmund Su <edmund.su@oicr.on.ca>
2424
"""
2525

26-
2726
import sys
2827
import uuid
2928
import json
@@ -173,7 +172,7 @@ def load_all_tsvs(exp_tsv, rg_tsv, file_tsv):
173172

174173
def validate_args(args):
175174
if args.metadata_json and \
176-
not (args.experiment_info_tsv or args.read_group_info_tsv or args.file_info_tsv):
175+
not (args.experiment_info_tsv and args.read_group_info_tsv and args.file_info_tsv):
177176
return True
178177
elif not args.metadata_json and \
179178
(args.experiment_info_tsv and args.read_group_info_tsv and args.file_info_tsv):
@@ -182,18 +181,14 @@ def validate_args(args):
182181
sys.exit(textwrap.dedent(
183182
"""
184183
Usage:
185-
When '-m' is provided, no other arguments can be used
186-
When '-m' is not provided, please provide all of these arguments: -x, -r and -f
184+
When '-m' is provided, '-x', '-r' and '-f' are ignored
185+
When '-m' is not provided, please provide all of these arguments: '-x', '-r' and '-f'
187186
Optionally '-s' a schema URL can be provided, which the payload will be validated against
188187
"""
189188
))
190189

191-
def validatePayload(payload,args):
192-
if args.schema_url:
193-
url=args.schema_url
194-
else:
195-
url="https://submission-song.rdpc.cancercollaboratory.org/schemas/sequencing_experiment"
196-
190+
def validatePayload(payload,url):
191+
197192
resp=requests.get(url)
198193
if not resp.status_code==200:
199194
sys.exit("Unable to retrieve schema. Please check URL\n")
@@ -208,7 +203,7 @@ def validatePayload(payload,args):
208203

209204

210205

211-
def main(metadata, extra_info=dict()):
206+
def main(metadata,url,extra_info=dict()):
212207
empty_str_to_null(metadata)
213208

214209
payload = {
@@ -290,8 +285,11 @@ def main(metadata, extra_info=dict()):
290285
sys.exit(f"Field '%s' in file '%s' with value '%s' does not match expected regex pattern '^%s[0-9]{1,32}$'" % (optional_file_field,input_file.get('name'),input_file.get(optional_file_field),EGA_FIELDS[optional_file_field]))
291286

292287
for rg in metadata.get("read_groups"):
293-
rg.pop('type') # remove 'type' field
294-
rg.pop('submitter_sequencing_experiment_id') # remove 'submitter_sequencing_experiment_id' field
288+
if "type" in rg:
289+
print(rg)
290+
rg.pop('type') # remove 'type' field
291+
if "submitter_sequencing_experiment_id" in rg:
292+
rg.pop('submitter_sequencing_experiment_id') # remove 'submitter_sequencing_experiment_id' field
295293
payload['read_groups'].append(rg)
296294

297295

@@ -327,8 +325,8 @@ def main(metadata, extra_info=dict()):
327325
existing_ele['info'].update(extra_info[item][ele_to_update])
328326
else:
329327
existing_ele.update(extra_info[item][ele_to_update])
330-
331-
validatePayload(payload,args)
328+
329+
validatePayload(payload,url)
332330
with open("%s.sequencing_experiment.payload.json" % str(uuid.uuid4()), 'w') as f:
333331
f.write(json.dumps(payload, indent=2))
334332

@@ -351,9 +349,17 @@ def main(metadata, extra_info=dict()):
351349

352350
validate_args(args)
353351

352+
if args.schema_url:
353+
url=args.schema_url
354+
else:
355+
url="https://submission-song.rdpc.cancercollaboratory.org/schemas/sequencing_experiment"
356+
354357
if args.metadata_json:
355358
with open(args.metadata_json, 'r') as f:
356359
metadata = json.load(f)
360+
validatePayload(metadata,url)
361+
with open("%s.sequencing_experiment.payload.json" % str(uuid.uuid4()), 'w') as f:
362+
f.write(json.dumps(metadata, indent=2))
357363
else:
358364
# firstly TSV format conformity check, if not well-formed no point to continue
359365
tsv_confomity_check('experiment', args.experiment_info_tsv)
@@ -367,28 +373,28 @@ def main(metadata, extra_info=dict()):
367373
args.file_info_tsv
368374
)
369375

370-
extra_info = dict()
371-
if args.extra_info_tsv:
372-
with open(args.extra_info_tsv, 'r') as f:
373-
for row in csv.DictReader(f, delimiter='\t'):
374-
375-
for row_type in ['type','submitter_id','submitter_field','field_value']:
376-
if row_type not in row.keys():
377-
sys.exit(f"Incorrect formatting of : {args.extra_info_tsv}. {row_type} is missing")
378-
379-
row_type = row['type']
380-
row_id= row['submitter_id']
381-
row_field= row['submitter_field']
382-
row_val= row['field_value']
383-
384-
if (row_type!="sample") and (row_type!="donor") and (row_type!="specimen") and (row_type!="files") and (row_type!="experiment"):
385-
sys.exit(f"Incorrect identifier supplied. Must be on the following : 'sample','donor','specimen','files','experiments'. Offending value: {type}, in file: {args.extra_info_tsv}")
386-
387-
if row_type not in extra_info:
388-
extra_info[row_type]=dict()
389-
if row_id not in extra_info[row_type]:
390-
extra_info[row_type][row_id]=dict()
391-
extra_info[row_type][row_id][row_field]=row_val
376+
extra_info = dict()
377+
if args.extra_info_tsv:
378+
with open(args.extra_info_tsv, 'r') as f:
379+
for row in csv.DictReader(f, delimiter='\t'):
392380

393-
394-
main(metadata, extra_info)
381+
for row_type in ['type','submitter_id','submitter_field','field_value']:
382+
if row_type not in row.keys():
383+
sys.exit(f"Incorrect formatting of : {args.extra_info_tsv}. {row_type} is missing")
384+
385+
row_type = row['type']
386+
row_id= row['submitter_id']
387+
row_field= row['submitter_field']
388+
row_val= row['field_value']
389+
390+
if (row_type!="sample") and (row_type!="donor") and (row_type!="specimen") and (row_type!="files") and (row_type!="experiment"):
391+
sys.exit(f"Incorrect identifier supplied. Must be on the following : 'sample','donor','specimen','files','experiments'. Offending value: {type}, in file: {args.extra_info_tsv}")
392+
393+
if row_type not in extra_info:
394+
extra_info[row_type]=dict()
395+
if row_id not in extra_info[row_type]:
396+
extra_info[row_type][row_id]=dict()
397+
extra_info[row_type][row_id][row_field]=row_val
398+
399+
400+
main(metadata,url, extra_info)

payload-gen-seq-experiment/pkg.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "payload-gen-seq-experiment",
3-
"version": "0.7.0",
3+
"version": "0.7.1",
44
"description": "SONG payload generation for sequencing experiment",
55
"main": "main.nf",
66
"deprecated": false,

payload-gen-seq-experiment/tests/checker.nf

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@
1717
along with this program. If not, see <http://www.gnu.org/licenses/>.
1818
1919
Authors:
20+
Linda Xiang
2021
Junjun Zhang
22+
Edmund Su
2123
*/
2224

2325
/*
@@ -29,7 +31,7 @@
2931
/* this block is auto-generated based on info from pkg.json where */
3032
/* changes can be made if needed, do NOT modify this block manually */
3133
nextflow.enable.dsl = 2
32-
version = '0.7.0'
34+
version = '0.7.1'
3335

3436
container = [
3537
'ghcr.io': 'ghcr.io/icgc-argo-workflows/data-processing-utility-tools.payload-gen-seq-experiment'
@@ -48,6 +50,8 @@ params.read_group_info_tsv = "NO_FILE2"
4850
params.file_info_tsv = "NO_FILE3"
4951
params.extra_info_tsv = "NO_FILE4"
5052
params.schema_url = "NO_FILE5"
53+
params.metadata_payload_json = "NO_FILE6"
54+
5155
params.expected_output = ""
5256

5357
include { payloadGenSeqExperiment } from '../main'
@@ -68,7 +72,6 @@ process file_smart_diff {
6872
# Note: this is only for demo purpose, please write your own 'diff' according to your own needs.
6973
# remove date field before comparison eg, <div id="header_filename">Tue 19 Jan 2021<br/>test_rg_3.bam</div>
7074
# sed -e 's#"header_filename">.*<br/>test_rg_3.bam#"header_filename"><br/>test_rg_3.bam</div>#'
71-
7275
diff <( cat ${output_file} | sed -e 's#"header_filename">.*<br/>#"header_filename"><br/>#' ) \
7376
<( ([[ '${expected_file}' == *.gz ]] && gunzip -c ${expected_file} || cat ${expected_file}) | sed -e 's#"header_filename">.*<br/>#"header_filename"><br/>#' ) \
7477
&& ( echo "Test PASSED" && exit 0 ) || ( echo "Test FAILED, output file mismatch." && exit 1 )
@@ -83,6 +86,7 @@ workflow checker {
8386
file_info_tsv
8487
extra_info_tsv
8588
expected_output
89+
metadata_payload_json
8690
schema_url
8791

8892
main:
@@ -91,6 +95,7 @@ workflow checker {
9195
read_group_info_tsv,
9296
file_info_tsv,
9397
extra_info_tsv,
98+
metadata_payload_json,
9499
schema_url
95100
)
96101

@@ -108,6 +113,7 @@ workflow {
108113
file(params.file_info_tsv),
109114
file(params.extra_info_tsv),
110115
file(params.expected_output),
116+
file(params.metadata_payload_json),
111117
params.schema_url
112118
)
113119
}
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
{
2+
"analysisType": {
3+
"name": "sequencing_experiment"
4+
},
5+
"studyId": "TEST-PRO",
6+
"experiment": {
7+
"submitter_sequencing_experiment_id": "TEST_EXP",
8+
"sequencing_center": "EXT",
9+
"platform": "ILLUMINA",
10+
"platform_model": "HiSeq 2000",
11+
"experimental_strategy": "WGS",
12+
"sequencing_date": "2014-12-12"
13+
},
14+
"read_group_count": 3,
15+
"read_groups": [
16+
{
17+
"submitter_read_group_id": "C0HVY.2",
18+
"read_group_id_in_bam": null,
19+
"platform_unit": "74_8a",
20+
"is_paired_end": true,
21+
"file_r1": "test_rg_3.bam",
22+
"file_r2": "test_rg_3.bam",
23+
"read_length_r1": 150,
24+
"read_length_r2": 150,
25+
"insert_size": 298,
26+
"sample_barcode": null,
27+
"library_name": "Pond-147580"
28+
},
29+
{
30+
"submitter_read_group_id": "D0RE2.1",
31+
"read_group_id_in_bam": null,
32+
"platform_unit": "74_8b",
33+
"is_paired_end": true,
34+
"file_r1": "test_rg_3.bam",
35+
"file_r2": "test_rg_3.bam",
36+
"read_length_r1": 150,
37+
"read_length_r2": 150,
38+
"insert_size": 298,
39+
"sample_barcode": null,
40+
"library_name": "Pond-147580"
41+
},
42+
{
43+
"submitter_read_group_id": "D0RH0.2",
44+
"read_group_id_in_bam": null,
45+
"platform_unit": "74_8c",
46+
"is_paired_end": true,
47+
"file_r1": "test_rg_3.bam",
48+
"file_r2": "test_rg_3.bam",
49+
"read_length_r1": 150,
50+
"read_length_r2": 150,
51+
"insert_size": 298,
52+
"sample_barcode": null,
53+
"library_name": "Pond-147580"
54+
}
55+
],
56+
"samples": [
57+
{
58+
"submitterSampleId": "HCC1143_BAM_INPUT",
59+
"matchedNormalSubmitterSampleId": null,
60+
"sampleType": "Total DNA",
61+
"specimen": {
62+
"submitterSpecimenId": "HCC1143_BAM_INPUT",
63+
"tumourNormalDesignation": "Normal",
64+
"specimenTissueSource": "Blood derived",
65+
"specimenType": "Cell line - derived from normal"
66+
},
67+
"donor": {
68+
"submitterDonorId": "HCC1143",
69+
"gender": "Female"
70+
}
71+
}
72+
],
73+
"files": [
74+
{
75+
"fileName": "test_rg_3.bam",
76+
"fileSize": 14911,
77+
"fileMd5sum": "178f97f7b1ca8bfc28fd5586bdd56799",
78+
"fileType": "BAM",
79+
"fileAccess": "controlled",
80+
"dataType": "Submitted Reads",
81+
"info": {
82+
"data_category": "Sequencing Reads",
83+
"ega_file_id": "EGAF000001"
84+
}
85+
}
86+
]
87+
}

0 commit comments

Comments (0)