Skip to content

Commit 44f01ac

Browse files
authored
Merge pull request #163 from icgc-argo-workflows/payload-gen-seq-experiment@0.8.0
[release]
2 parents d4c113e + d40c9c4 commit 44f01ac

19 files changed

+580
-17
lines changed

payload-gen-seq-experiment/main.nf

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
/* this block is auto-generated based on info from pkg.json where */
2727
/* changes can be made if needed, do NOT modify this block manually */
2828
nextflow.enable.dsl = 2
29-
version = '0.7.1'
29+
version = '0.8.0'
3030

3131
container = [
3232
'ghcr.io': 'ghcr.io/icgc-argo-workflows/data-processing-utility-tools.payload-gen-seq-experiment'
@@ -52,6 +52,8 @@ params.file_info_tsv = "NO_FILE3"
5252
params.extra_info_tsv = "NO_FILE4"
5353
params.schema_url="NO_FILE5"
5454
params.metadata_payload_json="NO_FILE6"
55+
params.converted_files=["NO_FILE7"]
56+
params.cram_reference="NO_FILE8"
5557

5658
process payloadGenSeqExperiment {
5759
container "${params.container ?: container[params.container_registry ?: default_container_registry]}:${params.container_version ?: version}"
@@ -67,6 +69,8 @@ process payloadGenSeqExperiment {
6769
path extra_info_tsv
6870
path metadata_payload_json
6971
val schema_url
72+
path converted_files
73+
path cram_reference
7074

7175
output:
7276
path "*.sequencing_experiment.payload.json", emit: payload
@@ -78,18 +82,19 @@ process payloadGenSeqExperiment {
7882
args_extra_info_tsv = !extra_info_tsv.name.startsWith("NO_FILE") ? "-e ${extra_info_tsv}" : ""
7983
args_metadata_payload_json= !metadata_payload_json.name.startsWith("NO_FILE") ? "-m ${metadata_payload_json}" : ""
8084
args_schema_url = !schema_url.startsWith("NO_FILE") ? "-s ${schema_url}" : ""
85+
args_converted_file_args = !cram_reference.startsWith("NO_FILE") ? "-br ${cram_reference} -b ${converted_files}" : ""
8186
"""
8287
main.py \
8388
${args_experiment_info_tsv} \
8489
${args_read_group_info_tsv} \
8590
${args_file_info_tsv} \
8691
${args_extra_info_tsv} \
8792
${args_metadata_payload_json} \
88-
${args_schema_url}
93+
${args_schema_url} \
94+
${args_converted_file_args}
8995
"""
9096
}
9197

92-
9398
// this provides an entry point for this main script, so it can be run directly without cloning the repo
9499
// using this command: nextflow run <git_acc>/<repo>/<pkg_name>/<main_script>.nf -r <pkg_name>.v<pkg_version> --params-file xxx
95100
workflow {
@@ -99,6 +104,8 @@ workflow {
99104
file(params.file_info_tsv),
100105
file(params.extra_info_tsv),
101106
file(params.metadata_payload_json),
102-
params.schema_url
107+
params.schema_url,
108+
Channel.fromPath(params.converted_files).collect(),
109+
file(params.cram_reference)
103110
)
104111
}

payload-gen-seq-experiment/main.py

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
import requests
3333
import re
3434
import jsonschema
35+
import os
36+
import hashlib
3537

3638

3739
TSV_FIELDS = {}
@@ -201,9 +203,38 @@ def validatePayload(payload,url):
201203
else:
202204
return True
203205

204-
205-
206-
def main(metadata,url,extra_info=dict()):
206+
def calculate_size(file_path):
    """Return the size of *file_path* in bytes."""
    return os.stat(file_path).st_size


def calculate_md5(file_path):
    """Return the hex MD5 digest of *file_path*, read in 1 MiB chunks."""
    md5 = hashlib.md5()
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            md5.update(chunk)
    return md5.hexdigest()


def replace_cram_with_bam(payload, bam_from_cram, bam_from_cram_reference):
    """Swap CRAM entries in *payload* for their cram2bam-converted BAM files.

    For each converted BAM, the matching CRAM entry in payload['files']
    (same file-name stem, fileType 'CRAM') is rewritten in place: the
    original CRAM name/size/md5/type plus the conversion reference are
    preserved under info['original_cram_info'], and the entry's
    fileName/fileSize/fileMd5sum/fileType become those of the BAM.
    Read-group file references with the same stem are repointed at the
    BAM as well (file_r2 too for paired-end read groups).

    The BAM files must exist on disk (size and md5 are recomputed from
    them). *payload* is mutated in place and also returned.
    """
    for bam in bam_from_cram:
        # hoist: the bam stem is invariant over the inner loops
        bam_stem = re.sub(r'\.bam$', '', bam)
        for file_entry in payload['files']:
            if re.sub(r'\.cram$', '', file_entry['fileName']) == bam_stem \
                    and file_entry['fileType'] == 'CRAM':
                # keep the pre-conversion CRAM details for provenance
                file_entry['info']['original_cram_info'] = {
                    'fileName': file_entry['fileName'],
                    'fileSize': file_entry['fileSize'],
                    'fileMd5sum': file_entry['fileMd5sum'],
                    'fileType': file_entry['fileType'],
                    'referenceFileName': bam_from_cram_reference,
                }
                file_entry['fileName'] = bam
                file_entry['fileSize'] = calculate_size(bam)
                file_entry['fileMd5sum'] = calculate_md5(bam)
                file_entry['fileType'] = 'BAM'
        for rg in payload['read_groups']:
            if re.sub(r'\.cram$', '', rg['file_r1']) == bam_stem:
                rg['file_r1'] = bam
                if rg['is_paired_end']:
                    rg['file_r2'] = bam
    return payload
236+
237+
def main(metadata,url,bam_from_cram,bam_from_cram_reference,extra_info=dict()):
207238
empty_str_to_null(metadata)
208239

209240
payload = {
@@ -280,13 +311,16 @@ def main(metadata,url,extra_info=dict()):
280311
for optional_file_field in TSV_FIELDS['file']["conditional"]:
281312
if input_file.get(optional_file_field):
282313
if re.findall("^"+EGA_FIELDS[optional_file_field]+'[0-9]{1,32}$',input_file.get(optional_file_field)):
283-
payload['files'][-1]['info'][optional_file_field]=input_file.get(optional_file_field)
314+
if payload['files'][-1]['info'].get("ega"):
315+
payload['files'][-1]['info']['ega'][optional_file_field]=input_file.get(optional_file_field)
316+
else:
317+
payload['files'][-1]['info']['ega']={}
318+
payload['files'][-1]['info']['ega'][optional_file_field]=input_file.get(optional_file_field)
284319
else:
285320
sys.exit(f"Field '%s' in file '%s' with value '%s' does not match expected regex pattern '^%s[0-9]{1,32}$'" % (optional_file_field,input_file.get('name'),input_file.get(optional_file_field),EGA_FIELDS[optional_file_field]))
286321

287322
for rg in metadata.get("read_groups"):
288323
if "type" in rg:
289-
print(rg)
290324
rg.pop('type') # remove 'type' field
291325
if "submitter_sequencing_experiment_id" in rg:
292326
rg.pop('submitter_sequencing_experiment_id') # remove 'submitter_sequencing_experiment_id' field
@@ -325,7 +359,9 @@ def main(metadata,url,extra_info=dict()):
325359
existing_ele['info'].update(extra_info[item][ele_to_update])
326360
else:
327361
existing_ele.update(extra_info[item][ele_to_update])
328-
362+
if len(bam_from_cram)>0:
363+
payload=replace_cram_with_bam(payload,bam_from_cram,bam_from_cram_reference)
364+
329365
validatePayload(payload,url)
330366
with open("%s.sequencing_experiment.payload.json" % str(uuid.uuid4()), 'w') as f:
331367
f.write(json.dumps(payload, indent=2))
@@ -345,6 +381,10 @@ def main(metadata,url,extra_info=dict()):
345381
help="tsv file containing additional information pertaining to existing experiment, read_group, and file information submitted from user that does not fit within existing schemas")
346382
parser.add_argument("-s", "--schema-url",
347383
help="URL to validate schema against")
384+
parser.add_argument("-b", "--bam-from-cram",nargs="+",default=[],
385+
help="BAM files that have converted from CRAM")
386+
parser.add_argument("-br", "--bam-from-cram-reference",default=None,
387+
help="Name of reference file used in cram2bam conversion")
348388
args = parser.parse_args()
349389

350390
validate_args(args)
@@ -357,6 +397,9 @@ def main(metadata,url,extra_info=dict()):
357397
if args.metadata_json:
358398
with open(args.metadata_json, 'r') as f:
359399
metadata = json.load(f)
400+
401+
if len(args.bam_from_cram)>0:
402+
payload=replace_cram_with_bam(metadata,args.bam_from_cram,args.bam_from_cram_reference)
360403
validatePayload(metadata,url)
361404
with open("%s.sequencing_experiment.payload.json" % str(uuid.uuid4()), 'w') as f:
362405
f.write(json.dumps(metadata, indent=2))
@@ -397,4 +440,4 @@ def main(metadata,url,extra_info=dict()):
397440
extra_info[row_type][row_id][row_field]=row_val
398441

399442

400-
main(metadata,url, extra_info)
443+
main(metadata,url,args.bam_from_cram,args.bam_from_cram_reference,extra_info)

payload-gen-seq-experiment/pkg.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "payload-gen-seq-experiment",
3-
"version": "0.7.1",
3+
"version": "0.8.0",
44
"description": "SONG payload generation for sequencing experiment",
55
"main": "main.nf",
66
"deprecated": false,

payload-gen-seq-experiment/tests/checker.nf

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
/* this block is auto-generated based on info from pkg.json where */
3232
/* changes can be made if needed, do NOT modify this block manually */
3333
nextflow.enable.dsl = 2
34-
version = '0.7.1'
34+
version = '0.8.0'
3535

3636
container = [
3737
'ghcr.io': 'ghcr.io/icgc-argo-workflows/data-processing-utility-tools.payload-gen-seq-experiment'
@@ -51,6 +51,8 @@ params.file_info_tsv = "NO_FILE3"
5151
params.extra_info_tsv = "NO_FILE4"
5252
params.schema_url = "NO_FILE5"
5353
params.metadata_payload_json = "NO_FILE6"
54+
params.converted_files=["NO_FILE7"]
55+
params.cram_reference="NO_FILE8"
5456

5557
params.expected_output = ""
5658

@@ -88,6 +90,8 @@ workflow checker {
8890
expected_output
8991
metadata_payload_json
9092
schema_url
93+
converted_files
94+
cram_reference
9195

9296
main:
9397
payloadGenSeqExperiment(
@@ -96,7 +100,9 @@ workflow checker {
96100
file_info_tsv,
97101
extra_info_tsv,
98102
metadata_payload_json,
99-
schema_url
103+
schema_url,
104+
converted_files,
105+
cram_reference
100106
)
101107

102108
file_smart_diff(
@@ -114,6 +120,8 @@ workflow {
114120
file(params.extra_info_tsv),
115121
file(params.expected_output),
116122
file(params.metadata_payload_json),
117-
params.schema_url
123+
params.schema_url,
124+
Channel.fromPath(params.converted_files).collect(),
125+
file(params.cram_reference)
118126
)
119127
}

payload-gen-seq-experiment/tests/input/12c64309-4f21-4a86-8175-ca2340babadd.sequencing_experiment.payload.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,9 @@
8080
"dataType": "Submitted Reads",
8181
"info": {
8282
"data_category": "Sequencing Reads",
83-
"ega_file_id": "EGAF000001"
83+
"ega": {
84+
"ega_file_id": "EGAF000001"
85+
}
8486
}
8587
}
8688
]

payload-gen-seq-experiment/tests/input/208c5ea6-c17a-4a63-981e-4bb91d3119f2.sequencing_experiment.payload.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,9 @@
8181
"dataType": "Submitted Reads",
8282
"info": {
8383
"data_category": "Sequencing Reads",
84-
"ega_file_id": "EGAF000001"
84+
"ega": {
85+
"ega_file_id": "EGAF000001"
86+
}
8587
}
8688
}
8789
]
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
{
2+
"analysisType": {
3+
"name": "sequencing_experiment"
4+
},
5+
"studyId": "TEST-PRO",
6+
"experiment": {
7+
"submitter_sequencing_experiment_id": "TEST_EXP",
8+
"sequencing_center": "EXT",
9+
"platform": "ILLUMINA",
10+
"platform_model": "HiSeq 2000",
11+
"experimental_strategy": "WGS",
12+
"sequencing_date": "2014-12-12"
13+
},
14+
"read_group_count": 3,
15+
"read_groups": [
16+
{
17+
"submitter_read_group_id": "C0HVY.2",
18+
"read_group_id_in_bam": null,
19+
"platform_unit": "74_8a",
20+
"is_paired_end": true,
21+
"file_r1": "test_rg_3.bam",
22+
"file_r2": "test_rg_3.bam",
23+
"read_length_r1": 150,
24+
"read_length_r2": 150,
25+
"insert_size": 298,
26+
"sample_barcode": null,
27+
"library_name": "Pond-147580"
28+
},
29+
{
30+
"submitter_read_group_id": "D0RE2.1",
31+
"read_group_id_in_bam": null,
32+
"platform_unit": "74_8b",
33+
"is_paired_end": true,
34+
"file_r1": "example1.bam",
35+
"file_r2": "example1.bam",
36+
"read_length_r1": 150,
37+
"read_length_r2": 150,
38+
"insert_size": 298,
39+
"sample_barcode": null,
40+
"library_name": "Pond-147580"
41+
},
42+
{
43+
"submitter_read_group_id": "D0RH0.2",
44+
"read_group_id_in_bam": null,
45+
"platform_unit": "74_8c",
46+
"is_paired_end": true,
47+
"file_r1": "example2.bam",
48+
"file_r2": "example2.bam",
49+
"read_length_r1": 150,
50+
"read_length_r2": 150,
51+
"insert_size": 298,
52+
"sample_barcode": null,
53+
"library_name": "Pond-147580"
54+
}
55+
],
56+
"samples": [
57+
{
58+
"submitterSampleId": "HCC1143_BAM_INPUT",
59+
"matchedNormalSubmitterSampleId": null,
60+
"sampleType": "Total DNA",
61+
"specimen": {
62+
"submitterSpecimenId": "HCC1143_BAM_INPUT",
63+
"tumourNormalDesignation": "Normal",
64+
"specimenTissueSource": "Blood derived",
65+
"specimenType": "Cell line - derived from normal"
66+
},
67+
"donor": {
68+
"submitterDonorId": "HCC1143",
69+
"gender": "Female"
70+
}
71+
}
72+
],
73+
"files": [
74+
{
75+
"fileName": "test_rg_3.bam",
76+
"fileSize": 14911,
77+
"fileMd5sum": "178f97f7b1ca8bfc28fd5586bdd56799",
78+
"fileType": "BAM",
79+
"fileAccess": "controlled",
80+
"dataType": "Submitted Reads",
81+
"info": {
82+
"data_category": "Sequencing Reads",
83+
"ega": {
84+
"ega_file_id": "EGAF000001"
85+
}
86+
}
87+
},
88+
{
89+
"fileName": "example1.bam",
90+
"fileSize": 10,
91+
"fileMd5sum": "e2bb33a7b2c6a45933a994e3e2747458",
92+
"fileType": "BAM",
93+
"fileAccess": "controlled",
94+
"dataType": "Submitted Reads",
95+
"info": {
96+
"data_category": "Sequencing Reads",
97+
"ega": {
98+
"ega_file_id": "EGAF000002"
99+
},
100+
"original_cram_info": {
101+
"fileName": "example1.cram",
102+
"fileSize": 9,
103+
"fileMd5sum": "69e5bd0f686feb422ac4592bab5d74af",
104+
"fileType": "CRAM",
105+
"referenceFileName": "hello.fasta"
106+
}
107+
}
108+
},
109+
{
110+
"fileName": "example2.bam",
111+
"fileSize": 8,
112+
"fileMd5sum": "6faea40b2115116047ada65237661273",
113+
"fileType": "BAM",
114+
"fileAccess": "controlled",
115+
"dataType": "Submitted Reads",
116+
"info": {
117+
"data_category": "Sequencing Reads",
118+
"ega": {
119+
"ega_file_id": "EGAF000003"
120+
},
121+
"original_cram_info": {
122+
"fileName": "example2.cram",
123+
"fileSize": 11,
124+
"fileMd5sum": "4dfbb139c7ee52270157abc5ed3f7842",
125+
"fileType": "CRAM",
126+
"referenceFileName": "hello.fasta"
127+
}
128+
}
129+
}
130+
]
131+
}

0 commit comments

Comments
 (0)