forked from dna-seq/dna-seq
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdvc.lock
More file actions
156 lines (156 loc) · 7.32 KB
/
dvc.lock
File metadata and controls
156 lines (156 loc) · 7.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# DVC lock file (dvc.lock): for each pipeline stage it records the command
# that was run and, where captured, the md5/size of its deps and outs.
# NOTE(review): this file is presumably regenerated by DVC on each run, so
# hand-written comments may be dropped; confirm before relying on them.
schema: '2.0'
stages:
# Stage "prepare": decompresses the GRCh38 primary-assembly FASTA with
# gunzip -k, unzips 103.zip into data/ensembl/103 (overwriting, -o) and renames
# the extracted VEP_plugins-release-103 directory to plugins.
# NOTE(review): no md5/size is recorded for any dep or out here, and the
# data/ensembl/103/plugins output is also declared by stage "prepare_vep";
# presumably this is a stale entry. Confirm against dvc.yaml.
prepare:
cmd: gunzip -k data/gwas/homo_sapiens_ensembl_103/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
&& unzip -o -d data/ensembl/103 data/ensembl/103/103.zip && mv data/ensembl/103/VEP_plugins-release-103
data/ensembl/103/plugins
deps:
- path: data/ensembl/103/103.zip
- path: data/gwas/homo_sapiens_ensembl_103/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
outs:
- path: data/ensembl/103/plugins
- path: data/gwas/homo_sapiens_ensembl_103/Homo_sapiens.GRCh38.dna.primary_assembly.fa
# Stage "start": runs `docker-compose up`; no deps or outs are recorded.
start:
cmd: docker-compose up
# Stage "stop": runs `docker-compose down`; no deps or outs are recorded.
stop:
cmd: docker-compose down
# Stage "cache": extracts homo_sapiens_vep_103_GRCh38.tar.gz into
# data/ensembl/103/cache/ (tar -C), with the homo_sapiens directory declared
# as the output.
# NOTE(review): no md5/size is recorded for the dep or the out.
cache:
cmd: tar -C data/ensembl/103/cache/ -zxvf data/ensembl/103/cache/homo_sapiens_vep_103_GRCh38.tar.gz
deps:
- path: data/ensembl/103/cache/homo_sapiens_vep_103_GRCh38.tar.gz
outs:
- path: data/ensembl/103/cache/homo_sapiens
# Stage "prepare_genome": decompresses the Ensembl 106 GRCh38 primary-assembly
# FASTA (gunzip -k), builds the .fai index (samtools faidx) and the sequence
# dictionary (samtools dict -o ...fa.dict), and is meant to write checksums of
# the ...fa.* files to ...fa.md5.
# NOTE(review): there is no `&&` (or `;`) between the `-o ...fa.dict` line and
# `md5sum`, so once the multi-line command is folded into one line `md5sum`
# and the glob become extra arguments of `samtools dict` instead of a separate
# command. The recorded ...fa.md5 output below has size 0 and md5
# d41d8cd98f00b204e9800998ecf8427e (the MD5 of empty input), i.e. the checksum
# file was created empty. The fix belongs in dvc.yaml (add the missing `&&`)
# followed by a repro; editing only this lock entry would leave the recorded
# hashes out of sync.
# NOTE(review): the `...fa.*` glob presumably also matches the .fa.gz input and
# the .fa.md5 file itself; verify that is intended.
prepare_genome:
cmd: gunzip -k data/ensembl/106/species/Homo_sapiens/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
&& samtools faidx data/ensembl/106/species/Homo_sapiens/Homo_sapiens.GRCh38.dna.primary_assembly.fa
&& samtools dict data/ensembl/106/species/Homo_sapiens/Homo_sapiens.GRCh38.dna.primary_assembly.fa
-o data/ensembl/106/species/Homo_sapiens/Homo_sapiens.GRCh38.dna.primary_assembly.fa.dict
md5sum data/ensembl/106/species/Homo_sapiens/Homo_sapiens.GRCh38.dna.primary_assembly.fa.*
> data/ensembl/106/species/Homo_sapiens/Homo_sapiens.GRCh38.dna.primary_assembly.fa.md5
deps:
- path: data/ensembl/106/species/Homo_sapiens/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
md5: f58a039f83d8944ceb79cdd16cbda583
size: 881211416
outs:
- path: data/ensembl/106/species/Homo_sapiens/Homo_sapiens.GRCh38.dna.primary_assembly.fa
md5: 9e6b9465dc708d92bf6d67e9c9fa9389
size: 3151425857
- path: data/ensembl/106/species/Homo_sapiens/Homo_sapiens.GRCh38.dna.primary_assembly.fa.dict
md5: dc1b3dc9c008222222c89dfd9eca0087
size: 34186
- path: data/ensembl/106/species/Homo_sapiens/Homo_sapiens.GRCh38.dna.primary_assembly.fa.fai
md5: d527f3eb6b664020cf4d882b5820056f
size: 6406
- path: data/ensembl/106/species/Homo_sapiens/Homo_sapiens.GRCh38.dna.primary_assembly.fa.md5
md5: d41d8cd98f00b204e9800998ecf8427e
size: 0
# Stage "prepare_vep": unzips 103.zip into data/ensembl/103/ (unzip -u) and
# renames the extracted VEP_plugins-release-103 directory to plugins.
# NOTE(review): the cache tarball is listed as a dep although the command never
# references it, and no md5/size is recorded for any dep or out; presumably a
# stale entry for the 103 release. Confirm against dvc.yaml.
prepare_vep:
cmd: unzip -u data/ensembl/103/103.zip -d data/ensembl/103/ && mv data/ensembl/103/VEP_plugins-release-103
data/ensembl/103/plugins
deps:
- path: data/ensembl/103/103.zip
- path: data/ensembl/103/cache/homo_sapiens_vep_103_GRCh38.tar.gz
outs:
- path: data/ensembl/103/plugins/
# Stage "prepare_annotations": runs two jobs in one command.
#  1. Streams all_variant_disease_pmid_associations.tsv.gz through gunzip,
#     skips rows whose first field starts with "snpId" or whose second field
#     matches "NA", sorts tab-separated by column 2 then numerically by
#     column 3, rewrites "tab followed by spaces" to a single tab, and
#     bgzip-compresses the result to ..._final.tsv.gz.
#  2. Removes any existing AllG2P.csv, then concatenates every *.csv in
#     gene2phenotype/ into AllG2P.csv, keeping only the first file's header
#     line (awk 'NR == 1 || FNR > 1').
# NOTE(review): the clinvar.vcf.gz/.tbi deps are not referenced by the command,
# no md5/size is recorded, and the same two jobs appear as the separate stages
# "prepare_annotations_digenet" and "prepare_annotations_gene2phenotype";
# presumably this combined entry is superseded. Confirm against dvc.yaml.
prepare_annotations:
cmd: "gunzip -c data/gwas/annotations/all_variant_disease_pmid_associations.tsv.gz\
\ |\n awk '($1 ~ /^snpId/ || $2 ~ /NA/) {next} {print $0}' |\n sort -t $'\\\
t' -k2,2 -k3,3n |\n awk '{ gsub (/\\t +/, \"\\t\", $0); print}' |\n bgzip\
\ -c > data/gwas/annotations/all_variant_disease_pmid_associations_final.tsv.gz\
\ &&\nrm -f data/gwas/annotations/gene2phenotype/AllG2P.csv && awk '(NR == 1)\
\ || (FNR > 1)' data/gwas/annotations/gene2phenotype/*.csv > data/gwas/annotations/gene2phenotype/AllG2P.csv"
deps:
- path: data/gwas/annotations/all_variant_disease_pmid_associations.tsv.gz
- path: data/gwas/annotations/clinvar.vcf.gz
- path: data/gwas/annotations/clinvar.vcf.gz.tbi
- path: data/gwas/annotations/gene2phenotype/CancerG2P.csv
- path: data/gwas/annotations/gene2phenotype/DDG2P.csv
- path: data/gwas/annotations/gene2phenotype/EyeG2P.csv
- path: data/gwas/annotations/gene2phenotype/SkinG2P.csv
outs:
- path: data/gwas/annotations/all_variant_disease_pmid_associations_final.tsv.gz
- path: data/gwas/annotations/gene2phenotype/AllG2P.csv
# Stage "prepare_vep_plugins": unzips 106.zip into data/ensembl/106/ (unzip -u)
# and renames the extracted VEP_plugins-release-106 directory to plugins.
# The output is tracked as a directory (".dir" md5, nfiles: 77).
prepare_vep_plugins:
cmd: unzip -u data/ensembl/106/106.zip -d data/ensembl/106/ && mv data/ensembl/106/VEP_plugins-release-106
data/ensembl/106/plugins
deps:
- path: data/ensembl/106/106.zip
md5: 24158d25f82878edcacf444da702ed23
size: 519178
outs:
- path: data/ensembl/106/plugins/
md5: 46e9f69002e39c1fb41138fe8ac01710.dir
size: 1470134
nfiles: 77
# Stage "prepare_vep_cache": extracts homo_sapiens_vep_106_GRCh38.tar.gz into
# data/ensembl/106/cache/ and then writes the tarball's md5sum next to it as
# ...tar.gz.md5. The extracted homo_sapiens directory is the tracked output
# (".dir" md5, nfiles: 13892).
# NOTE(review): the ...tar.gz.md5 file written by the command is not listed
# under outs, so it is not tracked by this stage; confirm that is intended.
prepare_vep_cache:
cmd: tar -zxf data/ensembl/106/homo_sapiens_vep_106_GRCh38.tar.gz -C data/ensembl/106/cache/
&& md5sum data/ensembl/106/homo_sapiens_vep_106_GRCh38.tar.gz > data/ensembl/106/homo_sapiens_vep_106_GRCh38.tar.gz.md5
deps:
- path: data/ensembl/106/homo_sapiens_vep_106_GRCh38.tar.gz
md5: f2d4487002a86fe4c39a9ca0fbfab9da
size: 19350569358
outs:
- path: data/ensembl/106/cache/homo_sapiens
md5: f921763987635b40b85cfaf8a3f68048.dir
size: 19621031672
nfiles: 13892
# Stage "prepare_annotations_clinvar": the command only echoes a message; the
# stage pins the md5/size of clinvar.vcf.gz and its .tbi index as deps and
# declares no outs.
prepare_annotations_clinvar:
cmd: echo "Clinvars annotations prepared"
deps:
- path: data/gwas/annotations/clinvar.vcf.gz
md5: 1a0bd5729abb411a388e590431c62f99
size: 57813301
- path: data/gwas/annotations/clinvar.vcf.gz.tbi
md5: 32876337b3454813376f2599b38fd96a
size: 327433
# Stage "prepare_annotations_gene2phenotype": removes any existing AllG2P.csv,
# then concatenates the gene2phenotype CSVs into AllG2P.csv. The awk filter
# (NR == 1 || FNR > 1) keeps the very first line read and drops the first line
# of every subsequent file, so only one header row survives.
# NOTE(review): the command reads every *.csv in the directory, while deps
# list exactly four files; an extra CSV dropped into that directory would
# change the output without being tracked. Confirm that is acceptable.
prepare_annotations_gene2phenotype:
cmd: rm -f data/gwas/annotations/gene2phenotype/AllG2P.csv && awk '(NR == 1) ||
(FNR > 1)' data/gwas/annotations/gene2phenotype/*.csv > data/gwas/annotations/gene2phenotype/AllG2P.csv
deps:
- path: data/gwas/annotations/gene2phenotype/CancerG2P.csv
md5: 6504732dc971b2578c21a8cbd6c2d3ad
size: 19914
- path: data/gwas/annotations/gene2phenotype/DDG2P.csv
md5: d21a0e446a3cc3b3adbae486cd6d11c6
size: 776664
- path: data/gwas/annotations/gene2phenotype/EyeG2P.csv
md5: 9bf5d96096c52df97792b165fbfeda45
size: 221233
- path: data/gwas/annotations/gene2phenotype/SkinG2P.csv
md5: 15e1b25db13f3929357ba3dba60e7e1b
size: 214278
outs:
- path: data/gwas/annotations/gene2phenotype/AllG2P.csv
md5: ccf679127863b2832ccb1ceb2561502a
size: 1231432
# Stage "prepare_annotations_digenet": streams
# all_variant_disease_pmid_associations.tsv.gz through gunzip, skips rows whose
# first field starts with "snpId" or whose second field matches "NA", sorts
# tab-separated by column 2 then numerically by column 3, rewrites "tab
# followed by spaces" to a single tab, and bgzip-compresses the result to
# all_variant_disease_pmid_associations_final.tsv.gz.
prepare_annotations_digenet:
cmd: "gunzip -c data/gwas/annotations/all_variant_disease_pmid_associations.tsv.gz\
\ |\n awk '($1 ~ /^snpId/ || $2 ~ /NA/) {next} {print $0}' |\n sort -t $'\\\
t' -k2,2 -k3,3n |\n awk '{ gsub (/\\t +/, \"\\t\", $0); print}' |\n bgzip\
\ -c > data/gwas/annotations/all_variant_disease_pmid_associations_final.tsv.gz"
deps:
- path: data/gwas/annotations/all_variant_disease_pmid_associations.tsv.gz
md5: a393c9e8e104297aadcb7188f6460eaf
size: 11171100
outs:
- path: data/gwas/annotations/all_variant_disease_pmid_associations_final.tsv.gz
md5: 3d5dea024369a02f300ca3346eedc967
size: 9918420
# Stage "prepare_opencravat": runs `oc module install-base`; no deps or outs
# are recorded. The same command is also the first step of stage
# "install_opencravat".
prepare_opencravat:
cmd: oc module install-base
# Stage "test_opencravat": chains three `oc` commands with && — `oc new
# example-input .`, `oc run ./example_input -l hg38`, then `oc gui` on
# example_input/example_input.sqlite. No deps or outs are recorded.
# NOTE(review): presumably a manual smoke test rather than a reproducible
# pipeline step; confirm.
test_opencravat:
cmd: oc new example-input . && oc run ./example_input -l hg38 && oc gui example_input/example_input.sqlite
# Stage "install_opencravat": runs `oc module install-base`, then
# `oc module install -y` for the listed annotator modules (clinvar ... cvdkp).
# No deps or outs are recorded.
install_opencravat:
cmd: oc module install-base && oc module install -y clinvar clinvar_acmg biogrid
uk10k_cohort uniprot pubmed provean polyphen2 cardioboost civic civic_gene cosmic
cosmic_gene go sift ensembl_regulatory_build dgi cvdkp
# Stage "install_cancer": runs `oc module install` for the cancer-related
# modules listed. No deps or outs are recorded.
# NOTE(review): unlike "install_opencravat" this command has no `-y`, so it
# presumably waits for interactive confirmation; verify. `civic` is also in
# the "install_opencravat" module list.
install_cancer:
cmd: oc module install chasmplus cgc cgl cancer_genome_interpreter cancer_hotspots
civic
# Stage "prepare_annotations_longevity": downloads longevity_genes.zip with
# wget to stdout and pipes it into bsdtar, which extracts only longevity.csv
# into data/gwas/annotations/.
# NOTE(review): the stage has no deps, so a change in the remote archive is not
# detected by DVC; only the md5/size of the extracted CSV is pinned.
# NOTE(review): `--timestamping` is combined with `-O-` (write to stdout);
# presumably it has no effect in that mode. Verify against the wget manual.
prepare_annotations_longevity:
cmd: wget -qO- --timestamping https://genomics.senescence.info/longevity/longevity_genes.zip
| bsdtar -xvf- -C data/gwas/annotations/ longevity.csv
outs:
- path: data/gwas/annotations/longevity.csv
md5: 54e53fe8c1a97282458249e1625beec8
size: 66288