-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathanalysis.smk
More file actions
79 lines (59 loc) · 1.44 KB
/
analysis.smk
File metadata and controls
79 lines (59 loc) · 1.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import re
import os
include: 'cgmlst.smk'
include: 'assemble.smk'
# Names of every entry in the ./fastqs directory, in sorted order.
fastq_files = list(os.listdir('fastqs'))
fastq_files.sort()
# Pattern for FASTQ file names. Group 1 is the leading sample name; the
# remaining groups cover an optional `_S<digits>_L001` part, the read tag
# (R1 or R2), an optional `_001`, and a `.fq`/`.fastq` extension with an
# optional `gz` suffix.
# Written as a raw string so `\d` and `\.` reach the regex engine verbatim
# instead of relying on Python passing unknown string escapes through
# (which emits a warning on current interpreters).
match_illumina = r'(.+?)(_S\d+_L001)?_(R[12])(_001)?\.f(ast)?q\.?(gz)?'
sample_pattern = re.compile(match_illumina)
# Sample names derived from every other entry (even indices) of the sorted
# FASTQ listing, with the part matched by `sample_pattern` replaced by its
# first capture group.
# NOTE(review): taking every second file assumes each sample contributes
# exactly two adjacent files in the sorted listing — confirm.
# The slice replaces the former `i % 2 is 0` test, which compared ints by
# identity rather than by value.
genome_samples = [sample_pattern.sub('\\1', f) for f in fastq_files[::2]]
# Master rule: it declares inputs only (no output or action), so requesting
# `analyze` asks Snakemake to produce every file listed below.
# NOTE(review): no rule in this part of the file produces 'pristine.csv' or
# 'calls.csv' — presumably they come from the included .smk files; confirm.
rule analyze:
    input:
        'pristine.csv',
        'cgf_prediction.csv',
        'pristine_distance_matrix.csv',
        'melted_distance_matrix.csv',
        'melted_pristine_distance_matrix.csv',
        'missing_data_histogram.png'
# Runs the external `eCGF` command on the `genomes` directory and writes
# 'cgf_prediction.csv'. The inputs are one 'genomes/<sample>.fasta' per entry
# in `genome_samples`, so all of those files must exist before the command
# runs; the command itself is given the directory name, not the file list.
rule ecgf:
    input:
        expand('genomes/{sample}.fasta', sample=genome_samples)
    output:
        'cgf_prediction.csv'
    shell:
        "eCGF genomes {output}"
# Produces 'distance_matrix.csv' from 'calls.csv' by delegating to the
# external script scripts/hamming_distance_matrix.py (not shown here).
rule distance_matrix:
    input:
        'calls.csv'
    output:
        'distance_matrix.csv'
    script:
        'scripts/hamming_distance_matrix.py'
# Subsets the full distance matrix to the rows and columns whose labels
# appear in the index (first column) of 'pristine.csv', and writes the result
# to 'pristine_distance_matrix.csv'.
rule pristine_distance_matrix:
    input:
        'pristine.csv',
        'distance_matrix.csv'
    output:
        'pristine_distance_matrix.csv'
    run:
        import pandas as pd

        pristine_table = pd.read_csv(input[0], header=0, index_col=0)
        full_matrix = pd.read_csv(input[1], header=0, index_col=0)

        # The same labels select both axes of the square matrix.
        keep = pristine_table.index
        full_matrix.loc[keep, keep].to_csv(output[0])
# Produces 'missing_data_histogram.png' from 'calls.csv' by delegating to the
# external R script scripts/missing_data_histogram.R (not shown here).
rule missing_data_histogram:
    input:
        'calls.csv'
    output:
        'missing_data_histogram.png'
    script:
        'scripts/missing_data_histogram.R'
# Generic rule: for any requested file named 'melted_<dm>', takes the file
# '<dm>' as input and delegates to scripts/melted_distance_matrix.R (not
# shown here) to produce it.
# NOTE(review): the `dm` wildcard is unconstrained, so this rule is a
# candidate for every target whose name starts with 'melted_' — consider a
# wildcard constraint once the full set of intended targets is confirmed.
rule melted_distance_matrix:
    input:
        '{dm}'
    output:
        'melted_{dm}'
    script:
        'scripts/melted_distance_matrix.R'