Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/preludes/cluster_prelude.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ galah cluster --ani 95 --precluster-ani 90 --precluster-method finch --genome-fa
# Example: cluster a set of genomes and then their representatives against a set of reference genomes (reduces memory usage against clustering all together)
galah cluster --genome-fasta-directory input_genomes/ --output-representative-list genome_reps.txt
galah cluster --genome-fasta-list genome_reps.txt --reference-genomes-list reference_genomes.txt --output-cluster-definition clusters.tsv
# Example: cluster a large set of genomes using low-memory mode (skani preclusterer only; cannot be combined with reference genomes)
galah cluster --low-memory --genome-fasta-directory input_genomes/ --output-representative-fasta-directory output_directory/
```

### Precluster ANI
Expand Down
22 changes: 22 additions & 0 deletions src/cluster_argument_parsing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ pub struct GalahClustererCommandDefinition {
pub dereplication_large_contigs_argument: String,
pub dereplication_fraglen_argument: String,
pub dereplication_cluster_contigs_argument: String,
pub dereplication_low_memory_argument: String,
pub dereplication_reference_genomes_argument: String,
pub dereplication_reference_genomes_list_argument: String,
// pub dereplication_ani_method_argument: String,
Expand All @@ -153,6 +154,7 @@ lazy_static! {
dereplication_large_contigs_argument: "large-contigs".to_string(),
dereplication_fraglen_argument: "fragment-length".to_string(),
dereplication_cluster_contigs_argument: "cluster-contigs".to_string(),
dereplication_low_memory_argument: "low-memory".to_string(),
dereplication_reference_genomes_argument: "reference-genomes".to_string(),
dereplication_reference_genomes_list_argument: "reference-genomes-list".to_string(),
// dereplication_ani_method_argument: "ani-method".to_string(),
Expand Down Expand Up @@ -430,6 +432,14 @@ pub fn add_dereplication_clustering_parameters_to_section(
.help("Do not use small-genomes settings in skani when clustering contigs. \
Recommended for contigs >= 20kb. Mutually exclusive with --small-contigs."),
)
.flag(
Flag::new()
.long(&format!(
"--{}",
definition.dereplication_low_memory_argument
))
.help("Reduce memory use by sketching to file and searching it instead."),
)
.option(
Opt::new("PATH ...")
.long(&format!(
Expand Down Expand Up @@ -1279,6 +1289,8 @@ pub fn generate_galah_clusterer<'a>(
}),
num_kmers: 1000,
kmer_length: 21,
low_memory: clap_matches
.get_flag(&argument_definition.dereplication_low_memory_argument),
}),
"skani" => Preclusterer::Skani(SkaniPreclusterer {
threshold: {
Expand Down Expand Up @@ -1350,6 +1362,8 @@ pub fn generate_galah_clusterer<'a>(
}),
small_genomes,
threads,
low_memory: clap_matches
.get_flag(&argument_definition.dereplication_low_memory_argument),
}),
_ => panic!("Programming error"),
},
Expand Down Expand Up @@ -1663,15 +1677,23 @@ pub fn add_cluster_subcommand(app: clap::Command) -> clap::Command {
.action(clap::ArgAction::SetTrue)
.requires(&*GALAH_COMMAND_DEFINITION.dereplication_cluster_contigs_argument)
.conflicts_with(&*GALAH_COMMAND_DEFINITION.dereplication_small_contigs_argument))
.arg(Arg::new(&*GALAH_COMMAND_DEFINITION.dereplication_low_memory_argument)
.long(&*GALAH_COMMAND_DEFINITION.dereplication_low_memory_argument)
.help("Reduce memory by sketching all genomes and searching instead of triangle")
.action(clap::ArgAction::SetTrue)
.conflicts_with(&*GALAH_COMMAND_DEFINITION.dereplication_reference_genomes_argument)
.conflicts_with(&*GALAH_COMMAND_DEFINITION.dereplication_reference_genomes_list_argument))
.arg(Arg::new(&*GALAH_COMMAND_DEFINITION.dereplication_reference_genomes_argument)
.long("reference-genomes")
.help("Reference genomes to cluster against. These should be representatives already clustered. Galah will only form clusters across the two groups, never within. Uses less memory than clustering together.")
.value_delimiter(' ')
.num_args(1..)
.conflicts_with(&*GALAH_COMMAND_DEFINITION.dereplication_low_memory_argument)
.conflicts_with(&*GALAH_COMMAND_DEFINITION.dereplication_reference_genomes_list_argument))
.arg(Arg::new(&*GALAH_COMMAND_DEFINITION.dereplication_reference_genomes_list_argument)
.long("reference-genomes-list")
.help("File containing paths to reference genomes (one per line). These should be representatives already clustered. Galah will only form clusters across the two groups, never within. Uses less memory than clustering together.")
.conflicts_with(&*GALAH_COMMAND_DEFINITION.dereplication_low_memory_argument)
.conflicts_with(&*GALAH_COMMAND_DEFINITION.dereplication_reference_genomes_argument))
.arg(Arg::new("threads")
.short('t')
Expand Down
42 changes: 42 additions & 0 deletions src/clusterer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -548,6 +548,7 @@ mod tests {
min_ani: 0.9,
num_kmers: 1000,
kmer_length: 21,
low_memory: false,
},
&crate::fastani::FastaniClusterer {
threshold: 95.0,
Expand Down Expand Up @@ -578,6 +579,7 @@ mod tests {
min_ani: 0.9,
num_kmers: 1000,
kmer_length: 21,
low_memory: false,
},
&crate::fastani::FastaniClusterer {
threshold: 98.0,
Expand Down Expand Up @@ -608,6 +610,7 @@ mod tests {
min_ani: 0.9,
num_kmers: 1000,
kmer_length: 21,
low_memory: false,
},
&crate::fastani::FastaniClusterer {
threshold: 98.0,
Expand Down Expand Up @@ -638,6 +641,7 @@ mod tests {
min_ani: 0.9,
num_kmers: 1000,
kmer_length: 21,
low_memory: false,
},
&crate::skani::SkaniClusterer {
threshold: 95.0,
Expand Down Expand Up @@ -668,6 +672,7 @@ mod tests {
min_ani: 0.9,
num_kmers: 1000,
kmer_length: 21,
low_memory: false,
},
&crate::skani::SkaniClusterer {
threshold: 99.0,
Expand Down Expand Up @@ -699,6 +704,7 @@ mod tests {
min_aligned_threshold: 0.2,
small_genomes: false,
threads: 1,
low_memory: false,
},
&crate::skani::SkaniClusterer {
threshold: 99.0,
Expand Down Expand Up @@ -731,6 +737,41 @@ mod tests {
min_aligned_threshold: 0.2,
small_genomes: false,
threads: 1,
low_memory: false,
},
&crate::skani::SkaniClusterer {
threshold: 99.0,
min_aligned_threshold: 0.2,
small_genomes: false,
},
false,
None,
None,
);
for cluster in clusters.iter_mut() {
cluster.sort_unstable();
}
clusters.sort_unstable();
assert_eq!(vec![vec![0, 1, 3], vec![2], vec![4]], clusters)
}

#[test]
fn test_skani_skani_low_memory() {
init();
let mut clusters = cluster(
&[
"tests/data/abisko4/73.20120800_S1X.13.fna",
"tests/data/abisko4/73.20120600_S2D.19.fna",
"tests/data/abisko4/73.20120700_S3X.12.fna",
"tests/data/abisko4/73.20110800_S2D.13.fna",
"tests/data/antonio_mags/BE_RX_R2_MAG52.fna",
],
&crate::skani::SkaniPreclusterer {
threshold: 90.0,
min_aligned_threshold: 0.2,
small_genomes: false,
threads: 1,
low_memory: true,
},
&crate::skani::SkaniClusterer {
threshold: 99.0,
Expand Down Expand Up @@ -758,6 +799,7 @@ mod tests {
min_aligned_threshold: 0.2,
small_genomes: false,
threads: 1,
low_memory: false,
},
&crate::skani::SkaniClusterer {
threshold: 99.0,
Expand Down
17 changes: 11 additions & 6 deletions src/finch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,21 @@ pub struct FinchPreclusterer {
pub min_ani: f32,
pub num_kmers: usize,
pub kmer_length: u8,
pub low_memory: bool,
}

impl PreclusterDistanceFinder for FinchPreclusterer {
/// Compute precluster distances between the given genome FASTA files.
///
/// Forwards to the module-level `distances` function using this
/// preclusterer's `min_ani`, `num_kmers` and `kmer_length` settings.
///
/// # Panics
///
/// Panics if `self.low_memory` is set, since low-memory clustering is not
/// implemented for this preclusterer.
fn distances(&self, genome_fasta_paths: &[&str]) -> SortedPairGenomeDistanceCache {
    // Guard: reject the unsupported mode before doing any work.
    if self.low_memory {
        panic!("Low-memory clustering currently only supported with skani preclusterer");
    }

    let min_ani = self.min_ani;
    let num_kmers = self.num_kmers;
    let kmer_length = self.kmer_length;
    distances(genome_fasta_paths, min_ani, num_kmers, kmer_length)
}

fn distances_contigs(
Expand Down
10 changes: 10 additions & 0 deletions src/process_argument_parsing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ lazy_static! {
dereplication_small_contigs_argument: "small-contigs".to_string(),
dereplication_large_contigs_argument: "large-contigs".to_string(),
dereplication_fraglen_argument: "fragment-length".to_string(),
dereplication_low_memory_argument: "low-memory".to_string(),
dereplication_reference_genomes_argument: "reference-genomes".to_string(),
dereplication_reference_genomes_list_argument: "reference-genomes-list".to_string(),
dereplication_output_cluster_definition_file: "output-cluster-definition"
Expand Down Expand Up @@ -203,15 +204,23 @@ pub fn add_process_subcommand(app: clap::Command) -> clap::Command {
.action(clap::ArgAction::SetTrue)
.requires(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_cluster_contigs_argument)
.conflicts_with(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_small_contigs_argument))
.arg(Arg::new(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_low_memory_argument)
.long(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_low_memory_argument)
.help("Reduce memory by sketching all genomes and searching instead of triangle")
.action(clap::ArgAction::SetTrue)
.conflicts_with(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_reference_genomes_argument)
.conflicts_with(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_reference_genomes_list_argument))
.arg(Arg::new(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_reference_genomes_argument)
.long("reference-genomes")
.help("Reference genomes to cluster against. These should be representatives already clustered. Galah will only form clusters across the two groups, never within. Uses less memory than clustering together.")
.value_delimiter(' ')
.num_args(1..)
.conflicts_with(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_low_memory_argument)
.conflicts_with(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_reference_genomes_list_argument))
.arg(Arg::new(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_reference_genomes_list_argument)
.long("reference-genomes-list")
.help("File containing paths to reference genomes (one per line). These should be representatives already clustered. Galah will only form clusters across the two groups, never within. Uses less memory than clustering together.")
.conflicts_with(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_low_memory_argument)
.conflicts_with(&*PROCESS_CLUSTER_COMMAND_DEFINITION.dereplication_reference_genomes_argument))
.arg(Arg::new("threads")
.short('t')
Expand Down Expand Up @@ -385,6 +394,7 @@ pub fn process_full_help(program_basename: &str, program_version: &str) -> Manua
dereplication_large_contigs_argument: "large-contigs".to_string(),
dereplication_fraglen_argument: "fragment-length".to_string(),
dereplication_cluster_contigs_argument: "cluster-contigs".to_string(),
dereplication_low_memory_argument: "low-memory".to_string(),
dereplication_reference_genomes_argument: "reference-genomes".to_string(),
dereplication_reference_genomes_list_argument: "reference-genomes-list".to_string(),
dereplication_output_cluster_definition_file: "output-cluster-definition".to_string(),
Expand Down
Loading