From 24a9f7f4269332683522424635b26c5d0c496a27 Mon Sep 17 00:00:00 2001 From: theosanderson-agent Date: Thu, 19 Mar 2026 17:09:08 +0000 Subject: [PATCH 01/11] feat(preprocessing): move maxSequencesPerEntry validation from backend to preprocessing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the validation of maxSequencesPerEntry from the backend submission endpoint to the preprocessing pipeline. Previously, when an entry exceeded the allowed number of sequences, the entire batch submission would fail with an HTTP 422 error — causing valid entries in the same batch to also be rejected. By moving this validation to preprocessing, entries that exceed the limit now receive a per-entry processing error instead. This means other valid entries in the same batch are unaffected, which is particularly important for the ingest pipeline where one bad entry should not block the rest. Changes: - Backend: Remove maxSequencesPerEntry check from extractAndValidateFastaIds(), metadataEntryStreamAsSequence(), and revisionEntryStreamAsSequence() - Backend: Remove maxSequencesPerEntry config retrieval from SubmitModel.uploadMetadata() - Preprocessing: Add max_sequences_per_entry config option to Config - Preprocessing: Add check_max_sequences_per_entry() validation in process_all() - Kubernetes: Pass maxSequencesPerEntry from Helm values to preprocessing config - Tests: Update backend tests to reflect removed validation, add preprocessing tests Closes #6165 Co-Authored-By: Claude Opus 4.6 (1M context) --- .../org/loculus/backend/model/SubmitModel.kt | 8 +- .../loculus/backend/utils/MetadataEntry.kt | 29 +--- .../backend/utils/MetadataEntryTest.kt | 133 ++---------------- .../loculus-preprocessing-config.yaml | 5 + .../src/loculus_preprocessing/config.py | 1 + .../src/loculus_preprocessing/prepro.py | 31 ++++ .../tests/test_nextclade_preprocessing.py | 80 +++++++++++ 7 files changed, 133 insertions(+), 154 deletions(-) diff --git 
a/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt b/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt index 082e24cb5e..8fec3462e4 100644 --- a/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt +++ b/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt @@ -256,15 +256,11 @@ class SubmitModel( "from $submissionParams.submitter with UploadId $uploadId" } val now = dateProvider.getCurrentDateTime() - val maxSequencesPerEntry = backendConfig.getInstanceConfig(submissionParams.organism) - .schema - .submissionDataTypes - .maxSequencesPerEntry try { when (submissionParams) { is SubmissionParams.OriginalSubmissionParams -> { - metadataEntryStreamAsSequence(metadataStream, maxSequencesPerEntry) + metadataEntryStreamAsSequence(metadataStream) .chunked(batchSize) .forEach { batch -> uploadDatabaseService.batchInsertMetadataInAuxTable( @@ -280,7 +276,7 @@ class SubmitModel( } is SubmissionParams.RevisionSubmissionParams -> { - revisionEntryStreamAsSequence(metadataStream, maxSequencesPerEntry) + revisionEntryStreamAsSequence(metadataStream) .chunked(batchSize) .forEach { batch -> uploadDatabaseService.batchInsertRevisedMetadataInAuxTable( diff --git a/backend/src/main/kotlin/org/loculus/backend/utils/MetadataEntry.kt b/backend/src/main/kotlin/org/loculus/backend/utils/MetadataEntry.kt index da56271756..96cf5a5039 100644 --- a/backend/src/main/kotlin/org/loculus/backend/utils/MetadataEntry.kt +++ b/backend/src/main/kotlin/org/loculus/backend/utils/MetadataEntry.kt @@ -45,12 +45,7 @@ fun findAndValidateSubmissionIdHeader(headerNames: List): String { return submissionIdHeaders.first() } -fun extractAndValidateFastaIds( - record: CSVRecord, - submissionId: String, - recordNumber: Int, - maxSequencesPerEntry: Int? 
= null, -): Set { +fun extractAndValidateFastaIds(record: CSVRecord, submissionId: String, recordNumber: Int): Set { val headerNames = record.parser.headerNames return when (headerNames.contains(FASTA_IDS_HEADER)) { true -> { @@ -74,14 +69,6 @@ fun extractAndValidateFastaIds( ) } - if (maxSequencesPerEntry != null && fastaIds.size > maxSequencesPerEntry) { - throw UnprocessableEntityException( - "In metadata file: record #$recordNumber with id '$submissionId': " + - "found ${fastaIds.size} fasta ids but the maximum allowed number of " + - "sequences per entry is $maxSequencesPerEntry", - ) - } - fastaIds.toSet() } @@ -134,10 +121,7 @@ private fun throwWithCsvExceptionUnwrapped(e: Exception): Nothing { throw e } -fun metadataEntryStreamAsSequence( - metadataInputStream: InputStream, - maxSequencesPerEntry: Int? = null, -): Sequence { +fun metadataEntryStreamAsSequence(metadataInputStream: InputStream): Sequence { val csvParser = setUpCsvParser(metadataInputStream) val headerNames = csvParser.headerNames @@ -150,7 +134,7 @@ fun metadataEntryStreamAsSequence( val submissionId = getValueAndValidateNoWhitespace(record, submissionIdHeader, recordNumber) - val fastaIds = extractAndValidateFastaIds(record, submissionId, recordNumber, maxSequencesPerEntry) + val fastaIds = extractAndValidateFastaIds(record, submissionId, recordNumber) val metadata = record.toMap().filterKeys { it != submissionIdHeader && @@ -174,10 +158,7 @@ data class RevisionEntry( val fastaIds: Set? = null, ) -fun revisionEntryStreamAsSequence( - metadataInputStream: InputStream, - maxSequencesPerEntry: Int? 
= null, -): Sequence { +fun revisionEntryStreamAsSequence(metadataInputStream: InputStream): Sequence { val csvParser = setUpCsvParser(metadataInputStream) val headerNames = csvParser.headerNames @@ -197,7 +178,7 @@ fun revisionEntryStreamAsSequence( val submissionId = getValueAndValidateNoWhitespace(record, submissionIdHeader, recordNumber) val accession = getValueAndValidateNoWhitespace(record, ACCESSION_HEADER, recordNumber) - val fastaIds = extractAndValidateFastaIds(record, submissionId, recordNumber, maxSequencesPerEntry) + val fastaIds = extractAndValidateFastaIds(record, submissionId, recordNumber) val metadata = record.toMap().filterKeys { it != submissionIdHeader && it != ACCESSION_HEADER && diff --git a/backend/src/test/kotlin/org/loculus/backend/utils/MetadataEntryTest.kt b/backend/src/test/kotlin/org/loculus/backend/utils/MetadataEntryTest.kt index 4ffb9f7cc0..d6ce2e1a27 100644 --- a/backend/src/test/kotlin/org/loculus/backend/utils/MetadataEntryTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/utils/MetadataEntryTest.kt @@ -129,104 +129,18 @@ class MetadataEntryTest { } @Test - fun `test maxSequencesPerEntry not set allows multiple sequences`() { + fun `test multiple fasta IDs are accepted without limit`() { val str = """ submissionId${'\t'}fastaIds${'\t'}Country foo${'\t'}seq1 seq2 seq3${'\t'}bar """.trimIndent() val inputStream = ByteArrayInputStream(str.toByteArray()) - val entries = metadataEntryStreamAsSequence(inputStream, maxSequencesPerEntry = null).toList() - assertThat(entries, hasSize(1)) - assertThat(entries[0].submissionId, equalTo("foo")) - assertThat(entries[0].fastaIds, equalTo(setOf("seq1", "seq2", "seq3"))) - } - - @Test - fun `test maxSequencesPerEntry allows sequences within limit`() { - val str = """ - submissionId${'\t'}fastaIds${'\t'}Country - foo${'\t'}seq1 seq2${'\t'}bar - """.trimIndent() - val inputStream = ByteArrayInputStream(str.toByteArray()) - val entries = metadataEntryStreamAsSequence(inputStream, 
maxSequencesPerEntry = 3).toList() - assertThat(entries, hasSize(1)) - assertThat(entries[0].submissionId, equalTo("foo")) - assertThat(entries[0].fastaIds, equalTo(setOf("seq1", "seq2"))) - } - - @Test - fun `test maxSequencesPerEntry allows sequences at exact limit`() { - val str = """ - submissionId${'\t'}fastaIds${'\t'}Country - foo${'\t'}seq1 seq2 seq3${'\t'}bar - """.trimIndent() - val inputStream = ByteArrayInputStream(str.toByteArray()) - val entries = metadataEntryStreamAsSequence(inputStream, maxSequencesPerEntry = 3).toList() + val entries = metadataEntryStreamAsSequence(inputStream).toList() assertThat(entries, hasSize(1)) assertThat(entries[0].submissionId, equalTo("foo")) assertThat(entries[0].fastaIds, equalTo(setOf("seq1", "seq2", "seq3"))) } - @Test - fun `test maxSequencesPerEntry rejects sequences exceeding limit`() { - val str = """ - submissionId${'\t'}fastaIds${'\t'}Country - foo${'\t'}seq1 seq2 seq3 seq4${'\t'}bar - """.trimIndent() - val inputStream = ByteArrayInputStream(str.toByteArray()) - val exception = assertThrows { - metadataEntryStreamAsSequence(inputStream, maxSequencesPerEntry = 3).toList() - } - assertThat(exception.message, containsString("record #1")) - assertThat(exception.message, containsString("foo")) - assertThat(exception.message, containsString("found 4 fasta ids")) - assertThat(exception.message, containsString("maximum allowed number of sequences per entry is 3")) - } - - @Test - fun `test maxSequencesPerEntry with single sequence limit`() { - val str = """ - submissionId${'\t'}fastaIds${'\t'}Country - foo${'\t'}seq1 seq2${'\t'}bar - """.trimIndent() - val inputStream = ByteArrayInputStream(str.toByteArray()) - val exception = assertThrows { - metadataEntryStreamAsSequence(inputStream, maxSequencesPerEntry = 1).toList() - } - assertThat(exception.message, containsString("record #1")) - assertThat(exception.message, containsString("foo")) - assertThat(exception.message, containsString("found 2 fasta ids")) - 
assertThat(exception.message, containsString("maximum allowed number of sequences per entry is 1")) - } - - @Test - fun `test maxSequencesPerEntry allows single sequence when limit is 1`() { - val str = """ - submissionId${'\t'}fastaIds${'\t'}Country - foo${'\t'}seq1${'\t'}bar - """.trimIndent() - val inputStream = ByteArrayInputStream(str.toByteArray()) - val entries = metadataEntryStreamAsSequence(inputStream, maxSequencesPerEntry = 1).toList() - assertThat(entries, hasSize(1)) - assertThat(entries[0].submissionId, equalTo("foo")) - assertThat(entries[0].fastaIds, equalTo(setOf("seq1"))) - } - - @Test - fun `test maxSequencesPerEntry correct record number for multiple rows`() { - val str = """ - submissionId${'\t'}fastaIds${'\t'}Country - foo1${'\t'}seq1${'\t'}bar - foo2${'\t'}seq2 seq3${'\t'}bar - """.trimIndent() - val inputStream = ByteArrayInputStream(str.toByteArray()) - val exception = assertThrows { - metadataEntryStreamAsSequence(inputStream, maxSequencesPerEntry = 1).toList() - } - assertThat(exception.message, containsString("record #2")) - assertThat(exception.message, containsString("foo2")) - } - @Test fun `test multiple duplicate fasta IDs are all reported`() { val str = """ @@ -245,14 +159,14 @@ class MetadataEntryTest { } @Test - fun `test duplicate detection works with maxSequencesPerEntry`() { + fun `test duplicate detection works`() { val str = """ submissionId${'\t'}fastaIds${'\t'}Country foo${'\t'}seq1 seq1${'\t'}bar """.trimIndent() val inputStream = ByteArrayInputStream(str.toByteArray()) val exception = assertThrows { - metadataEntryStreamAsSequence(inputStream, maxSequencesPerEntry = 3).toList() + metadataEntryStreamAsSequence(inputStream).toList() } assertThat(exception.message, containsString("duplicate fasta ids")) assertThat(exception.message, containsString("seq1")) @@ -349,45 +263,16 @@ class RevisionEntryTest { } @Test - fun `test revision maxSequencesPerEntry allows sequences within limit`() { - val str = """ - 
submissionId${'\t'}accession${'\t'}fastaIds${'\t'}Country - foo${'\t'}ACC123${'\t'}seq1 seq2${'\t'}bar - """.trimIndent() - val inputStream = ByteArrayInputStream(str.toByteArray()) - val entries = revisionEntryStreamAsSequence(inputStream, maxSequencesPerEntry = 3).toList() - assertThat(entries, hasSize(1)) - assertThat(entries[0].submissionId, equalTo("foo")) - assertThat(entries[0].fastaIds, equalTo(setOf("seq1", "seq2"))) - } - - @Test - fun `test revision maxSequencesPerEntry rejects sequences exceeding limit`() { + fun `test revision multiple fasta IDs are accepted`() { val str = """ submissionId${'\t'}accession${'\t'}fastaIds${'\t'}Country foo${'\t'}ACC123${'\t'}seq1 seq2 seq3${'\t'}bar """.trimIndent() val inputStream = ByteArrayInputStream(str.toByteArray()) - val exception = assertThrows { - revisionEntryStreamAsSequence(inputStream, maxSequencesPerEntry = 2).toList() - } - assertThat(exception.message, containsString("record #1")) - assertThat(exception.message, containsString("foo")) - assertThat(exception.message, containsString("found 3 fasta ids")) - assertThat(exception.message, containsString("maximum allowed number of sequences per entry is 2")) - } - - @Test - fun `test revision maxSequencesPerEntry with single sequence limit`() { - val str = """ - submissionId${'\t'}accession${'\t'}fastaIds${'\t'}Country - foo${'\t'}ACC123${'\t'}seq1${'\t'}bar - """.trimIndent() - val inputStream = ByteArrayInputStream(str.toByteArray()) - val entries = revisionEntryStreamAsSequence(inputStream, maxSequencesPerEntry = 1).toList() + val entries = revisionEntryStreamAsSequence(inputStream).toList() assertThat(entries, hasSize(1)) assertThat(entries[0].submissionId, equalTo("foo")) - assertThat(entries[0].fastaIds, equalTo(setOf("seq1"))) + assertThat(entries[0].fastaIds, equalTo(setOf("seq1", "seq2", "seq3"))) } @Test @@ -407,14 +292,14 @@ class RevisionEntryTest { } @Test - fun `test revision duplicate detection works with maxSequencesPerEntry`() { + fun `test 
revision duplicate detection works`() { val str = """ submissionId${'\t'}accession${'\t'}fastaIds${'\t'}Country foo${'\t'}ACC123${'\t'}seq1 seq1${'\t'}bar """.trimIndent() val inputStream = ByteArrayInputStream(str.toByteArray()) val exception = assertThrows { - revisionEntryStreamAsSequence(inputStream, maxSequencesPerEntry = 3).toList() + revisionEntryStreamAsSequence(inputStream).toList() } assertThat(exception.message, containsString("duplicate fasta ids")) assertThat(exception.message, containsString("seq1")) diff --git a/kubernetes/loculus/templates/loculus-preprocessing-config.yaml b/kubernetes/loculus/templates/loculus-preprocessing-config.yaml index b5eb9b4f21..4dbbe65240 100644 --- a/kubernetes/loculus/templates/loculus-preprocessing-config.yaml +++ b/kubernetes/loculus/templates/loculus-preprocessing-config.yaml @@ -15,6 +15,11 @@ data: preprocessing-config.yaml: | organism: {{ $organism }} {{- $processingConfig.configFile | toYaml | nindent 4 }} + {{- if and (hasKey $organismConfig.schema "submissionDataTypes") (hasKey $organismConfig.schema.submissionDataTypes "maxSequencesPerEntry") }} + {{- if $organismConfig.schema.submissionDataTypes.maxSequencesPerEntry }} + max_sequences_per_entry: {{ $organismConfig.schema.submissionDataTypes.maxSequencesPerEntry }} + {{- end }} + {{- end }} processing_spec: {{- $args := dict "metadata" $metadata "referenceGenomes" $organismConfig.referenceGenomes }} {{- include "loculus.preprocessingSpecs" $args | nindent 6 }} diff --git a/preprocessing/nextclade/src/loculus_preprocessing/config.py b/preprocessing/nextclade/src/loculus_preprocessing/config.py index 8e2d088965..082da72382 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/config.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/config.py @@ -101,6 +101,7 @@ class Config(BaseModel): keycloak_token_path: str = "realms/loculus/protocol/openid-connect/token" # noqa: S105 organism: str = "mpox" + max_sequences_per_entry: int | None = None 
segments: list[Segment] = Field(default_factory=list) processing_spec: dict[str, ProcessingSpec] = Field(default_factory=dict) processing_order: tuple[str, ...] = () diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index cf6f34687c..077426bcc0 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -591,6 +591,29 @@ def processed_entry_with_errors(id) -> SubmissionData: ) +def check_max_sequences_per_entry( + accession_version: AccessionVersion, + num_sequences: int, + config: Config, +) -> list[ProcessingAnnotation]: + """Check if the number of sequences exceeds the configured maximum per entry.""" + if ( + config.max_sequences_per_entry is not None + and num_sequences > config.max_sequences_per_entry + ): + return [ + ProcessingAnnotation.from_single( + ProcessingAnnotationAlignment, + AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + message=( + f"Entry has {num_sequences} sequences but the maximum allowed " + f"number of sequences per entry is {config.max_sequences_per_entry}." 
+ ), + ) + ] + return [] + + def process_all( unprocessed: Sequence[UnprocessedEntry], dataset_dir: str, config: Config ) -> Sequence[SubmissionData]: @@ -616,6 +639,14 @@ def process_all( processed_single = processed_entry_with_errors(entry.accessionVersion) processed_results.append(processed_single) + for submission_data in processed_results: + entry = submission_data.processed_entry + num_sequences = len(entry.data.unalignedNucleotideSequences) + max_seq_errors = check_max_sequences_per_entry( + f"{entry.accession}.{entry.version}", num_sequences, config + ) + entry.errors.extend(max_seq_errors) + return processed_results diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py index e880a202a6..d2cc199526 100644 --- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py +++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py @@ -1196,6 +1196,86 @@ def test_preprocessing_multi_segment_none_requirement(test_case_def: Case): ) +def test_max_sequences_per_entry_rejects_exceeding_limit() -> None: + config = get_config(MULTI_SEGMENT_CONFIG, ignore_args=True) + config.max_sequences_per_entry = 1 + + sequence_entry_data = UnprocessedEntry( + accessionVersion="LOC_01.1", + data=UnprocessedData( + group_id=2, + submitter="test_submitter", + submissionId="test_submission_id", + submittedAt=ts_from_ymd(2021, 12, 15), + metadata={}, + unalignedNucleotideSequences={ + "ebola-sudan": sequence_with_mutation("ebola-sudan"), + "ebola-zaire": sequence_with_mutation("ebola-zaire"), + }, + ), + ) + + result = process_all([sequence_entry_data], MULTI_EBOLA_DATASET, config) + processed_entry = result[0].processed_entry + + max_seq_errors = [e for e in processed_entry.errors if "maximum allowed" in e.message] + assert len(max_seq_errors) == 1 + assert "2 sequences" in max_seq_errors[0].message + assert "maximum allowed number of sequences per entry is 1" in max_seq_errors[0].message 
+ + +def test_max_sequences_per_entry_allows_within_limit() -> None: + config = get_config(MULTI_SEGMENT_CONFIG, ignore_args=True) + config.max_sequences_per_entry = 3 + + sequence_entry_data = UnprocessedEntry( + accessionVersion="LOC_01.1", + data=UnprocessedData( + group_id=2, + submitter="test_submitter", + submissionId="test_submission_id", + submittedAt=ts_from_ymd(2021, 12, 15), + metadata={}, + unalignedNucleotideSequences={ + "ebola-sudan": sequence_with_mutation("ebola-sudan"), + "ebola-zaire": sequence_with_mutation("ebola-zaire"), + }, + ), + ) + + result = process_all([sequence_entry_data], MULTI_EBOLA_DATASET, config) + processed_entry = result[0].processed_entry + + max_seq_errors = [e for e in processed_entry.errors if "maximum allowed" in e.message] + assert len(max_seq_errors) == 0 + + +def test_max_sequences_per_entry_not_set_allows_any() -> None: + config = get_config(MULTI_SEGMENT_CONFIG, ignore_args=True) + config.max_sequences_per_entry = None + + sequence_entry_data = UnprocessedEntry( + accessionVersion="LOC_01.1", + data=UnprocessedData( + group_id=2, + submitter="test_submitter", + submissionId="test_submission_id", + submittedAt=ts_from_ymd(2021, 12, 15), + metadata={}, + unalignedNucleotideSequences={ + "ebola-sudan": sequence_with_mutation("ebola-sudan"), + "ebola-zaire": sequence_with_mutation("ebola-zaire"), + }, + ), + ) + + result = process_all([sequence_entry_data], MULTI_EBOLA_DATASET, config) + processed_entry = result[0].processed_entry + + max_seq_errors = [e for e in processed_entry.errors if "maximum allowed" in e.message] + assert len(max_seq_errors) == 0 + + def test_preprocessing_without_metadata() -> None: config = get_config(MULTI_SEGMENT_CONFIG, ignore_args=True) sequence_entry_data = UnprocessedEntry( From e75566d5b2a3152a93932115dcb0217965275efb Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Mon, 23 Mar 2026 17:07:27 +0100 Subject: [PATCH 02/11] use consistent 
structure --- .../src/loculus_preprocessing/prepro.py | 33 +++++++++++++------ 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 077426bcc0..573a2a7fb1 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -501,6 +501,12 @@ def process_single( """Process a single sequence per config""" iupac_errors = errors_if_non_iupac(unprocessed.unalignedNucleotideSequences) + max_seq_errors = check_max_sequences_per_entry( + accession_version, + len(unprocessed.unalignedNucleotideSequences), + config, + ) + alignment_errors, alignment_warnings = alignment_errors_warnings( unprocessed, config, @@ -522,7 +528,15 @@ def process_single( aminoAcidInsertions=unprocessed.aminoAcidInsertions, sequenceNameToFastaId=unprocessed.sequenceNameToFastaId, ), - errors=list(set(unprocessed.errors + iupac_errors + alignment_errors + metadata_errors)), + errors=list( + set( + unprocessed.errors + + iupac_errors + + max_seq_errors + + alignment_errors + + metadata_errors + ) + ), warnings=list(set(unprocessed.warnings + alignment_warnings + metadata_warnings)), ) @@ -546,6 +560,11 @@ def process_single_unaligned( ) unprocessed.unalignedNucleotideSequences = segment_assignment.unalignedNucleotideSequences iupac_errors = errors_if_non_iupac(unprocessed.unalignedNucleotideSequences) + max_seq_errors = check_max_sequences_per_entry( + accession_version, + len(unprocessed.unalignedNucleotideSequences), + config, + ) output_metadata, metadata_errors, metadata_warnings = get_output_metadata( accession_version, unprocessed, config @@ -555,7 +574,9 @@ def process_single_unaligned( accession_version=accession_version, unprocessed=unprocessed, output_metadata=output_metadata, - errors=list(set(iupac_errors + metadata_errors + segment_assignment.alert.errors)), + errors=list( + 
set(iupac_errors + max_seq_errors + metadata_errors + segment_assignment.alert.errors) + ), warnings=list(set(metadata_warnings)), sequenceNameToFastaId=segment_assignment.sequenceNameToFastaId, ) @@ -639,14 +660,6 @@ def process_all( processed_single = processed_entry_with_errors(entry.accessionVersion) processed_results.append(processed_single) - for submission_data in processed_results: - entry = submission_data.processed_entry - num_sequences = len(entry.data.unalignedNucleotideSequences) - max_seq_errors = check_max_sequences_per_entry( - f"{entry.accession}.{entry.version}", num_sequences, config - ) - entry.errors.extend(max_seq_errors) - return processed_results From c844a23a03429e2f9090ff45b7775072e030b6df Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Thu, 26 Mar 2026 11:34:22 +0100 Subject: [PATCH 03/11] check if there are too many sequences before assigning segments --- .../src/loculus_preprocessing/nextclade.py | 56 +++++++++++++------ .../src/loculus_preprocessing/prepro.py | 9 +-- 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py index 33e9b165ae..c51f521975 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py @@ -15,6 +15,8 @@ import pandas as pd from Bio import SeqIO +from loculus_preprocessing.prepro import check_max_sequences_per_entry + from .config import AlignmentRequirement, Config, NextcladeSequenceAndDataset, SequenceName from .datatypes import ( AccessionVersion, @@ -763,7 +765,42 @@ def load_aligned_aa_sequences( return aligned_aminoacid_sequences -def enrich_with_nextclade( # noqa: C901, PLR0914 +def assign_segment_for_alignment( + unprocessed: Sequence[UnprocessedEntry], config: Config, dataset_dir: str +) -> SequenceAssignmentBatch: + errors = {} + for 
entry in unprocessed: + errors[entry.accessionVersion] = check_max_sequences_per_entry( + len(entry.data.unalignedNucleotideSequences), + config, + ) + if not config.multi_datasets: + batch = assign_all_single_segments(unprocessed, config=config) + else: + match config.segment_classification_method: + case SegmentClassificationMethod.DIAMOND: + batch = assign_segment_with_diamond( + unprocessed, config=config, dataset_dir=dataset_dir + ) + case SegmentClassificationMethod.MINIMIZER: + batch = assign_segment_with_nextclade_sort( + unprocessed, config=config, dataset_dir=dataset_dir + ) + case SegmentClassificationMethod.ALIGN: + batch = assign_segment_with_nextclade_align( + unprocessed, config=config, dataset_dir=dataset_dir + ) + batch.alerts = { + id: Alert( + errors=[*batch.alerts[id].errors, error] if error else batch.alerts[id].errors, + warnings=batch.alerts[id].warnings, + ) + for id, error in errors.items() + } + return batch + + +def enrich_with_nextclade( # noqa: PLR0914 unprocessed: Sequence[UnprocessedEntry], dataset_dir: str, config: Config ) -> dict[AccessionVersion, UnprocessedAfterNextclade]: """ @@ -791,22 +828,7 @@ def enrich_with_nextclade( # noqa: C901, PLR0914 for entry in unprocessed } - if not config.multi_datasets: - batch = assign_all_single_segments(unprocessed, config=config) - else: - match config.segment_classification_method: - case SegmentClassificationMethod.DIAMOND: - batch = assign_segment_with_diamond( - unprocessed, config=config, dataset_dir=dataset_dir - ) - case SegmentClassificationMethod.MINIMIZER: - batch = assign_segment_with_nextclade_sort( - unprocessed, config=config, dataset_dir=dataset_dir - ) - case SegmentClassificationMethod.ALIGN: - batch = assign_segment_with_nextclade_align( - unprocessed, config=config, dataset_dir=dataset_dir - ) + batch = assign_segment_for_alignment(unprocessed, config=config, dataset_dir=dataset_dir) unaligned_nucleotide_sequences = batch.unalignedNucleotideSequences segment_assignment_map 
= batch.sequenceNameToFastaId alerts: Alerts = batch.alerts diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 573a2a7fb1..5db57e7b4e 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -502,7 +502,6 @@ def process_single( iupac_errors = errors_if_non_iupac(unprocessed.unalignedNucleotideSequences) max_seq_errors = check_max_sequences_per_entry( - accession_version, len(unprocessed.unalignedNucleotideSequences), config, ) @@ -560,11 +559,6 @@ def process_single_unaligned( ) unprocessed.unalignedNucleotideSequences = segment_assignment.unalignedNucleotideSequences iupac_errors = errors_if_non_iupac(unprocessed.unalignedNucleotideSequences) - max_seq_errors = check_max_sequences_per_entry( - accession_version, - len(unprocessed.unalignedNucleotideSequences), - config, - ) output_metadata, metadata_errors, metadata_warnings = get_output_metadata( accession_version, unprocessed, config @@ -575,7 +569,7 @@ def process_single_unaligned( unprocessed=unprocessed, output_metadata=output_metadata, errors=list( - set(iupac_errors + max_seq_errors + metadata_errors + segment_assignment.alert.errors) + set(iupac_errors + metadata_errors + segment_assignment.alert.errors) ), warnings=list(set(metadata_warnings)), sequenceNameToFastaId=segment_assignment.sequenceNameToFastaId, @@ -613,7 +607,6 @@ def processed_entry_with_errors(id) -> SubmissionData: def check_max_sequences_per_entry( - accession_version: AccessionVersion, num_sequences: int, config: Config, ) -> list[ProcessingAnnotation]: From fb956d64441e43c6622200ea34d4fb84d88b3f10 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Thu, 26 Mar 2026 11:38:41 +0100 Subject: [PATCH 04/11] also allow 0 values --- kubernetes/loculus/templates/loculus-preprocessing-config.yaml | 2 -- 1 file changed, 2 
deletions(-) diff --git a/kubernetes/loculus/templates/loculus-preprocessing-config.yaml b/kubernetes/loculus/templates/loculus-preprocessing-config.yaml index 4dbbe65240..dabe36aa3b 100644 --- a/kubernetes/loculus/templates/loculus-preprocessing-config.yaml +++ b/kubernetes/loculus/templates/loculus-preprocessing-config.yaml @@ -16,10 +16,8 @@ data: organism: {{ $organism }} {{- $processingConfig.configFile | toYaml | nindent 4 }} {{- if and (hasKey $organismConfig.schema "submissionDataTypes") (hasKey $organismConfig.schema.submissionDataTypes "maxSequencesPerEntry") }} - {{- if $organismConfig.schema.submissionDataTypes.maxSequencesPerEntry }} max_sequences_per_entry: {{ $organismConfig.schema.submissionDataTypes.maxSequencesPerEntry }} {{- end }} - {{- end }} processing_spec: {{- $args := dict "metadata" $metadata "referenceGenomes" $organismConfig.referenceGenomes }} {{- include "loculus.preprocessingSpecs" $args | nindent 6 }} From 3ce2e7e1738574ad62349d00fce2f1fa4a65fc23 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Thu, 26 Mar 2026 11:39:17 +0100 Subject: [PATCH 05/11] reformat --- preprocessing/nextclade/src/loculus_preprocessing/prepro.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 5db57e7b4e..8de4b11305 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -568,9 +568,7 @@ def process_single_unaligned( accession_version=accession_version, unprocessed=unprocessed, output_metadata=output_metadata, - errors=list( - set(iupac_errors + metadata_errors + segment_assignment.alert.errors) - ), + errors=list(set(iupac_errors + metadata_errors + segment_assignment.alert.errors)), warnings=list(set(metadata_warnings)), 
sequenceNameToFastaId=segment_assignment.sequenceNameToFastaId, ) From 260831a41664651a216a89ade83d0ca03fa7e92e Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Thu, 26 Mar 2026 11:41:47 +0100 Subject: [PATCH 06/11] fix imports --- .../src/loculus_preprocessing/nextclade.py | 2 +- .../src/loculus_preprocessing/prepro.py | 24 +----------------- .../loculus_preprocessing/sequence_checks.py | 25 +++++++++++++++++++ 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py index c51f521975..4785d8fa25 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py @@ -15,7 +15,7 @@ import pandas as pd from Bio import SeqIO -from loculus_preprocessing.prepro import check_max_sequences_per_entry +from loculus_preprocessing.sequence_checks import check_max_sequences_per_entry from .config import AlignmentRequirement, Config, NextcladeSequenceAndDataset, SequenceName from .datatypes import ( diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 8de4b11305..7f57830c5f 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -63,7 +63,7 @@ process_phenotype_values, process_stop_codons, ) -from .sequence_checks import errors_if_non_iupac +from .sequence_checks import check_max_sequences_per_entry, errors_if_non_iupac logger = logging.getLogger(__name__) @@ -604,28 +604,6 @@ def processed_entry_with_errors(id) -> SubmissionData: ) -def check_max_sequences_per_entry( - num_sequences: int, - config: Config, -) -> list[ProcessingAnnotation]: - """Check if the number of sequences exceeds the configured maximum per entry.""" - if ( - 
config.max_sequences_per_entry is not None - and num_sequences > config.max_sequences_per_entry - ): - return [ - ProcessingAnnotation.from_single( - ProcessingAnnotationAlignment, - AnnotationSourceType.NUCLEOTIDE_SEQUENCE, - message=( - f"Entry has {num_sequences} sequences but the maximum allowed " - f"number of sequences per entry is {config.max_sequences_per_entry}." - ), - ) - ] - return [] - - def process_all( unprocessed: Sequence[UnprocessedEntry], dataset_dir: str, config: Config ) -> Sequence[SubmissionData]: diff --git a/preprocessing/nextclade/src/loculus_preprocessing/sequence_checks.py b/preprocessing/nextclade/src/loculus_preprocessing/sequence_checks.py index 4bd9c50674..456c7dfe07 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/sequence_checks.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/sequence_checks.py @@ -1,7 +1,10 @@ +from loculus_preprocessing.config import Config + from .datatypes import ( AnnotationSourceType, NucleotideSequence, ProcessingAnnotation, + ProcessingAnnotationAlignment, SegmentName, ) @@ -48,3 +51,25 @@ def errors_if_non_iupac( ) ) return errors + + +def check_max_sequences_per_entry( + num_sequences: int, + config: Config, +) -> list[ProcessingAnnotation]: + """Check if the number of sequences exceeds the configured maximum per entry.""" + if ( + config.max_sequences_per_entry is not None + and num_sequences > config.max_sequences_per_entry + ): + return [ + ProcessingAnnotation.from_single( + ProcessingAnnotationAlignment, + AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + message=( + f"Entry has {num_sequences} sequences but the maximum allowed " + f"number of sequences per entry is {config.max_sequences_per_entry}." 
+ ), + ) + ] + return [] From 1fae50f22837bb29523f8167d99ecf08df34806b Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Thu, 26 Mar 2026 11:46:36 +0100 Subject: [PATCH 07/11] fix tests --- preprocessing/nextclade/src/loculus_preprocessing/nextclade.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py index 4785d8fa25..211bcacd02 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py @@ -792,7 +792,7 @@ def assign_segment_for_alignment( ) batch.alerts = { id: Alert( - errors=[*batch.alerts[id].errors, error] if error else batch.alerts[id].errors, + errors=[*batch.alerts[id].errors, *error] if error else batch.alerts[id].errors, warnings=batch.alerts[id].warnings, ) for id, error in errors.items() From 641c503e5f53600fd843c6233ab2cf1004dc750d Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Thu, 26 Mar 2026 11:56:33 +0100 Subject: [PATCH 08/11] add docs --- preprocessing/specification.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/preprocessing/specification.md b/preprocessing/specification.md index b7d360712b..28709a1f98 100644 --- a/preprocessing/specification.md +++ b/preprocessing/specification.md @@ -177,6 +177,16 @@ Nucleotide insertions for a multi-segmented organism: } ``` +#### SequenceNameToFastaId map + +The preprocessing pipeline is expected to classify which segment/reference each sequence best aligns to and return this assignment in the `sequenceNameToFastaId` field. 
The `sequenceName` should use the segment-reference structure expected by the backend (and query engine): + + - For organisms without multiple references the `sequenceName` is the name of the segment (the segment name `main` is used for the single segment edge case). + - For single-segmented, multi-reference organisms the `sequenceName` is the name of the reference. + - For multi-segment, multi-reference organisms the `sequenceName` is `{segmentName}-{referenceName}`. + +Additionally, the pipeline will receive a `max_sequences_per_entry` parameter via the config and is expected to return an error if the submission entry contains more sequences than are allowed per submission entry. + ## Reprocessing The backend accepts processed data from pipelines that have the current or newer version. It will automatically switch to a newer pipeline version if the newer version has successfully processed all sequences that had also been successfully processed by the current version. From 0e3650983d1805d32600e35eaf3b354c4bd45f8c Mon Sep 17 00:00:00 2001 From: theosanderson-agent Date: Thu, 26 Mar 2026 11:04:26 +0000 Subject: [PATCH 09/11] test(preprocessing): add batch isolation test for maxSequencesPerEntry Add test_max_sequences_per_entry_batch_isolation that verifies when one entry in a batch exceeds the sequence limit, only that entry receives errors while other entries in the same batch succeed without max-sequence errors.
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../tests/test_nextclade_preprocessing.py | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py index d2cc199526..c1b39ffcdd 100644 --- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py +++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py @@ -1276,6 +1276,61 @@ def test_max_sequences_per_entry_not_set_allows_any() -> None: assert len(max_seq_errors) == 0 +def test_max_sequences_per_entry_batch_isolation() -> None: + """If one entry in a batch exceeds maxSequencesPerEntry, only that entry is flagged; + other entries in the same batch should succeed without max-sequence errors.""" + config = get_config(MULTI_SEGMENT_CONFIG, ignore_args=True) + config.max_sequences_per_entry = 1 + + # Entry with too many sequences (2 sequences, limit is 1) + bad_entry = UnprocessedEntry( + accessionVersion="LOC_01.1", + data=UnprocessedData( + group_id=2, + submitter="test_submitter", + submissionId="test_submission_id", + submittedAt=ts_from_ymd(2021, 12, 15), + metadata={}, + unalignedNucleotideSequences={ + "ebola-sudan": sequence_with_mutation("ebola-sudan"), + "ebola-zaire": sequence_with_mutation("ebola-zaire"), + }, + ), + ) + + # Entry with acceptable number of sequences (1 sequence, limit is 1) + good_entry = UnprocessedEntry( + accessionVersion="LOC_02.1", + data=UnprocessedData( + group_id=2, + submitter="test_submitter", + submissionId="test_submission_id", + submittedAt=ts_from_ymd(2021, 12, 15), + metadata={}, + unalignedNucleotideSequences={ + "ebola-sudan": sequence_with_mutation("ebola-sudan"), + }, + ), + ) + + result = process_all([bad_entry, good_entry], MULTI_EBOLA_DATASET, config) + + # The entry with too many sequences should have a max-sequence error + bad_result = [r for r in result if r.processed_entry.accession == "LOC_01"][0] + 
bad_max_errors = [ + e for e in bad_result.processed_entry.errors if "maximum allowed" in e.message + ] + assert len(bad_max_errors) == 1 + assert "2 sequences" in bad_max_errors[0].message + + # The entry within the limit should have no max-sequence errors + good_result = [r for r in result if r.processed_entry.accession == "LOC_02"][0] + good_max_errors = [ + e for e in good_result.processed_entry.errors if "maximum allowed" in e.message + ] + assert len(good_max_errors) == 0 + + def test_preprocessing_without_metadata() -> None: config = get_config(MULTI_SEGMENT_CONFIG, ignore_args=True) sequence_entry_data = UnprocessedEntry( From 0cc89d6c110c52f927140f291b91fc42060f15e3 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Thu, 26 Mar 2026 12:07:34 +0100 Subject: [PATCH 10/11] remove unnecessary tests --- .../tests/test_nextclade_preprocessing.py | 88 +------------------ 1 file changed, 2 insertions(+), 86 deletions(-) diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py index c1b39ffcdd..91df9cf5b8 100644 --- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py +++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py @@ -1196,93 +1196,12 @@ def test_preprocessing_multi_segment_none_requirement(test_case_def: Case): ) -def test_max_sequences_per_entry_rejects_exceeding_limit() -> None: - config = get_config(MULTI_SEGMENT_CONFIG, ignore_args=True) - config.max_sequences_per_entry = 1 - - sequence_entry_data = UnprocessedEntry( - accessionVersion="LOC_01.1", - data=UnprocessedData( - group_id=2, - submitter="test_submitter", - submissionId="test_submission_id", - submittedAt=ts_from_ymd(2021, 12, 15), - metadata={}, - unalignedNucleotideSequences={ - "ebola-sudan": sequence_with_mutation("ebola-sudan"), - "ebola-zaire": sequence_with_mutation("ebola-zaire"), - }, - ), - ) - - result = 
process_all([sequence_entry_data], MULTI_EBOLA_DATASET, config) - processed_entry = result[0].processed_entry - - max_seq_errors = [e for e in processed_entry.errors if "maximum allowed" in e.message] - assert len(max_seq_errors) == 1 - assert "2 sequences" in max_seq_errors[0].message - assert "maximum allowed number of sequences per entry is 1" in max_seq_errors[0].message - - -def test_max_sequences_per_entry_allows_within_limit() -> None: - config = get_config(MULTI_SEGMENT_CONFIG, ignore_args=True) - config.max_sequences_per_entry = 3 - - sequence_entry_data = UnprocessedEntry( - accessionVersion="LOC_01.1", - data=UnprocessedData( - group_id=2, - submitter="test_submitter", - submissionId="test_submission_id", - submittedAt=ts_from_ymd(2021, 12, 15), - metadata={}, - unalignedNucleotideSequences={ - "ebola-sudan": sequence_with_mutation("ebola-sudan"), - "ebola-zaire": sequence_with_mutation("ebola-zaire"), - }, - ), - ) - - result = process_all([sequence_entry_data], MULTI_EBOLA_DATASET, config) - processed_entry = result[0].processed_entry - - max_seq_errors = [e for e in processed_entry.errors if "maximum allowed" in e.message] - assert len(max_seq_errors) == 0 - - -def test_max_sequences_per_entry_not_set_allows_any() -> None: - config = get_config(MULTI_SEGMENT_CONFIG, ignore_args=True) - config.max_sequences_per_entry = None - - sequence_entry_data = UnprocessedEntry( - accessionVersion="LOC_01.1", - data=UnprocessedData( - group_id=2, - submitter="test_submitter", - submissionId="test_submission_id", - submittedAt=ts_from_ymd(2021, 12, 15), - metadata={}, - unalignedNucleotideSequences={ - "ebola-sudan": sequence_with_mutation("ebola-sudan"), - "ebola-zaire": sequence_with_mutation("ebola-zaire"), - }, - ), - ) - - result = process_all([sequence_entry_data], MULTI_EBOLA_DATASET, config) - processed_entry = result[0].processed_entry - - max_seq_errors = [e for e in processed_entry.errors if "maximum allowed" in e.message] - assert len(max_seq_errors) == 
0 - - def test_max_sequences_per_entry_batch_isolation() -> None: """If one entry in a batch exceeds maxSequencesPerEntry, only that entry is flagged; other entries in the same batch should succeed without max-sequence errors.""" config = get_config(MULTI_SEGMENT_CONFIG, ignore_args=True) config.max_sequences_per_entry = 1 - # Entry with too many sequences (2 sequences, limit is 1) bad_entry = UnprocessedEntry( accessionVersion="LOC_01.1", data=UnprocessedData( @@ -1298,7 +1217,6 @@ def test_max_sequences_per_entry_batch_isolation() -> None: ), ) - # Entry with acceptable number of sequences (1 sequence, limit is 1) good_entry = UnprocessedEntry( accessionVersion="LOC_02.1", data=UnprocessedData( @@ -1315,16 +1233,14 @@ def test_max_sequences_per_entry_batch_isolation() -> None: result = process_all([bad_entry, good_entry], MULTI_EBOLA_DATASET, config) - # The entry with too many sequences should have a max-sequence error - bad_result = [r for r in result if r.processed_entry.accession == "LOC_01"][0] + bad_result = next(r for r in result if r.processed_entry.accession == "LOC_01") bad_max_errors = [ e for e in bad_result.processed_entry.errors if "maximum allowed" in e.message ] assert len(bad_max_errors) == 1 assert "2 sequences" in bad_max_errors[0].message - # The entry within the limit should have no max-sequence errors - good_result = [r for r in result if r.processed_entry.accession == "LOC_02"][0] + good_result = next(r for r in result if r.processed_entry.accession == "LOC_02") good_max_errors = [ e for e in good_result.processed_entry.errors if "maximum allowed" in e.message ] From 20a0de3a12f81a70a757c526970535e3c67260a6 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Thu, 26 Mar 2026 15:13:42 +0100 Subject: [PATCH 11/11] rename function --- .../nextclade/src/loculus_preprocessing/nextclade.py | 4 ++-- preprocessing/nextclade/src/loculus_preprocessing/prepro.py | 4 ++-- 
.../nextclade/src/loculus_preprocessing/sequence_checks.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py index 211bcacd02..742bf07643 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py @@ -15,7 +15,7 @@ import pandas as pd from Bio import SeqIO -from loculus_preprocessing.sequence_checks import check_max_sequences_per_entry +from loculus_preprocessing.sequence_checks import error_on_excess_sequences from .config import AlignmentRequirement, Config, NextcladeSequenceAndDataset, SequenceName from .datatypes import ( @@ -770,7 +770,7 @@ def assign_segment_for_alignment( ) -> SequenceAssignmentBatch: errors = {} for entry in unprocessed: - errors[entry.accessionVersion] = check_max_sequences_per_entry( + errors[entry.accessionVersion] = error_on_excess_sequences( len(entry.data.unalignedNucleotideSequences), config, ) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 7f57830c5f..8ce20d3175 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -63,7 +63,7 @@ process_phenotype_values, process_stop_codons, ) -from .sequence_checks import check_max_sequences_per_entry, errors_if_non_iupac +from .sequence_checks import error_on_excess_sequences, errors_if_non_iupac logger = logging.getLogger(__name__) @@ -501,7 +501,7 @@ def process_single( """Process a single sequence per config""" iupac_errors = errors_if_non_iupac(unprocessed.unalignedNucleotideSequences) - max_seq_errors = check_max_sequences_per_entry( + max_seq_errors = error_on_excess_sequences( len(unprocessed.unalignedNucleotideSequences), config, ) diff --git 
a/preprocessing/nextclade/src/loculus_preprocessing/sequence_checks.py b/preprocessing/nextclade/src/loculus_preprocessing/sequence_checks.py index 456c7dfe07..447d562002 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/sequence_checks.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/sequence_checks.py @@ -53,7 +53,7 @@ def errors_if_non_iupac( return errors -def check_max_sequences_per_entry( +def error_on_excess_sequences( num_sequences: int, config: Config, ) -> list[ProcessingAnnotation]: