From 0aa01495796f3669504f95d5821c32ac28e76886 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Wed, 6 Aug 2025 16:35:53 +0200 Subject: [PATCH 01/26] feat(backend): remove fasta header validation from backend and refactor how fasta and metadata are merged --- .../org/loculus/backend/model/SubmitModel.kt | 73 ++++++++++--- .../submission/SequenceUploadAuxTable.kt | 4 +- .../submission/UploadDatabaseService.kt | 22 ++-- .../loculus/backend/utils/ParseFastaHeader.kt | 40 ------- .../V1.16__rename_aux_table_columns.sql | 6 ++ .../submission/PreparedOriginalData.kt | 2 +- .../SubmitEndpointSingleSegmentedTest.kt | 27 +---- .../submission/SubmitEndpointTest.kt | 100 +++--------------- 8 files changed, 92 insertions(+), 182 deletions(-) delete mode 100644 backend/src/main/kotlin/org/loculus/backend/utils/ParseFastaHeader.kt create mode 100644 backend/src/main/resources/db/migration/V1.16__rename_aux_table_columns.sql diff --git a/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt b/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt index cf447175b7..c6b9cbba5c 100644 --- a/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt +++ b/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt @@ -4,6 +4,8 @@ import mu.KotlinLogging import org.apache.commons.compress.archivers.zip.ZipFile import org.apache.commons.compress.compressors.CompressorStreamFactory import org.jetbrains.exposed.exceptions.ExposedSQLException +import org.jetbrains.exposed.sql.transactions.transaction +import org.jetbrains.exposed.sql.update import org.loculus.backend.api.DataUseTerms import org.loculus.backend.api.Organism import org.loculus.backend.api.SubmissionIdFilesMap @@ -21,6 +23,7 @@ import org.loculus.backend.service.groupmanagement.GroupManagementPreconditionVa import org.loculus.backend.service.submission.CompressionAlgorithm import 
org.loculus.backend.service.submission.MetadataUploadAuxTable import org.loculus.backend.service.submission.SequenceUploadAuxTable +import org.loculus.backend.service.submission.SequenceUploadAuxTable.metadataSubmissionIdColumn import org.loculus.backend.service.submission.SubmissionIdFilesMappingPreconditionValidator import org.loculus.backend.service.submission.UploadDatabaseService import org.loculus.backend.utils.DateProvider @@ -132,7 +135,7 @@ class SubmitModel( if (requiresConsensusSequenceFile(submissionParams.organism)) { log.debug { "Validating submission with uploadId $uploadId" } val sequenceSubmissionIds = uploadDatabaseService.getSequenceUploadSubmissionIds(uploadId).toSet() - validateSubmissionIdSetsForConsensusSequences(metadataSubmissionIds, sequenceSubmissionIds) + mapMetadataKeysToSequenceKeys(metadataSubmissionIds, sequenceSubmissionIds) } if (submissionParams is SubmissionParams.RevisionSubmissionParams) { @@ -348,27 +351,67 @@ class SubmitModel( ) } - private fun validateSubmissionIdSetsForConsensusSequences( - metadataKeysSet: Set, - sequenceKeysSet: Set, - ) { - val metadataKeysNotInSequences = metadataKeysSet.subtract(sequenceKeysSet) - val sequenceKeysNotInMetadata = sequenceKeysSet.subtract(metadataKeysSet) + private fun SubmissionId.removeSuffixPattern(): SubmissionId { + val lastDelimiter = this.lastIndexOf("_") + if (lastDelimiter == -1) { + return this + } + val cleaned = this.substring(0, lastDelimiter) + return cleaned + } + + @Transactional + private fun mapMetadataKeysToSequenceKeys(metadataKeysSet: Set, sequenceKeysSet: Set) { + val metadataKeyToSequences = mutableMapOf>() + val unmatchedSequenceKeys = mutableSetOf() + + for (seqKey in sequenceKeysSet) { + val baseKey = seqKey.removeSuffixPattern() + val matchedMetadataKey = when { + metadataKeysSet.contains(seqKey) -> seqKey + metadataKeysSet.contains(baseKey) -> baseKey + else -> null + } - if (metadataKeysNotInSequences.isNotEmpty() || sequenceKeysNotInMetadata.isNotEmpty()) { 
- val metadataNotPresentErrorText = if (metadataKeysNotInSequences.isNotEmpty()) { - "Metadata file contains ${metadataKeysNotInSequences.size} ids that are not present " + - "in the sequence file: " + metadataKeysNotInSequences.toList().joinToString(limit = 10) + "; " + if (matchedMetadataKey != null) { + metadataKeyToSequences.computeIfAbsent(matchedMetadataKey) { mutableListOf() }.add(seqKey) + } else { + unmatchedSequenceKeys.add(seqKey) + } + } + + val metadataKeysWithoutSequences = metadataKeysSet.filterNot { metadataKeyToSequences.containsKey(it) } + + if (unmatchedSequenceKeys.isNotEmpty() || metadataKeysWithoutSequences.isNotEmpty()) { + val unmatchedSeqText = if (unmatchedSequenceKeys.isNotEmpty()) { + "Sequence file contains ${unmatchedSequenceKeys.size} ids that are not present in the metadata file: ${ + unmatchedSequenceKeys.joinToString(limit = 10) + }; " } else { "" } - val sequenceNotPresentErrorText = if (sequenceKeysNotInMetadata.isNotEmpty()) { - "Sequence file contains ${sequenceKeysNotInMetadata.size} ids that are not present " + - "in the metadata file: " + sequenceKeysNotInMetadata.toList().joinToString(limit = 10) + val unmatchedMetadataText = if (metadataKeysWithoutSequences.isNotEmpty()) { + "Metadata file contains ${metadataKeysWithoutSequences.size} ids that are not present in " + + "the sequence file: ${metadataKeysWithoutSequences.joinToString(limit = 10)};" } else { "" } - throw UnprocessableEntityException(metadataNotPresentErrorText + sequenceNotPresentErrorText) + throw UnprocessableEntityException(unmatchedSeqText + unmatchedMetadataText) + } + + transaction { + for ((metadataSubmissionId, sequenceSubmissionIds) in metadataKeyToSequences) { + for (sequenceSubmissionId in sequenceSubmissionIds) { + SequenceUploadAuxTable.update( + { + SequenceUploadAuxTable.sequenceSubmissionIdColumn eq + sequenceSubmissionId + }, + ) { + it[metadataSubmissionIdColumn] = metadataSubmissionId + } + } + } } } diff --git 
a/backend/src/main/kotlin/org/loculus/backend/service/submission/SequenceUploadAuxTable.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/SequenceUploadAuxTable.kt index 9f931ac91b..73e973f323 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/SequenceUploadAuxTable.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/SequenceUploadAuxTable.kt @@ -8,8 +8,8 @@ const val SEQUENCE_UPLOAD_AUX_TABLE_NAME = "sequence_upload_aux_table" object SequenceUploadAuxTable : Table(SEQUENCE_UPLOAD_AUX_TABLE_NAME) { val sequenceUploadIdColumn = varchar("upload_id", 255) val sequenceSubmissionIdColumn = varchar("submission_id", 255) - val segmentNameColumn = varchar("segment_name", 255) + val metadataSubmissionIdColumn = varchar("metadata_submission_id", 255) val compressedSequenceDataColumn = jacksonSerializableJsonb("compressed_sequence_data") - override val primaryKey = PrimaryKey(sequenceUploadIdColumn, sequenceSubmissionIdColumn, segmentNameColumn) + override val primaryKey = PrimaryKey(sequenceUploadIdColumn, sequenceSubmissionIdColumn) } diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt index ca9cef8919..e40e4f5e01 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt @@ -30,13 +30,12 @@ import org.loculus.backend.service.submission.MetadataUploadAuxTable.submitterCo import org.loculus.backend.service.submission.MetadataUploadAuxTable.uploadIdColumn import org.loculus.backend.service.submission.MetadataUploadAuxTable.uploadedAtColumn import org.loculus.backend.service.submission.SequenceUploadAuxTable.compressedSequenceDataColumn -import org.loculus.backend.service.submission.SequenceUploadAuxTable.segmentNameColumn +import 
org.loculus.backend.service.submission.SequenceUploadAuxTable.metadataSubmissionIdColumn import org.loculus.backend.service.submission.SequenceUploadAuxTable.sequenceSubmissionIdColumn import org.loculus.backend.service.submission.SequenceUploadAuxTable.sequenceUploadIdColumn import org.loculus.backend.utils.DatabaseConstants import org.loculus.backend.utils.FastaEntry import org.loculus.backend.utils.MetadataEntry -import org.loculus.backend.utils.ParseFastaHeader import org.loculus.backend.utils.RevisionEntry import org.loculus.backend.utils.chunkedForDatabase import org.loculus.backend.utils.getNextSequenceNumbers @@ -54,7 +53,6 @@ private const val METADATA_BATCH_SIZE = DatabaseConstants.POSTGRESQL_PARAMETER_L @Service @Transactional class UploadDatabaseService( - private val parseFastaHeader: ParseFastaHeader, private val compressor: CompressionService, private val accessionPreconditionValidator: AccessionPreconditionValidator, private val dataUseTermsDatabaseService: DataUseTermsDatabaseService, @@ -112,20 +110,16 @@ class UploadDatabaseService( submittedOrganism: Organism, uploadedSequencesBatch: List, ) { - uploadedSequencesBatch.chunkedForDatabase({ batch -> - SequenceUploadAuxTable.batchInsert(batch) { - val (submissionId, segmentName) = parseFastaHeader.parse(it.sampleName, submittedOrganism) - this[sequenceSubmissionIdColumn] = submissionId - this[segmentNameColumn] = segmentName + SequenceUploadAuxTable.batchInsert(uploadedSequencesBatch) { + this[sequenceSubmissionIdColumn] = it.sampleName + this[metadataSubmissionIdColumn] = it.sampleName this[sequenceUploadIdColumn] = uploadId this[compressedSequenceDataColumn] = compressor.compressNucleotideSequence( it.sequence, - segmentName, + it.sampleName, submittedOrganism, ) } - emptyList() - }, SEQUENCE_INSERT_COLUMNS) } fun getMetadataUploadSubmissionIds(uploadId: String): List = MetadataUploadAuxTable @@ -176,9 +170,9 @@ class UploadDatabaseService( 'unalignedNucleotideSequences', COALESCE( 
jsonb_object_agg( - sequence_upload_aux_table.segment_name, + sequence_upload_aux_table.submission_id, sequence_upload_aux_table.compressed_sequence_data::jsonb - ) FILTER (WHERE sequence_upload_aux_table.segment_name IS NOT NULL), + ) FILTER (WHERE sequence_upload_aux_table.submission_id IS NOT NULL), '{}'::jsonb ) ) @@ -187,7 +181,7 @@ class UploadDatabaseService( LEFT JOIN sequence_upload_aux_table ON metadata_upload_aux_table.upload_id = sequence_upload_aux_table.upload_id - AND metadata_upload_aux_table.submission_id = sequence_upload_aux_table.submission_id + AND metadata_upload_aux_table.submission_id = sequence_upload_aux_table.metadata_submission_id WHERE metadata_upload_aux_table.upload_id = ? GROUP BY metadata_upload_aux_table.upload_id, diff --git a/backend/src/main/kotlin/org/loculus/backend/utils/ParseFastaHeader.kt b/backend/src/main/kotlin/org/loculus/backend/utils/ParseFastaHeader.kt deleted file mode 100644 index 4815e61e2b..0000000000 --- a/backend/src/main/kotlin/org/loculus/backend/utils/ParseFastaHeader.kt +++ /dev/null @@ -1,40 +0,0 @@ -package org.loculus.backend.utils - -import org.loculus.backend.api.Organism -import org.loculus.backend.config.BackendConfig -import org.loculus.backend.controller.BadRequestException -import org.loculus.backend.model.HEADER_TO_CONNECT_METADATA_AND_SEQUENCES -import org.loculus.backend.model.SegmentName -import org.loculus.backend.model.SubmissionId -import org.springframework.stereotype.Service - -@Service -class ParseFastaHeader(private val backendConfig: BackendConfig) { - fun parse(submissionId: String, organism: Organism): Pair { - val referenceGenome = backendConfig.getInstanceConfig(organism).referenceGenome - - if (referenceGenome.nucleotideSequences.size == 1) { - return Pair(submissionId, "main") - } - - val validSegmentIds = referenceGenome.nucleotideSequences.map { it.name } - - val lastDelimiter = submissionId.lastIndexOf("_") - if (lastDelimiter == -1) { - throw BadRequestException( - "The FASTA 
header $submissionId does not contain the segment name. Please provide the" + - " segment name in the format <$HEADER_TO_CONNECT_METADATA_AND_SEQUENCES>_", - ) - } - val isolateId = submissionId.substring(0, lastDelimiter) - val segmentId = submissionId.substring(lastDelimiter + 1) - if (!validSegmentIds.contains(segmentId)) { - throw BadRequestException( - "The FASTA header $submissionId ends with the segment name $segmentId, which is not valid. " + - "Valid segment names: ${validSegmentIds.joinToString(", ")}", - ) - } - - return Pair(isolateId, segmentId) - } -} diff --git a/backend/src/main/resources/db/migration/V1.16__rename_aux_table_columns.sql b/backend/src/main/resources/db/migration/V1.16__rename_aux_table_columns.sql new file mode 100644 index 0000000000..69b8c2e351 --- /dev/null +++ b/backend/src/main/resources/db/migration/V1.16__rename_aux_table_columns.sql @@ -0,0 +1,6 @@ +alter table sequence_upload_aux_table drop CONSTRAINT sequence_upload_aux_table_pkey; + +alter table sequence_upload_aux_table drop column segment_name; +alter table sequence_upload_aux_table add column metadata_submission_id text; + +alter table sequence_upload_aux_table add PRIMARY KEY (upload_id, submission_id); diff --git a/backend/src/test/kotlin/org/loculus/backend/controller/submission/PreparedOriginalData.kt b/backend/src/test/kotlin/org/loculus/backend/controller/submission/PreparedOriginalData.kt index 20f05380bc..525cab15b0 100644 --- a/backend/src/test/kotlin/org/loculus/backend/controller/submission/PreparedOriginalData.kt +++ b/backend/src/test/kotlin/org/loculus/backend/controller/submission/PreparedOriginalData.kt @@ -11,7 +11,7 @@ val defaultOriginalData = OriginalData( "country" to "Switzerland", "division" to "Bern", ), - mapOf("main" to "ACTG"), + mapOf("custom0" to "ACTG"), ) val emptyOriginalData = OriginalData(metadata = emptyMap(), unalignedNucleotideSequences = emptyMap()) diff --git 
a/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointSingleSegmentedTest.kt b/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointSingleSegmentedTest.kt index 45c0595cdc..5fa453dae9 100644 --- a/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointSingleSegmentedTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointSingleSegmentedTest.kt @@ -60,31 +60,6 @@ class SubmitEndpointSingleSegmentedTest( .data .unalignedNucleotideSequences - assertThat(unalignedNucleotideSequences, hasEntry(DEFAULT_SEQUENCE_NAME, "AC")) - } - - @Test - fun `GIVEN input data with explicit default segment name THEN data is rejected`() { - val groupId = groupManagementClient.createNewGroup().andGetGroupId() - val expectedDetail = "Metadata file contains 1 ids that are not present in the sequence file: header1" - - submissionControllerClient.submit( - SubmitFiles.metadataFileWith( - content = """ - submissionId firstColumn - header1 someValue - """.trimIndent(), - ), - SubmitFiles.sequenceFileWith( - content = """ - >header1_$DEFAULT_SEQUENCE_NAME - AC - """.trimIndent(), - ), - groupId = groupId, - ) - .andExpect(status().isUnprocessableEntity) - .andExpect(jsonPath("\$.title").value("Unprocessable Entity")) - .andExpect(jsonPath("\$.detail", containsString(expectedDetail))) + assertThat(unalignedNucleotideSequences, hasEntry("header1", "AC")) } } diff --git a/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointTest.kt b/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointTest.kt index a533c503e5..c64b116fe0 100644 --- a/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointTest.kt @@ -164,65 +164,6 @@ class SubmitEndpointTest( .andExpect(status().isBadRequest) } - @Test - fun `GIVEN fasta data 
with unknown segment THEN data is not accepted`() { - submissionControllerClient.submit( - SubmitFiles.metadataFileWith( - content = """ - submissionId firstColumn - commonHeader someValue - """.trimIndent(), - ), - SubmitFiles.sequenceFileWith( - content = """ - >commonHeader_nonExistingSegmentName - AC - """.trimIndent(), - ), - organism = OTHER_ORGANISM, - groupId = groupId, - ) - .andExpect(status().isBadRequest) - .andExpect(content().contentType(APPLICATION_PROBLEM_JSON)) - .andExpect( - jsonPath( - "\$.detail", - ).value( - "The FASTA header commonHeader_nonExistingSegmentName ends with the segment name nonExistingSegmentName, which is not valid. Valid segment names: notOnlySegment, secondSegment", - ), - ) - } - - @Test - fun `GIVEN fasta data with empty segment THEN data is not accepted`() { - submissionControllerClient.submit( - SubmitFiles.metadataFileWith( - content = """ - submissionId firstColumn - commonHeader someValue - """.trimIndent(), - ), - SubmitFiles.sequenceFileWith( - content = """ - >commonHeader_notOnlySegment - AC - >commonHeader_secondSegment - """.trimIndent(), - ), - organism = OTHER_ORGANISM, - groupId = groupId, - ) - .andExpect(status().isUnprocessableEntity()) - .andExpect(content().contentType(APPLICATION_PROBLEM_JSON)) - .andExpect( - jsonPath( - "\$.detail", - ).value( - "No sequence data given for sample commonHeader_secondSegment.", - ), - ) - } - @Test fun `GIVEN fasta data with whitespace-only segment THEN data is not accepted`() { submissionControllerClient.submit( @@ -406,9 +347,15 @@ class SubmitEndpointTest( status().isBadRequest, "Bad Request", "${metadataFileTypes.displayName} has wrong extension. 
Must be " + - ".${metadataFileTypes.validExtensions.joinToString(", .")} for uncompressed submissions or " + ".${ - metadataFileTypes.getCompressedExtensions().filterKeys { it != CompressionAlgorithm.NONE } + metadataFileTypes.validExtensions.joinToString( + ", .", + ) + } for uncompressed submissions or " + + ".${ + metadataFileTypes.getCompressedExtensions().filterKeys { + it != CompressionAlgorithm.NONE + } .flatMap { it.value }.joinToString(", .") } for compressed submissions", DEFAULT_ORGANISM, @@ -421,9 +368,15 @@ class SubmitEndpointTest( status().isBadRequest, "Bad Request", "${sequenceFileTypes.displayName} has wrong extension. Must be " + - ".${sequenceFileTypes.validExtensions.joinToString(", .")} for uncompressed submissions or " + ".${ - sequenceFileTypes.getCompressedExtensions().filterKeys { it != CompressionAlgorithm.NONE } + sequenceFileTypes.validExtensions.joinToString( + ", .", + ) + } for uncompressed submissions or " + + ".${ + sequenceFileTypes.getCompressedExtensions().filterKeys { + it != CompressionAlgorithm.NONE + } .flatMap { it.value }.joinToString(", .") } for compressed submissions", DEFAULT_ORGANISM, @@ -536,27 +489,6 @@ class SubmitEndpointTest( DEFAULT_ORGANISM, DataUseTerms.Open, ), - Arguments.of( - "FASTA header misses segment name", - SubmitFiles.metadataFileWith( - content = """ - submissionId firstColumn - commonHeader someValue - """.trimIndent(), - ), - SubmitFiles.sequenceFileWith( - content = """ - >commonHeader - AC - """.trimIndent(), - ), - status().isBadRequest, - "Bad Request", - "The FASTA header commonHeader does not contain the segment name. 
Please provide the segment " + - "name in the format _", - OTHER_ORGANISM, - DataUseTerms.Open, - ), Arguments.of( "restricted use data with until date in the past", DefaultFiles.metadataFile, From 32ab96b616e84b02814750d096606a9da24a71dc Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Wed, 6 Aug 2025 16:54:32 +0200 Subject: [PATCH 02/26] feat(prepro): move segment validation to prepro --- .../src/loculus_preprocessing/prepro.py | 138 +++++++++++++----- 1 file changed, 99 insertions(+), 39 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 13f9eef580..7493708ff5 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -230,6 +230,92 @@ def run_sort( return alerts +def assign_segment( + input_unaligned_sequences: dict[str, NucleotideSequence | None], + unaligned_nucleotide_sequences: dict[SegmentName, NucleotideSequence | None], + errors: list[ProcessingAnnotation], + aligned_nucleotide_sequences: dict[SegmentName, NucleotideSequence | None], + config: Config, +): + valid_segments = set() + duplicate_segments = set() + if not config.multi_segment: + if len(input_unaligned_sequences) > 1: + errors.append( + ProcessingAnnotation.from_single( + ProcessingAnnotationAlignment, + AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + message=( + f"Multiple sequences: {list(input_unaligned_sequences.keys())} found in the" + f" input data, but organism: {config.organism} is single-segmented. " + "Please check that your metadata and sequences are annotated correctly." + "Each metadata entry should have a single corresponding fasta sequence " + "entry with the same submissionId." 
+ ), + ) + ) + else: + _, value = next(iter(input_unaligned_sequences.items())) + unaligned_nucleotide_sequences["main"] = value + aligned_nucleotide_sequences["main"] = None + return ( + unaligned_nucleotide_sequences, + aligned_nucleotide_sequences, + errors, + ) + for segment in config.nucleotideSequences: + unaligned_segment = [ + data + for data in input_unaligned_sequences + if re.match(r"[\w\d]+_" + segment + "$", data, re.IGNORECASE) + ] + if len(unaligned_segment) > 1: + duplicate_segments.update(unaligned_segment) + errors.append( + ProcessingAnnotation.from_single( + ProcessingAnnotationAlignment, + AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + message=( + f"Found multiple sequences with the same segment name: {segment}. " + "Each metadata entry can have multiple corresponding fasta sequence " + "entries with format _." + ), + ) + ) + elif len(unaligned_segment) == 1: + valid_segments.add(unaligned_segment[0]) + unaligned_nucleotide_sequences[segment] = input_unaligned_sequences[ + unaligned_segment[0] + ] + aligned_nucleotide_sequences[segment] = None + remaining_segments = set(input_unaligned_sequences.keys()) - valid_segments - duplicate_segments + if len(remaining_segments) > 0: + errors.append( + ProcessingAnnotation( + unprocessedFields=[ + AnnotationSource( + name="alignment", + type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + ), + ], + processedFields=[ + AnnotationSource( + name="alignment", + type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + ), + ], + message=( + f"Found sequences in the input data with segments that are not in the config: " + f"{', '.join(remaining_segments)}. " + "Each metadata entry can have multiple corresponding fasta sequence " + "entries with format _ valid segments are: " + f"{', '.join(config.nucleotideSequences)}." 
+ ), + ) + ) + return (unaligned_nucleotide_sequences, aligned_nucleotide_sequences, errors) + + def enrich_with_nextclade( # noqa: C901, PLR0912, PLR0914, PLR0915 unprocessed: Sequence[UnprocessedEntry], dataset_dir: str, config: Config ) -> dict[AccessionVersion, UnprocessedAfterNextclade]: @@ -268,45 +354,19 @@ def enrich_with_nextclade( # noqa: C901, PLR0912, PLR0914, PLR0915 aligned_nucleotide_sequences[id] = {} alerts.warnings[id] = [] alerts.errors[id] = [] - num_valid_segments = 0 - num_duplicate_segments = 0 - for segment in config.nucleotideSequences: - unaligned_segment = [ - data - for data in entry.data.unalignedNucleotideSequences - if re.match(segment + "$", data, re.IGNORECASE) - ] - if len(unaligned_segment) > 1: - num_duplicate_segments += len(unaligned_segment) - alerts.errors[id].append( - ProcessingAnnotation.from_single( - segment, - AnnotationSourceType.NUCLEOTIDE_SEQUENCE, - message="Found multiple sequences with the same segment name.", - ), - ) - elif len(unaligned_segment) == 1: - num_valid_segments += 1 - unaligned_nucleotide_sequences[id][segment] = ( - entry.data.unalignedNucleotideSequences[unaligned_segment[0]] - ) - aligned_nucleotide_sequences[id][segment] = None - if ( - len(entry.data.unalignedNucleotideSequences) - - num_valid_segments - - num_duplicate_segments - > 0 - ): - alerts.errors[id].append( - ProcessingAnnotation.from_single( - ProcessingAnnotationAlignment, - AnnotationSourceType.NUCLEOTIDE_SEQUENCE, - message=( - "Found unknown segments in the input data - " - "check your segments are annotated correctly." 
- ), - ), - ) + for gene in config.genes: + aligned_aminoacid_sequences[id][gene] = None + ( + unaligned_nucleotide_sequences[id], + aligned_nucleotide_sequences[id], + alerts.errors[id], + ) = assign_segment( + input_unaligned_sequences=entry.data.unalignedNucleotideSequences, + unaligned_nucleotide_sequences=unaligned_nucleotide_sequences[id], + errors=alerts.errors[id], + aligned_nucleotide_sequences=aligned_nucleotide_sequences[id], + config=config, + ) nextclade_metadata: defaultdict[ AccessionVersion, defaultdict[SegmentName, dict[str, Any] | None] From 8ab5d1449c237494b2783f8c88640e8031eca6e7 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Wed, 6 Aug 2025 17:03:28 +0200 Subject: [PATCH 03/26] feat(prepro): fix tests --- preprocessing/nextclade/src/loculus_preprocessing/prepro.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 7493708ff5..eac9e72973 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -240,6 +240,7 @@ def assign_segment( valid_segments = set() duplicate_segments = set() if not config.multi_segment: + aligned_nucleotide_sequences["main"] = None if len(input_unaligned_sequences) > 1: errors.append( ProcessingAnnotation.from_single( @@ -257,17 +258,20 @@ def assign_segment( else: _, value = next(iter(input_unaligned_sequences.items())) unaligned_nucleotide_sequences["main"] = value - aligned_nucleotide_sequences["main"] = None return ( unaligned_nucleotide_sequences, aligned_nucleotide_sequences, errors, ) for segment in config.nucleotideSequences: + aligned_nucleotide_sequences[segment] = None unaligned_segment = [ data for data in input_unaligned_sequences if re.match(r"[\w\d]+_" + segment + "$", data, re.IGNORECASE) + or re.match( + segment + "$", 
data, re.IGNORECASE + ) # backward compatibility allow only segment name in submission dict ] if len(unaligned_segment) > 1: duplicate_segments.update(unaligned_segment) From 6313f6bd9b1473928a6dfa991567545caf1887aa Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Wed, 6 Aug 2025 15:04:50 +0000 Subject: [PATCH 04/26] Update schema documentation based on migration changes --- backend/docs/db/schema.sql | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/docs/db/schema.sql b/backend/docs/db/schema.sql index 5bf4c965b8..e5b2f8271b 100644 --- a/backend/docs/db/schema.sql +++ b/backend/docs/db/schema.sql @@ -528,8 +528,8 @@ ALTER VIEW public.sequence_entries_view OWNER TO postgres; CREATE TABLE public.sequence_upload_aux_table ( upload_id text NOT NULL, submission_id text NOT NULL, - segment_name text NOT NULL, - compressed_sequence_data text NOT NULL + compressed_sequence_data text NOT NULL, + metadata_submission_id text ); @@ -718,7 +718,7 @@ ALTER TABLE ONLY public.sequence_entries_preprocessed_data -- ALTER TABLE ONLY public.sequence_upload_aux_table - ADD CONSTRAINT sequence_upload_aux_table_pkey PRIMARY KEY (upload_id, submission_id, segment_name); + ADD CONSTRAINT sequence_upload_aux_table_pkey PRIMARY KEY (upload_id, submission_id); -- From 05c8a6c858a536e3a1bff6afe25e9de73233cc6c Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Wed, 6 Aug 2025 17:08:12 +0200 Subject: [PATCH 05/26] format --- .../submission/UploadDatabaseService.kt | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt index e40e4f5e01..34db8acfb4 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt +++ 
b/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt @@ -111,15 +111,15 @@ class UploadDatabaseService( uploadedSequencesBatch: List, ) { SequenceUploadAuxTable.batchInsert(uploadedSequencesBatch) { - this[sequenceSubmissionIdColumn] = it.sampleName - this[metadataSubmissionIdColumn] = it.sampleName - this[sequenceUploadIdColumn] = uploadId - this[compressedSequenceDataColumn] = compressor.compressNucleotideSequence( - it.sequence, - it.sampleName, - submittedOrganism, - ) - } + this[sequenceSubmissionIdColumn] = it.sampleName + this[metadataSubmissionIdColumn] = it.sampleName + this[sequenceUploadIdColumn] = uploadId + this[compressedSequenceDataColumn] = compressor.compressNucleotideSequence( + it.sequence, + it.sampleName, + submittedOrganism, + ) + } } fun getMetadataUploadSubmissionIds(uploadId: String): List = MetadataUploadAuxTable From 9af182621d016c18be2c6a6353db1dc785673752 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Wed, 6 Aug 2025 19:53:07 +0200 Subject: [PATCH 06/26] fix prepro --- preprocessing/nextclade/src/loculus_preprocessing/prepro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index eac9e72973..5176ab6124 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -268,7 +268,7 @@ def assign_segment( unaligned_segment = [ data for data in input_unaligned_sequences - if re.match(r"[\w\d]+_" + segment + "$", data, re.IGNORECASE) + if re.match(r"[A-Za-z0-9]+_" + segment + "$", data, re.IGNORECASE) or re.match( segment + "$", data, re.IGNORECASE ) # backward compatibility allow only segment name in submission dict From 5cb0014bc66a6dada342cf9510295aa604839ece Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" 
<50943381+anna-parker@users.noreply.github.com> Date: Wed, 6 Aug 2025 20:00:29 +0200 Subject: [PATCH 07/26] retry --- preprocessing/nextclade/src/loculus_preprocessing/prepro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 5176ab6124..5468bfdf2a 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -268,7 +268,7 @@ def assign_segment( unaligned_segment = [ data for data in input_unaligned_sequences - if re.match(r"[A-Za-z0-9]+_" + segment + "$", data, re.IGNORECASE) + if re.match(segment + "$", data.split("_", 1)[-1], re.IGNORECASE) or re.match( segment + "$", data, re.IGNORECASE ) # backward compatibility allow only segment name in submission dict From 01418ef121067e6d36754cfa38a465a19ad18b64 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Wed, 6 Aug 2025 20:04:01 +0200 Subject: [PATCH 08/26] again --- preprocessing/nextclade/src/loculus_preprocessing/prepro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 5468bfdf2a..b39e07771b 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -268,7 +268,7 @@ def assign_segment( unaligned_segment = [ data for data in input_unaligned_sequences - if re.match(segment + "$", data.split("_", 1)[-1], re.IGNORECASE) + if re.match(segment + "$", data.split("_")[-1], re.IGNORECASE) or re.match( segment + "$", data, re.IGNORECASE ) # backward compatibility allow only segment name in submission dict From c9c4b67690e18c3bfb37b6654a86aa3005d3d4e1 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" 
<50943381+anna-parker@users.noreply.github.com> Date: Mon, 11 Aug 2025 08:54:12 +0200 Subject: [PATCH 09/26] feat(prepro,backend): fix merge conflicts --- .../submission/UploadDatabaseService.kt | 23 +++++++++++-------- .../src/loculus_preprocessing/prepro.py | 3 --- .../tests/test_nextclade_preprocessing.py | 4 ++-- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt index 34db8acfb4..70b6ad1e33 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt @@ -110,16 +110,19 @@ class UploadDatabaseService( submittedOrganism: Organism, uploadedSequencesBatch: List, ) { - SequenceUploadAuxTable.batchInsert(uploadedSequencesBatch) { - this[sequenceSubmissionIdColumn] = it.sampleName - this[metadataSubmissionIdColumn] = it.sampleName - this[sequenceUploadIdColumn] = uploadId - this[compressedSequenceDataColumn] = compressor.compressNucleotideSequence( - it.sequence, - it.sampleName, - submittedOrganism, - ) - } + uploadedSequencesBatch.chunkedForDatabase({ batch -> + SequenceUploadAuxTable.batchInsert(batch) { + this[sequenceSubmissionIdColumn] = it.sampleName + this[segmentNameColumn] = it.sampleName + this[sequenceUploadIdColumn] = uploadId + this[compressedSequenceDataColumn] = compressor.compressNucleotideSequence( + it.sequence, + it.sampleName, + submittedOrganism, + ) + } + emptyList() + }, SEQUENCE_INSERT_COLUMNS) } fun getMetadataUploadSubmissionIds(uploadId: String): List = MetadataUploadAuxTable diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index b39e07771b..c21e59cadc 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ 
b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -264,7 +264,6 @@ def assign_segment( errors, ) for segment in config.nucleotideSequences: - aligned_nucleotide_sequences[segment] = None unaligned_segment = [ data for data in input_unaligned_sequences @@ -358,8 +357,6 @@ def enrich_with_nextclade( # noqa: C901, PLR0912, PLR0914, PLR0915 aligned_nucleotide_sequences[id] = {} alerts.warnings[id] = [] alerts.errors[id] = [] - for gene in config.genes: - aligned_aminoacid_sequences[id][gene] = None ( unaligned_nucleotide_sequences[id], aligned_nucleotide_sequences[id], diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py index 5c4fd621f1..067fe4def1 100644 --- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py +++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py @@ -622,8 +622,8 @@ def test_preprocessing_without_metadata() -> None: submittedAt=ts_from_ymd(2021, 12, 15), metadata={}, unalignedNucleotideSequences={ - "ebola-sudan": sequence_with_mutation("ebola-sudan"), - "ebola-zaire": sequence_with_mutation("ebola-zaire"), + "test_NIHPAK-19_ebola-sudan": sequence_with_mutation("ebola-sudan"), + "test_NIHPAK-19_ebola-zaire": sequence_with_mutation("ebola-zaire"), }, ), ) From 48cbc4737ca1de7476e134c8ac8f254ef18f3e9b Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Mon, 11 Aug 2025 09:06:08 +0200 Subject: [PATCH 10/26] feat(backend): fix merge conflict --- .../loculus/backend/service/submission/UploadDatabaseService.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt index 70b6ad1e33..4ccb0a9689 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt 
+++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt @@ -113,7 +113,7 @@ class UploadDatabaseService( uploadedSequencesBatch.chunkedForDatabase({ batch -> SequenceUploadAuxTable.batchInsert(batch) { this[sequenceSubmissionIdColumn] = it.sampleName - this[segmentNameColumn] = it.sampleName + this[metadataSubmissionIdColumn] = it.sampleName this[sequenceUploadIdColumn] = uploadId this[compressedSequenceDataColumn] = compressor.compressNucleotideSequence( it.sequence, From 458ce5fdbe82af1e3a25dbfcb2d001f4234df4e1 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Mon, 11 Aug 2025 15:11:21 +0200 Subject: [PATCH 11/26] feat(website): try to fix revisions --- website/src/components/Edit/SequencesForm.tsx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/website/src/components/Edit/SequencesForm.tsx b/website/src/components/Edit/SequencesForm.tsx index 3af8c91a2b..1ff31794c3 100644 --- a/website/src/components/Edit/SequencesForm.tsx +++ b/website/src/components/Edit/SequencesForm.tsx @@ -52,14 +52,14 @@ export class EditableSequences { */ static fromInitialData(initialData: SequenceEntryToEdit, segmentNames: string[]): EditableSequences { const emptyRows = this.emptyRows(segmentNames); - const existingDataRows = Object.entries(initialData.originalData.unalignedNucleotideSequences).map( - ([key, value]) => ({ + const existingDataRows = Object.entries(initialData.processedData.unalignedNucleotideSequences) + .filter(([_, value]) => value !== null) + .map(([key, value]) => ({ key, - initialValue: value, - value: value, + initialValue: value!.toString(), + value: value!.toString(), ...mapErrorsAndWarnings(initialData, key, 'NucleotideSequence'), - }), - ); + })); const mergedRows: Row[] = []; // merge in this way to retain the order of segment names as they were given. 
emptyRows.forEach((row) => { From 2a4db349c96c7a863fc8e62cb3c595df0115af0d Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Mon, 11 Aug 2025 15:17:07 +0200 Subject: [PATCH 12/26] feat(prepro): clean up more --- .../src/loculus_preprocessing/prepro.py | 26 ++++--------------- .../tests/test_nextclade_preprocessing.py | 8 +++--- 2 files changed, 8 insertions(+), 26 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index c21e59cadc..300f7cdcc1 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -234,13 +234,11 @@ def assign_segment( input_unaligned_sequences: dict[str, NucleotideSequence | None], unaligned_nucleotide_sequences: dict[SegmentName, NucleotideSequence | None], errors: list[ProcessingAnnotation], - aligned_nucleotide_sequences: dict[SegmentName, NucleotideSequence | None], config: Config, ): valid_segments = set() duplicate_segments = set() if not config.multi_segment: - aligned_nucleotide_sequences["main"] = None if len(input_unaligned_sequences) > 1: errors.append( ProcessingAnnotation.from_single( @@ -260,7 +258,6 @@ def assign_segment( unaligned_nucleotide_sequences["main"] = value return ( unaligned_nucleotide_sequences, - aligned_nucleotide_sequences, errors, ) for segment in config.nucleotideSequences: @@ -290,23 +287,12 @@ def assign_segment( unaligned_nucleotide_sequences[segment] = input_unaligned_sequences[ unaligned_segment[0] ] - aligned_nucleotide_sequences[segment] = None remaining_segments = set(input_unaligned_sequences.keys()) - valid_segments - duplicate_segments if len(remaining_segments) > 0: errors.append( - ProcessingAnnotation( - unprocessedFields=[ - AnnotationSource( - name="alignment", - type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE, - ), - ], - processedFields=[ - AnnotationSource( - 
name="alignment", - type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE, - ), - ], + ProcessingAnnotation.from_single( + ProcessingAnnotationAlignment, + AnnotationSourceType.NUCLEOTIDE_SEQUENCE, message=( f"Found sequences in the input data with segments that are not in the config: " f"{', '.join(remaining_segments)}. " @@ -316,10 +302,10 @@ def assign_segment( ), ) ) - return (unaligned_nucleotide_sequences, aligned_nucleotide_sequences, errors) + return (unaligned_nucleotide_sequences, errors) -def enrich_with_nextclade( # noqa: C901, PLR0912, PLR0914, PLR0915 +def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915 unprocessed: Sequence[UnprocessedEntry], dataset_dir: str, config: Config ) -> dict[AccessionVersion, UnprocessedAfterNextclade]: """ @@ -359,13 +345,11 @@ def enrich_with_nextclade( # noqa: C901, PLR0912, PLR0914, PLR0915 alerts.errors[id] = [] ( unaligned_nucleotide_sequences[id], - aligned_nucleotide_sequences[id], alerts.errors[id], ) = assign_segment( input_unaligned_sequences=entry.data.unalignedNucleotideSequences, unaligned_nucleotide_sequences=unaligned_nucleotide_sequences[id], errors=alerts.errors[id], - aligned_nucleotide_sequences=aligned_nucleotide_sequences[id], config=config, ) diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py index 067fe4def1..1b714271fd 100644 --- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py +++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py @@ -213,7 +213,7 @@ def invalid_sequence() -> str: expected_warnings=[], expected_processed_alignment=ProcessedAlignment( unalignedNucleotideSequences={"main": invalid_sequence()}, - alignedNucleotideSequences={"main": None}, + alignedNucleotideSequences={}, nucleotideInsertions={}, alignedAminoAcidSequences={}, aminoAcidInsertions={}, @@ -420,7 +420,7 @@ def invalid_sequence() -> str: unalignedNucleotideSequences={ "ebola-sudan": invalid_sequence(), }, - 
alignedNucleotideSequences={"ebola-sudan": None}, + alignedNucleotideSequences={}, nucleotideInsertions={}, alignedAminoAcidSequences={}, aminoAcidInsertions={}, @@ -459,7 +459,6 @@ def invalid_sequence() -> str: "ebola-zaire": sequence_with_mutation("ebola-zaire"), }, alignedNucleotideSequences={ - "ebola-sudan": None, "ebola-zaire": sequence_with_mutation("ebola-zaire"), }, nucleotideInsertions={}, @@ -508,7 +507,7 @@ def invalid_sequence() -> str: unalignedNucleotideSequences={ "ebola-sudan": invalid_sequence(), }, - alignedNucleotideSequences={"ebola-sudan": None}, + alignedNucleotideSequences={}, nucleotideInsertions={}, alignedAminoAcidSequences={}, aminoAcidInsertions={}, @@ -547,7 +546,6 @@ def invalid_sequence() -> str: "ebola-zaire": sequence_with_mutation("ebola-zaire"), }, alignedNucleotideSequences={ - "ebola-sudan": None, "ebola-zaire": sequence_with_mutation("ebola-zaire"), }, nucleotideInsertions={}, From 28832911d7b9785846544457607828550f8a2258 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Mon, 11 Aug 2025 15:45:09 +0200 Subject: [PATCH 13/26] feat(prepro): add tests --- .../src/loculus_preprocessing/prepro.py | 2 +- .../tests/test_nextclade_preprocessing.py | 95 ++++++++++++++++++- 2 files changed, 94 insertions(+), 3 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 300f7cdcc1..143628c1e6 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -533,7 +533,7 @@ def add_input_metadata( nextclade_prefix = "nextclade." 
if input_path.startswith(nextclade_prefix): segment = str(spec.args["segment"]) if spec.args and "segment" in spec.args else "main" - if not unprocessed.nextcladeMetadata: + if not unprocessed.nextcladeMetadata and unprocessed.unalignedNucleotideSequences: # This field should never be empty message = ( "An unknown internal error occurred while aligning sequences, " diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py index 1b714271fd..74b227f4ef 100644 --- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py +++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py @@ -558,6 +558,93 @@ def invalid_sequence() -> str: ), ] +segment_validation_tests_multi_segments = [ + Case( + name="don't allow multiple segments with the same name", + input_metadata={}, + input_sequence={ + "ebola-sudan": invalid_sequence(), + "duplicate_ebola-sudan": invalid_sequence(), + }, + accession_id="1", + expected_metadata={ + "totalInsertedNucs_ebola-sudan": None, + "totalSnps_ebola-sudan": None, + "totalDeletedNucs_ebola-sudan": None, + "length_ebola-sudan": 0, + "totalInsertedNucs_ebola-zaire": None, + "totalSnps_ebola-zaire": None, + "totalDeletedNucs_ebola-zaire": None, + "length_ebola-zaire": 0, + }, + expected_errors=[ + ProcessingAnnotationHelper( + [ProcessingAnnotationAlignment], + [ProcessingAnnotationAlignment], + "Found multiple sequences with the same segment name: ebola-sudan. 
" + "Each metadata entry can have multiple corresponding fasta sequence " + "entries with format _.", + AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + ), + ProcessingAnnotationHelper( + [ProcessingAnnotationAlignment], + [ProcessingAnnotationAlignment], + "No segment aligned.", + AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + ) + ], + expected_warnings=[], + expected_processed_alignment=ProcessedAlignment( + unalignedNucleotideSequences={}, + alignedNucleotideSequences={}, + nucleotideInsertions={}, + alignedAminoAcidSequences={}, + aminoAcidInsertions={}, + ), + ), + Case( + name="don't allow unknown segments", + input_metadata={}, + input_sequence={ + "ebola-sudan": sequence_with_mutation("ebola-sudan"), + "unknown_segment": invalid_sequence(), + }, + accession_id="1", + expected_metadata={ + "totalInsertedNucs_ebola-sudan": 0, + "totalSnps_ebola-sudan": 1, + "totalDeletedNucs_ebola-sudan": 0, + "length_ebola-sudan": len(consensus_sequence("ebola-sudan")), + "totalInsertedNucs_ebola-zaire": None, + "totalSnps_ebola-zaire": None, + "totalDeletedNucs_ebola-zaire": None, + "length_ebola-zaire": 0, + }, + expected_errors=[ + ProcessingAnnotationHelper( + [ProcessingAnnotationAlignment], + [ProcessingAnnotationAlignment], + "Found sequences in the input data with segments that are not in the config: " + "unknown_segment. 
Each metadata entry can have multiple corresponding fasta sequence " + "entries with format _ valid segments are: " + "ebola-sudan, ebola-zaire.", + AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + ) + ], + expected_warnings=[], + expected_processed_alignment=ProcessedAlignment( + unalignedNucleotideSequences={"ebola-sudan": sequence_with_mutation("ebola-sudan")}, + alignedNucleotideSequences={"ebola-sudan": sequence_with_mutation("ebola-sudan")}, + nucleotideInsertions={}, + alignedAminoAcidSequences={ + "NPEbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "NP"), + "VP35EbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "VP35"), + }, + aminoAcidInsertions={}, + ), + ), +] + def process_single_entry( test_case: ProcessingTestCase, config: Config, dataset_dir: str = "temp" @@ -581,7 +668,9 @@ def test_preprocessing_single_segment(test_case_def: Case): @pytest.mark.parametrize( "test_case_def", - multi_segment_case_definitions + multi_segment_case_definitions_all_requirement, + multi_segment_case_definitions + + segment_validation_tests_multi_segments + + multi_segment_case_definitions_all_requirement, ids=lambda tc: f"multi segment {tc.name}", ) def test_preprocessing_multi_segment_all_requirement(test_case_def: Case): @@ -596,7 +685,9 @@ def test_preprocessing_multi_segment_all_requirement(test_case_def: Case): @pytest.mark.parametrize( "test_case_def", - multi_segment_case_definitions + multi_segment_case_definitions_any_requirement, + multi_segment_case_definitions + + segment_validation_tests_multi_segments + + multi_segment_case_definitions_any_requirement, ids=lambda tc: f"multi segment {tc.name}", ) def test_preprocessing_multi_segment_any_requirement(test_case_def: Case): From fbee2332b0e0e2e47f9497aa8d1adf3cd8149f5e Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Mon, 11 Aug 2025 15:45:20 +0200 Subject: [PATCH 14/26] feat(prepro): increase timeout --- 
integration-tests/tests/pages/review.page.ts | 2 +- preprocessing/nextclade/src/loculus_preprocessing/prepro.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/integration-tests/tests/pages/review.page.ts b/integration-tests/tests/pages/review.page.ts index 8a50b6d63e..079037bc05 100644 --- a/integration-tests/tests/pages/review.page.ts +++ b/integration-tests/tests/pages/review.page.ts @@ -22,7 +22,7 @@ export class ReviewPage { async waitForZeroProcessing() { await expect(this.page.locator('[data-testid="review-page-control-panel"]')).toContainText( '0 awaiting processing', - { timeout: 60000 }, + { timeout: 100000 }, ); } diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 143628c1e6..351ad22921 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -546,7 +546,7 @@ def add_input_metadata( ) return None sub_path = input_path[len(nextclade_prefix) :] - if segment in unprocessed.nextcladeMetadata: + if unprocessed.nextcladeMetadata and segment in unprocessed.nextcladeMetadata: if not unprocessed.nextcladeMetadata[segment]: message = ( "Nucleotide sequence failed to align" From 28c4b46d948c3c596b048445ebcae7b0631e9b1e Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Mon, 11 Aug 2025 16:03:47 +0200 Subject: [PATCH 15/26] feat(prepro): add more tests --- .../tests/test_nextclade_preprocessing.py | 115 +++++++++++++++++- 1 file changed, 110 insertions(+), 5 deletions(-) diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py index 74b227f4ef..ba10a24c3d 100644 --- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py +++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py @@ -558,7 +558,110 @@ def invalid_sequence() 
-> str: ), ] +segment_validation_tests_single_segment = [ + Case( + name="accept any key for single segment", + input_metadata={}, + input_sequence={"randomKey": sequence_with_mutation("single")}, + accession_id="1", + expected_metadata={ + "completeness": 1.0, + "totalInsertedNucs": 0, + "totalSnps": 1, + "totalDeletedNucs": 0, + "length": len(consensus_sequence("single")), + }, + expected_errors=[], + expected_warnings=[], + expected_processed_alignment=ProcessedAlignment( + unalignedNucleotideSequences={"main": sequence_with_mutation("single")}, + alignedNucleotideSequences={"main": sequence_with_mutation("single")}, + nucleotideInsertions={}, + alignedAminoAcidSequences={ + "NPEbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "NP"), + "VP35EbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "VP35"), + }, + aminoAcidInsertions={}, + ), + ), + Case( + name="do not accept multiple segments for single segment", + input_metadata={}, + input_sequence={ + "main": sequence_with_mutation("single"), + "randomKey": sequence_with_mutation("single"), + }, + accession_id="2", + expected_metadata={ + "completeness": None, + "totalInsertedNucs": None, + "totalSnps": None, + "totalDeletedNucs": None, + "length": 0, + }, + expected_errors=[ + ProcessingAnnotationHelper( + [ProcessingAnnotationAlignment], + [ProcessingAnnotationAlignment], + "Multiple sequences: ['main', 'randomKey'] found in the" + " input data, but organism: ebola-sudan-test is single-segmented. " + "Please check that your metadata and sequences are annotated correctly." 
+ "Each metadata entry should have a single corresponding fasta sequence " + "entry with the same submissionId.", + AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + ), + ], + expected_warnings=[], + expected_processed_alignment=ProcessedAlignment( + unalignedNucleotideSequences={}, + alignedNucleotideSequences={}, + nucleotideInsertions={}, + alignedAminoAcidSequences={}, + aminoAcidInsertions={}, + ), + ), +] + segment_validation_tests_multi_segments = [ + Case( + name="accept any prefix for multi-segment", + input_metadata={}, + input_sequence={ + "prefix_ebola-sudan": sequence_with_mutation("ebola-sudan"), + "other_prefix_ebola-zaire": sequence_with_mutation("ebola-zaire"), + }, + accession_id="1", + expected_metadata={ + "totalInsertedNucs_ebola-sudan": 0, + "totalSnps_ebola-sudan": 1, + "totalDeletedNucs_ebola-sudan": 0, + "length_ebola-sudan": len(consensus_sequence("ebola-sudan")), + "totalInsertedNucs_ebola-zaire": 0, + "totalSnps_ebola-zaire": 1, + "totalDeletedNucs_ebola-zaire": 0, + "length_ebola-zaire": len(consensus_sequence("ebola-zaire")), + }, + expected_errors=[], + expected_warnings=[], + expected_processed_alignment=ProcessedAlignment( + unalignedNucleotideSequences={ + "ebola-sudan": sequence_with_mutation("ebola-sudan"), + "ebola-zaire": sequence_with_mutation("ebola-zaire"), + }, + alignedNucleotideSequences={ + "ebola-sudan": sequence_with_mutation("ebola-sudan"), + "ebola-zaire": sequence_with_mutation("ebola-zaire"), + }, + nucleotideInsertions={}, + alignedAminoAcidSequences={ + "NPEbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "NP"), + "VP35EbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "VP35"), + "VP24EbolaZaire": ebola_zaire_aa(sequence_with_mutation("ebola-zaire"), "VP24"), + "LEbolaZaire": ebola_zaire_aa(sequence_with_mutation("ebola-zaire"), "L"), + }, + aminoAcidInsertions={}, + ), + ), Case( name="don't allow multiple segments with the same name", input_metadata={}, @@ -591,7 +694,7 @@ def 
invalid_sequence() -> str: [ProcessingAnnotationAlignment], "No segment aligned.", AnnotationSourceType.NUCLEOTIDE_SEQUENCE, - ) + ), ], expected_warnings=[], expected_processed_alignment=ProcessedAlignment( @@ -609,7 +712,7 @@ def invalid_sequence() -> str: "ebola-sudan": sequence_with_mutation("ebola-sudan"), "unknown_segment": invalid_sequence(), }, - accession_id="1", + accession_id="2", expected_metadata={ "totalInsertedNucs_ebola-sudan": 0, "totalSnps_ebola-sudan": 1, @@ -654,7 +757,9 @@ def process_single_entry( @pytest.mark.parametrize( - "test_case_def", single_segment_case_definitions, ids=lambda tc: f"single segment {tc.name}" + "test_case_def", + single_segment_case_definitions + segment_validation_tests_single_segment, + ids=lambda tc: f"single segment {tc.name}", ) def test_preprocessing_single_segment(test_case_def: Case): config = get_config(SINGLE_SEGMENT_CONFIG, ignore_args=True) @@ -711,8 +816,8 @@ def test_preprocessing_without_metadata() -> None: submittedAt=ts_from_ymd(2021, 12, 15), metadata={}, unalignedNucleotideSequences={ - "test_NIHPAK-19_ebola-sudan": sequence_with_mutation("ebola-sudan"), - "test_NIHPAK-19_ebola-zaire": sequence_with_mutation("ebola-zaire"), + "ebola-sudan": sequence_with_mutation("ebola-sudan"), + "ebola-zaire": sequence_with_mutation("ebola-zaire"), }, ), ) From 5fd624253f255e256a00b62bcc837c416259381f Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Mon, 11 Aug 2025 16:31:19 +0200 Subject: [PATCH 16/26] feat(backend): improve migration --- .../submission/SequenceUploadAuxTable.kt | 2 +- .../V1.16__rename_aux_table_columns.sql | 18 +++++++++-- .../submission/SubmitEndpointTest.kt | 30 +++++++++++++++++++ 3 files changed, 47 insertions(+), 3 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/SequenceUploadAuxTable.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/SequenceUploadAuxTable.kt index 
73e973f323..808663a030 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/SequenceUploadAuxTable.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/SequenceUploadAuxTable.kt @@ -7,7 +7,7 @@ const val SEQUENCE_UPLOAD_AUX_TABLE_NAME = "sequence_upload_aux_table" object SequenceUploadAuxTable : Table(SEQUENCE_UPLOAD_AUX_TABLE_NAME) { val sequenceUploadIdColumn = varchar("upload_id", 255) - val sequenceSubmissionIdColumn = varchar("submission_id", 255) + val sequenceSubmissionIdColumn = varchar("sequence_submission_id", 255) val metadataSubmissionIdColumn = varchar("metadata_submission_id", 255) val compressedSequenceDataColumn = jacksonSerializableJsonb("compressed_sequence_data") diff --git a/backend/src/main/resources/db/migration/V1.16__rename_aux_table_columns.sql b/backend/src/main/resources/db/migration/V1.16__rename_aux_table_columns.sql index 69b8c2e351..a785a6a2f8 100644 --- a/backend/src/main/resources/db/migration/V1.16__rename_aux_table_columns.sql +++ b/backend/src/main/resources/db/migration/V1.16__rename_aux_table_columns.sql @@ -1,6 +1,20 @@ alter table sequence_upload_aux_table drop CONSTRAINT sequence_upload_aux_table_pkey; -alter table sequence_upload_aux_table drop column segment_name; +alter table sequence_upload_aux_table add column sequence_submission_id text; alter table sequence_upload_aux_table add column metadata_submission_id text; -alter table sequence_upload_aux_table add PRIMARY KEY (upload_id, submission_id); +update sequence_upload_aux_table +set metadata_submission_id = submission_id +where metadata_submission_id is null; + +update sequence_upload_aux_table +set sequence_submission_id = case + when segment_name is NULL or segment_name = '' then submission_id + else submission_id || '_' || segment_name +end +where sequence_submission_id is NULL; + +alter table sequence_upload_aux_table drop column segment_name; +alter table sequence_upload_aux_table drop column submission_id; + +alter table 
sequence_upload_aux_table add PRIMARY KEY (upload_id, sequence_submission_id); diff --git a/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointTest.kt b/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointTest.kt index c64b116fe0..b6ba724c0f 100644 --- a/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointTest.kt @@ -164,6 +164,36 @@ class SubmitEndpointTest( .andExpect(status().isBadRequest) } + @Test + fun `GIVEN fasta data with empty segment THEN data is not accepted`() { + submissionControllerClient.submit( + SubmitFiles.metadataFileWith( + content = """ + submissionId firstColumn + commonHeader someValue + """.trimIndent(), + ), + SubmitFiles.sequenceFileWith( + content = """ + >commonHeader_notOnlySegment + AC + >commonHeader_secondSegment + """.trimIndent(), + ), + organism = OTHER_ORGANISM, + groupId = groupId, + ) + .andExpect(status().isUnprocessableEntity()) + .andExpect(content().contentType(APPLICATION_PROBLEM_JSON)) + .andExpect( + jsonPath( + "\$.detail", + ).value( + "No sequence data given for sample commonHeader_secondSegment.", + ), + ) + } + @Test fun `GIVEN fasta data with whitespace-only segment THEN data is not accepted`() { submissionControllerClient.submit( From af93e19189b5b0543d30568bb62a1db1f62be154 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Mon, 11 Aug 2025 14:41:02 +0000 Subject: [PATCH 17/26] Update schema documentation based on migration changes --- backend/docs/db/schema.sql | 4 ++-- .../backend/service/submission/UploadDatabaseService.kt | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/docs/db/schema.sql b/backend/docs/db/schema.sql index e5b2f8271b..8f7e8346b4 100644 --- a/backend/docs/db/schema.sql +++ b/backend/docs/db/schema.sql @@ -527,8 +527,8 @@ ALTER VIEW public.sequence_entries_view OWNER TO postgres; CREATE 
TABLE public.sequence_upload_aux_table ( upload_id text NOT NULL, - submission_id text NOT NULL, compressed_sequence_data text NOT NULL, + sequence_submission_id text NOT NULL, metadata_submission_id text ); @@ -718,7 +718,7 @@ ALTER TABLE ONLY public.sequence_entries_preprocessed_data -- ALTER TABLE ONLY public.sequence_upload_aux_table - ADD CONSTRAINT sequence_upload_aux_table_pkey PRIMARY KEY (upload_id, submission_id); + ADD CONSTRAINT sequence_upload_aux_table_pkey PRIMARY KEY (upload_id, sequence_submission_id); -- diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt index 4ccb0a9689..b11fdaf642 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt @@ -173,9 +173,9 @@ class UploadDatabaseService( 'unalignedNucleotideSequences', COALESCE( jsonb_object_agg( - sequence_upload_aux_table.submission_id, + sequence_upload_aux_table.sequence_submission_id, sequence_upload_aux_table.compressed_sequence_data::jsonb - ) FILTER (WHERE sequence_upload_aux_table.submission_id IS NOT NULL), + ) FILTER (WHERE sequence_upload_aux_table.sequence_submission_id IS NOT NULL), '{}'::jsonb ) ) From d182c08a6a1dc39d618aa10e88f2febf44a6891a Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Mon, 11 Aug 2025 16:50:48 +0200 Subject: [PATCH 18/26] feat(kotlin): correctly define fields --- .../backend/service/submission/SequenceUploadAuxTable.kt | 2 +- .../loculus/backend/service/submission/UploadDatabaseService.kt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/SequenceUploadAuxTable.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/SequenceUploadAuxTable.kt 
index 808663a030..e17006b49a 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/SequenceUploadAuxTable.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/SequenceUploadAuxTable.kt @@ -8,7 +8,7 @@ const val SEQUENCE_UPLOAD_AUX_TABLE_NAME = "sequence_upload_aux_table" object SequenceUploadAuxTable : Table(SEQUENCE_UPLOAD_AUX_TABLE_NAME) { val sequenceUploadIdColumn = varchar("upload_id", 255) val sequenceSubmissionIdColumn = varchar("sequence_submission_id", 255) - val metadataSubmissionIdColumn = varchar("metadata_submission_id", 255) + val metadataSubmissionIdColumn = varchar("metadata_submission_id", 255).nullable() val compressedSequenceDataColumn = jacksonSerializableJsonb("compressed_sequence_data") override val primaryKey = PrimaryKey(sequenceUploadIdColumn, sequenceSubmissionIdColumn) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt index b11fdaf642..c1358b41c3 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt @@ -113,7 +113,7 @@ class UploadDatabaseService( uploadedSequencesBatch.chunkedForDatabase({ batch -> SequenceUploadAuxTable.batchInsert(batch) { this[sequenceSubmissionIdColumn] = it.sampleName - this[metadataSubmissionIdColumn] = it.sampleName + this[metadataSubmissionIdColumn] = null this[sequenceUploadIdColumn] = uploadId this[compressedSequenceDataColumn] = compressor.compressNucleotideSequence( it.sequence, From 4a950cf2784c58bd7fdea6d35649d9b2a43018af Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Mon, 11 Aug 2025 17:19:46 +0200 Subject: [PATCH 19/26] feat(backend): add tests for edge case --- .../org/loculus/backend/model/SubmitModel.kt | 20 +++++++++++-- 
.../SubmitEndpointSingleSegmentedTest.kt | 29 +++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt b/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt index c6b9cbba5c..aba0c07968 100644 --- a/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt +++ b/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt @@ -364,6 +364,7 @@ class SubmitModel( private fun mapMetadataKeysToSequenceKeys(metadataKeysSet: Set, sequenceKeysSet: Set) { val metadataKeyToSequences = mutableMapOf>() val unmatchedSequenceKeys = mutableSetOf() + val ambiguousSequenceKeys = mutableMapOf>() for (seqKey in sequenceKeysSet) { val baseKey = seqKey.removeSuffixPattern() @@ -373,6 +374,10 @@ class SubmitModel( else -> null } + if ((seqKey != baseKey) && metadataKeysSet.contains(seqKey) && metadataKeysSet.contains(baseKey)) { + ambiguousSequenceKeys[seqKey] = listOf(seqKey, baseKey) + } + if (matchedMetadataKey != null) { metadataKeyToSequences.computeIfAbsent(matchedMetadataKey) { mutableListOf() }.add(seqKey) } else { @@ -382,7 +387,9 @@ class SubmitModel( val metadataKeysWithoutSequences = metadataKeysSet.filterNot { metadataKeyToSequences.containsKey(it) } - if (unmatchedSequenceKeys.isNotEmpty() || metadataKeysWithoutSequences.isNotEmpty()) { + if (unmatchedSequenceKeys.isNotEmpty() || metadataKeysWithoutSequences.isNotEmpty() || + ambiguousSequenceKeys.isNotEmpty() + ) { val unmatchedSeqText = if (unmatchedSequenceKeys.isNotEmpty()) { "Sequence file contains ${unmatchedSequenceKeys.size} ids that are not present in the metadata file: ${ unmatchedSequenceKeys.joinToString(limit = 10) @@ -396,7 +403,16 @@ class SubmitModel( } else { "" } - throw UnprocessableEntityException(unmatchedSeqText + unmatchedMetadataText) + val ambiguousSequenceText = if (ambiguousSequenceKeys.isNotEmpty()) { + "Sequence file contains ${ambiguousSequenceKeys.size} ids that could be matched 
to multiple metadata " + + "keys, e.g. ${ambiguousSequenceKeys.entries.joinToString(limit = 3) { (key, value) -> + "Sequence key: $key matches $value" + }} " + + "- to avoid future issues we recommend not using the separator `_` in your metadata submissionIds;" + } else { + "" + } + throw UnprocessableEntityException(unmatchedSeqText + unmatchedMetadataText + ambiguousSequenceText) } transaction { diff --git a/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointSingleSegmentedTest.kt b/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointSingleSegmentedTest.kt index 5fa453dae9..ccae2cc372 100644 --- a/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointSingleSegmentedTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointSingleSegmentedTest.kt @@ -62,4 +62,33 @@ class SubmitEndpointSingleSegmentedTest( assertThat(unalignedNucleotideSequences, hasEntry("header1", "AC")) } + + @Test + fun `GIVEN input data with ambiguous submissionIds THEN data is rejected`() { + val groupId = groupManagementClient.createNewGroup().andGetGroupId() + val expectedDetail = "Sequence file contains 1 ids that could be matched to multiple metadata keys" + + ", e.g. 
Sequence key: header1_2 matches [header1_2, header1]" + + submissionControllerClient.submit( + SubmitFiles.metadataFileWith( + content = """ + submissionId firstColumn + header1 someValue + header1_2 someValue + """.trimIndent(), + ), + SubmitFiles.sequenceFileWith( + content = """ + >header1_2 + AC + >header1 + AC + """.trimIndent(), + ), + groupId = groupId, + ) + .andExpect(status().isUnprocessableEntity) + .andExpect(jsonPath("\$.title").value("Unprocessable Entity")) + .andExpect(jsonPath("\$.detail", containsString(expectedDetail))) + } } From b76d1aa7d9d926bb15c8bd5780c8dab5d1a1a3ad Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Mon, 11 Aug 2025 17:33:18 +0200 Subject: [PATCH 20/26] double timeout while I investigate --- .../tests/specs/features/search/lineage-field.spec.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration-tests/tests/specs/features/search/lineage-field.spec.ts b/integration-tests/tests/specs/features/search/lineage-field.spec.ts index 6d31683df6..bfe8b6d373 100644 --- a/integration-tests/tests/specs/features/search/lineage-field.spec.ts +++ b/integration-tests/tests/specs/features/search/lineage-field.spec.ts @@ -7,7 +7,7 @@ import { v4 as uuidv4 } from 'uuid'; const SEQUENCE = 'ATTGATCTCATCATTT'; test('Lineage field lineage counts', async ({ page, pageWithGroup }) => { - test.setTimeout(95_000); + test.setTimeout(100_000); const uuid = uuidv4(); await page.goto('/'); From 477ffa30178ef109d6720326f1fe64bfccef74a9 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Mon, 11 Aug 2025 17:49:39 +0200 Subject: [PATCH 21/26] feat(backend): only change for multi-segmented organisms --- .../org/loculus/backend/model/SubmitModel.kt | 24 +++++++++++---- .../SubmitEndpointSingleSegmentedTest.kt | 10 ++----- .../submission/SubmitEndpointTest.kt | 29 +++++++++++++++++++ 3 files changed, 50 insertions(+), 13 
deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt b/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt index aba0c07968..3badf43c18 100644 --- a/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt +++ b/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt @@ -135,7 +135,7 @@ class SubmitModel( if (requiresConsensusSequenceFile(submissionParams.organism)) { log.debug { "Validating submission with uploadId $uploadId" } val sequenceSubmissionIds = uploadDatabaseService.getSequenceUploadSubmissionIds(uploadId).toSet() - mapMetadataKeysToSequenceKeys(metadataSubmissionIds, sequenceSubmissionIds) + mapMetadataKeysToSequenceKeys(metadataSubmissionIds, sequenceSubmissionIds, submissionParams.organism) } if (submissionParams is SubmissionParams.RevisionSubmissionParams) { @@ -361,13 +361,23 @@ class SubmitModel( } @Transactional - private fun mapMetadataKeysToSequenceKeys(metadataKeysSet: Set, sequenceKeysSet: Set) { + private fun mapMetadataKeysToSequenceKeys( + metadataKeysSet: Set, + sequenceKeysSet: Set, + organism: Organism, + ) { val metadataKeyToSequences = mutableMapOf>() val unmatchedSequenceKeys = mutableSetOf() val ambiguousSequenceKeys = mutableMapOf>() + val referenceGenome = backendConfig.getInstanceConfig(organism).referenceGenome + for (seqKey in sequenceKeysSet) { - val baseKey = seqKey.removeSuffixPattern() + val baseKey = if (referenceGenome.nucleotideSequences.size == 1) { + seqKey + } else { + seqKey.removeSuffixPattern() + } val matchedMetadataKey = when { metadataKeysSet.contains(seqKey) -> seqKey metadataKeysSet.contains(baseKey) -> baseKey @@ -405,9 +415,11 @@ class SubmitModel( } val ambiguousSequenceText = if (ambiguousSequenceKeys.isNotEmpty()) { "Sequence file contains ${ambiguousSequenceKeys.size} ids that could be matched to multiple metadata " + - "keys, e.g. 
${ambiguousSequenceKeys.entries.joinToString(limit = 3) { (key, value) -> - "Sequence key: $key matches $value" - }} " + + "keys, e.g. ${ + ambiguousSequenceKeys.entries.joinToString(limit = 3) { (key, value) -> + "Sequence key: $key matches $value" + } + } " + "- to avoid future issues we recommend not using the separator `_` in your metadata submissionIds;" } else { "" diff --git a/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointSingleSegmentedTest.kt b/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointSingleSegmentedTest.kt index ccae2cc372..e0b651ecf6 100644 --- a/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointSingleSegmentedTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointSingleSegmentedTest.kt @@ -64,24 +64,20 @@ class SubmitEndpointSingleSegmentedTest( } @Test - fun `GIVEN input data with ambiguous submissionIds THEN data is rejected`() { + fun `GIVEN input data with explicit default segment name THEN data is rejected`() { val groupId = groupManagementClient.createNewGroup().andGetGroupId() - val expectedDetail = "Sequence file contains 1 ids that could be matched to multiple metadata keys" + - ", e.g. 
Sequence key: header1_2 matches [header1_2, header1]" + val expectedDetail = "Metadata file contains 1 ids that are not present in the sequence file: header1" submissionControllerClient.submit( SubmitFiles.metadataFileWith( content = """ submissionId firstColumn header1 someValue - header1_2 someValue """.trimIndent(), ), SubmitFiles.sequenceFileWith( content = """ - >header1_2 - AC - >header1 + >header1_$DEFAULT_SEQUENCE_NAME AC """.trimIndent(), ), diff --git a/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointTest.kt b/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointTest.kt index b6ba724c0f..c57647ce0d 100644 --- a/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointTest.kt @@ -153,6 +153,35 @@ class SubmitEndpointTest( .andExpect(jsonPath("\$[0].version").value(1)) } + @Test + fun `GIVEN input data with ambiguous submissionIds THEN data is rejected`() { + val expectedDetail = "Sequence file contains 1 ids that could be matched to multiple metadata keys" + + ", e.g. 
Sequence key: header1_2 matches [header1_2, header1]" + + submissionControllerClient.submit( + SubmitFiles.metadataFileWith( + content = """ + submissionId firstColumn + header1 someValue + header1_2 someValue + """.trimIndent(), + ), + SubmitFiles.sequenceFileWith( + content = """ + >header1_2 + AC + >header1 + AC + """.trimIndent(), + ), + groupId = groupId, + organism = OTHER_ORGANISM, + ) + .andExpect(status().isUnprocessableEntity) + .andExpect(jsonPath("\$.title").value("Unprocessable Entity")) + .andExpect(jsonPath("\$.detail", containsString(expectedDetail))) + } + @Test fun `GIVEN submission without data use terms THEN returns an error`() { submissionControllerClient.submitWithoutDataUseTerms( From 0c3dff2e40b154274782e44fb732dd13c1d734ac Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Mon, 11 Aug 2025 17:50:51 +0200 Subject: [PATCH 22/26] for kicks --- .../org/loculus/backend/model/SubmitModel.kt | 29 +++++++++++-------- integration-tests/tests/pages/review.page.ts | 2 +- .../features/search/lineage-field.spec.ts | 2 +- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt b/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt index 3badf43c18..2690d403d7 100644 --- a/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt +++ b/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt @@ -373,19 +373,24 @@ class SubmitModel( val referenceGenome = backendConfig.getInstanceConfig(organism).referenceGenome for (seqKey in sequenceKeysSet) { - val baseKey = if (referenceGenome.nucleotideSequences.size == 1) { - seqKey + val matchedMetadataKey = if (referenceGenome.nucleotideSequences.size == 1) { + val seqKeyInMeta = metadataKeysSet.contains(seqKey) + when { + seqKeyInMeta -> seqKey + else -> null + } } else { - seqKey.removeSuffixPattern() - } - val matchedMetadataKey = when { - 
metadataKeysSet.contains(seqKey) -> seqKey - metadataKeysSet.contains(baseKey) -> baseKey - else -> null - } - - if ((seqKey != baseKey) && metadataKeysSet.contains(seqKey) && metadataKeysSet.contains(baseKey)) { - ambiguousSequenceKeys[seqKey] = listOf(seqKey, baseKey) + val baseKey = seqKey.removeSuffixPattern() + val seqKeyInMeta = metadataKeysSet.contains(seqKey) + val baseKeyInMeta = metadataKeysSet.contains(baseKey) + if ((seqKey != baseKey) && seqKeyInMeta && baseKeyInMeta) { + ambiguousSequenceKeys[seqKey] = listOf(seqKey, baseKey) + } + when { + seqKeyInMeta -> seqKey + baseKeyInMeta -> baseKey + else -> null + } } if (matchedMetadataKey != null) { diff --git a/integration-tests/tests/pages/review.page.ts b/integration-tests/tests/pages/review.page.ts index 079037bc05..1b26860d58 100644 --- a/integration-tests/tests/pages/review.page.ts +++ b/integration-tests/tests/pages/review.page.ts @@ -22,7 +22,7 @@ export class ReviewPage { async waitForZeroProcessing() { await expect(this.page.locator('[data-testid="review-page-control-panel"]')).toContainText( '0 awaiting processing', - { timeout: 100000 }, + { timeout: 200000 }, ); } diff --git a/integration-tests/tests/specs/features/search/lineage-field.spec.ts b/integration-tests/tests/specs/features/search/lineage-field.spec.ts index bfe8b6d373..6cef8fab28 100644 --- a/integration-tests/tests/specs/features/search/lineage-field.spec.ts +++ b/integration-tests/tests/specs/features/search/lineage-field.spec.ts @@ -7,7 +7,7 @@ import { v4 as uuidv4 } from 'uuid'; const SEQUENCE = 'ATTGATCTCATCATTT'; test('Lineage field lineage counts', async ({ page, pageWithGroup }) => { - test.setTimeout(100_000); + test.setTimeout(900_000); const uuid = uuidv4(); await page.goto('/'); From 49140691933d2d21a68431a8683f712e92bea488 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Tue, 12 Aug 2025 09:14:33 +0200 Subject: [PATCH 23/26] feat(prepro): update specification docs 
--- preprocessing/specification.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/preprocessing/specification.md b/preprocessing/specification.md index 00d4ffb645..8b9df92140 100644 --- a/preprocessing/specification.md +++ b/preprocessing/specification.md @@ -43,6 +43,8 @@ In the unprocessed NDJSON, each line contains a sequence entry represented as a {"accession": 2, "version": 1, "data": {"metadata": {...}, "unalignedNucleotideSequences": {...}, "files": {...}}, "submitter": john_smith, ...} ``` +The `unalignedNucleotideSequences` field is a dictionary from the fasta header (of the form `<submissionId>_<segmentName>` or `<submissionId>`, where `submissionId` is the metadata submission Id) of the submitted entry to the sequence. + The `metadata` field contains a flat JSON object in which all values are strings. The fields and values correspond to the columns and values as provided by the submitter. Fields present in the metadata file header but left empty by the submitter will be returned as "". Only columns present in the submission metadata file will be keys in the JSON object - independent of the metadata schema. The `files` field contains a JSON object in which the keys are the names of the file categories and the values are lists of objects containing file IDs, names and URLs. This is only available if the Loculus instance and organism are configured to support files. The URLs are S3 presigned, read-only URLs that are only valid for a limited amount of time.
From e160c4d3deac4a5e566c5be8c087333962a55782 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Tue, 12 Aug 2025 15:34:02 +0200 Subject: [PATCH 24/26] feat(prepro): try to fix integration tests --- .../nextclade/src/loculus_preprocessing/prepro.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 351ad22921..f9b99c0d7b 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -238,6 +238,8 @@ def assign_segment( ): valid_segments = set() duplicate_segments = set() + if not config.nucleotideSequences: + return (unaligned_nucleotide_sequences, errors) if not config.multi_segment: if len(input_unaligned_sequences) > 1: errors.append( @@ -711,7 +713,12 @@ def process_single( # noqa: C901 else: submitter = unprocessed.submitter group_id = unprocessed.group_id - unaligned_nucleotide_sequences = unprocessed.unalignedNucleotideSequences + unaligned_nucleotide_sequences, errors = assign_segment( + input_unaligned_sequences=unprocessed.unalignedNucleotideSequences, + unaligned_nucleotide_sequences={}, + errors=errors, + config=config, + ) for segment in config.nucleotideSequences: sequence = unaligned_nucleotide_sequences.get(segment, None) From 278d08519e0f6e9fc961ea9334dd11910b3d94d2 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Tue, 12 Aug 2025 15:48:51 +0200 Subject: [PATCH 25/26] feat: revert changes that are not required --- integration-tests/tests/pages/review.page.ts | 2 +- .../tests/specs/features/search/lineage-field.spec.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/integration-tests/tests/pages/review.page.ts b/integration-tests/tests/pages/review.page.ts index 1b26860d58..8a50b6d63e 100644 
--- a/integration-tests/tests/pages/review.page.ts +++ b/integration-tests/tests/pages/review.page.ts @@ -22,7 +22,7 @@ export class ReviewPage { async waitForZeroProcessing() { await expect(this.page.locator('[data-testid="review-page-control-panel"]')).toContainText( '0 awaiting processing', - { timeout: 200000 }, + { timeout: 60000 }, ); } diff --git a/integration-tests/tests/specs/features/search/lineage-field.spec.ts b/integration-tests/tests/specs/features/search/lineage-field.spec.ts index 6cef8fab28..6d31683df6 100644 --- a/integration-tests/tests/specs/features/search/lineage-field.spec.ts +++ b/integration-tests/tests/specs/features/search/lineage-field.spec.ts @@ -7,7 +7,7 @@ import { v4 as uuidv4 } from 'uuid'; const SEQUENCE = 'ATTGATCTCATCATTT'; test('Lineage field lineage counts', async ({ page, pageWithGroup }) => { - test.setTimeout(900_000); + test.setTimeout(95_000); const uuid = uuidv4(); await page.goto('/'); From c46e22bf6ee209019ced052faba93a4b8204396f Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Tue, 12 Aug 2025 16:07:23 +0200 Subject: [PATCH 26/26] try again --- .../nextclade/src/loculus_preprocessing/prepro.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index f9b99c0d7b..1148472cbe 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -239,7 +239,10 @@ def assign_segment( valid_segments = set() duplicate_segments = set() if not config.nucleotideSequences: - return (unaligned_nucleotide_sequences, errors) + return ( + unaligned_nucleotide_sequences, + errors, + ) if not config.multi_segment: if len(input_unaligned_sequences) > 1: errors.append( @@ -709,11 +712,10 @@ def process_single( # noqa: C901 submitter = 
unprocessed.inputMetadata["submitter"] group_id = int(str(unprocessed.inputMetadata["group_id"])) - unaligned_nucleotide_sequences = unprocessed.unalignedNucleotideSequences else: submitter = unprocessed.submitter group_id = unprocessed.group_id - unaligned_nucleotide_sequences, errors = assign_segment( + unprocessed.unalignedNucleotideSequences, errors = assign_segment( input_unaligned_sequences=unprocessed.unalignedNucleotideSequences, unaligned_nucleotide_sequences={}, errors=errors, @@ -721,7 +723,7 @@ def process_single( # noqa: C901 ) for segment in config.nucleotideSequences: - sequence = unaligned_nucleotide_sequences.get(segment, None) + sequence = unprocessed.unalignedNucleotideSequences.get(segment, None) key = "length" if segment == "main" else "length_" + segment if key in config.processing_spec: output_metadata[key] = len(sequence) if sequence else 0