diff --git a/backend/docs/db/schema.sql b/backend/docs/db/schema.sql index 5bf4c965b8..8f7e8346b4 100644 --- a/backend/docs/db/schema.sql +++ b/backend/docs/db/schema.sql @@ -527,9 +527,9 @@ ALTER VIEW public.sequence_entries_view OWNER TO postgres; CREATE TABLE public.sequence_upload_aux_table ( upload_id text NOT NULL, - submission_id text NOT NULL, - segment_name text NOT NULL, - compressed_sequence_data text NOT NULL + compressed_sequence_data text NOT NULL, + sequence_submission_id text NOT NULL, + metadata_submission_id text ); @@ -718,7 +718,7 @@ ALTER TABLE ONLY public.sequence_entries_preprocessed_data -- ALTER TABLE ONLY public.sequence_upload_aux_table - ADD CONSTRAINT sequence_upload_aux_table_pkey PRIMARY KEY (upload_id, submission_id, segment_name); + ADD CONSTRAINT sequence_upload_aux_table_pkey PRIMARY KEY (upload_id, sequence_submission_id); -- diff --git a/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt b/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt index cf447175b7..2690d403d7 100644 --- a/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt +++ b/backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt @@ -4,6 +4,8 @@ import mu.KotlinLogging import org.apache.commons.compress.archivers.zip.ZipFile import org.apache.commons.compress.compressors.CompressorStreamFactory import org.jetbrains.exposed.exceptions.ExposedSQLException +import org.jetbrains.exposed.sql.transactions.transaction +import org.jetbrains.exposed.sql.update import org.loculus.backend.api.DataUseTerms import org.loculus.backend.api.Organism import org.loculus.backend.api.SubmissionIdFilesMap @@ -21,6 +23,7 @@ import org.loculus.backend.service.groupmanagement.GroupManagementPreconditionVa import org.loculus.backend.service.submission.CompressionAlgorithm import org.loculus.backend.service.submission.MetadataUploadAuxTable import org.loculus.backend.service.submission.SequenceUploadAuxTable +import 
org.loculus.backend.service.submission.SequenceUploadAuxTable.metadataSubmissionIdColumn import org.loculus.backend.service.submission.SubmissionIdFilesMappingPreconditionValidator import org.loculus.backend.service.submission.UploadDatabaseService import org.loculus.backend.utils.DateProvider @@ -132,7 +135,7 @@ class SubmitModel( if (requiresConsensusSequenceFile(submissionParams.organism)) { log.debug { "Validating submission with uploadId $uploadId" } val sequenceSubmissionIds = uploadDatabaseService.getSequenceUploadSubmissionIds(uploadId).toSet() - validateSubmissionIdSetsForConsensusSequences(metadataSubmissionIds, sequenceSubmissionIds) + mapMetadataKeysToSequenceKeys(metadataSubmissionIds, sequenceSubmissionIds, submissionParams.organism) } if (submissionParams is SubmissionParams.RevisionSubmissionParams) { @@ -348,27 +351,100 @@ class SubmitModel( ) } - private fun validateSubmissionIdSetsForConsensusSequences( + private fun SubmissionId.removeSuffixPattern(): SubmissionId { + val lastDelimiter = this.lastIndexOf("_") + if (lastDelimiter == -1) { + return this + } + val cleaned = this.substring(0, lastDelimiter) + return cleaned + } + + @Transactional + private fun mapMetadataKeysToSequenceKeys( metadataKeysSet: Set, sequenceKeysSet: Set, + organism: Organism, ) { - val metadataKeysNotInSequences = metadataKeysSet.subtract(sequenceKeysSet) - val sequenceKeysNotInMetadata = sequenceKeysSet.subtract(metadataKeysSet) + val metadataKeyToSequences = mutableMapOf>() + val unmatchedSequenceKeys = mutableSetOf() + val ambiguousSequenceKeys = mutableMapOf>() + + val referenceGenome = backendConfig.getInstanceConfig(organism).referenceGenome + + for (seqKey in sequenceKeysSet) { + val matchedMetadataKey = if (referenceGenome.nucleotideSequences.size == 1) { + val seqKeyInMeta = metadataKeysSet.contains(seqKey) + when { + seqKeyInMeta -> seqKey + else -> null + } + } else { + val baseKey = seqKey.removeSuffixPattern() + val seqKeyInMeta = 
metadataKeysSet.contains(seqKey) + val baseKeyInMeta = metadataKeysSet.contains(baseKey) + if ((seqKey != baseKey) && seqKeyInMeta && baseKeyInMeta) { + ambiguousSequenceKeys[seqKey] = listOf(seqKey, baseKey) + } + when { + seqKeyInMeta -> seqKey + baseKeyInMeta -> baseKey + else -> null + } + } + + if (matchedMetadataKey != null) { + metadataKeyToSequences.computeIfAbsent(matchedMetadataKey) { mutableListOf() }.add(seqKey) + } else { + unmatchedSequenceKeys.add(seqKey) + } + } + + val metadataKeysWithoutSequences = metadataKeysSet.filterNot { metadataKeyToSequences.containsKey(it) } - if (metadataKeysNotInSequences.isNotEmpty() || sequenceKeysNotInMetadata.isNotEmpty()) { - val metadataNotPresentErrorText = if (metadataKeysNotInSequences.isNotEmpty()) { - "Metadata file contains ${metadataKeysNotInSequences.size} ids that are not present " + - "in the sequence file: " + metadataKeysNotInSequences.toList().joinToString(limit = 10) + "; " + if (unmatchedSequenceKeys.isNotEmpty() || metadataKeysWithoutSequences.isNotEmpty() || + ambiguousSequenceKeys.isNotEmpty() + ) { + val unmatchedSeqText = if (unmatchedSequenceKeys.isNotEmpty()) { + "Sequence file contains ${unmatchedSequenceKeys.size} ids that are not present in the metadata file: ${ + unmatchedSequenceKeys.joinToString(limit = 10) + }; " + } else { + "" + } + val unmatchedMetadataText = if (metadataKeysWithoutSequences.isNotEmpty()) { + "Metadata file contains ${metadataKeysWithoutSequences.size} ids that are not present in " + + "the sequence file: ${metadataKeysWithoutSequences.joinToString(limit = 10)};" } else { "" } - val sequenceNotPresentErrorText = if (sequenceKeysNotInMetadata.isNotEmpty()) { - "Sequence file contains ${sequenceKeysNotInMetadata.size} ids that are not present " + - "in the metadata file: " + sequenceKeysNotInMetadata.toList().joinToString(limit = 10) + val ambiguousSequenceText = if (ambiguousSequenceKeys.isNotEmpty()) { + "Sequence file contains ${ambiguousSequenceKeys.size} ids that 
could be matched to multiple metadata " + + "keys, e.g. ${ + ambiguousSequenceKeys.entries.joinToString(limit = 3) { (key, value) -> + "Sequence key: $key matches $value" + } + } " + + "- to avoid future issues we recommend not using the separator `_` in your metadata submissionIds;" } else { "" } - throw UnprocessableEntityException(metadataNotPresentErrorText + sequenceNotPresentErrorText) + throw UnprocessableEntityException(unmatchedSeqText + unmatchedMetadataText + ambiguousSequenceText) + } + + transaction { + for ((metadataSubmissionId, sequenceSubmissionIds) in metadataKeyToSequences) { + for (sequenceSubmissionId in sequenceSubmissionIds) { + SequenceUploadAuxTable.update( + { + SequenceUploadAuxTable.sequenceSubmissionIdColumn eq + sequenceSubmissionId + }, + ) { + it[metadataSubmissionIdColumn] = metadataSubmissionId + } + } + } } } diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/SequenceUploadAuxTable.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/SequenceUploadAuxTable.kt index 9f931ac91b..e17006b49a 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/SequenceUploadAuxTable.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/SequenceUploadAuxTable.kt @@ -7,9 +7,9 @@ const val SEQUENCE_UPLOAD_AUX_TABLE_NAME = "sequence_upload_aux_table" object SequenceUploadAuxTable : Table(SEQUENCE_UPLOAD_AUX_TABLE_NAME) { val sequenceUploadIdColumn = varchar("upload_id", 255) - val sequenceSubmissionIdColumn = varchar("submission_id", 255) - val segmentNameColumn = varchar("segment_name", 255) + val sequenceSubmissionIdColumn = varchar("sequence_submission_id", 255) + val metadataSubmissionIdColumn = varchar("metadata_submission_id", 255).nullable() val compressedSequenceDataColumn = jacksonSerializableJsonb("compressed_sequence_data") - override val primaryKey = PrimaryKey(sequenceUploadIdColumn, sequenceSubmissionIdColumn, segmentNameColumn) + override val primaryKey = 
PrimaryKey(sequenceUploadIdColumn, sequenceSubmissionIdColumn) } diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt index ca9cef8919..c1358b41c3 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/UploadDatabaseService.kt @@ -30,13 +30,12 @@ import org.loculus.backend.service.submission.MetadataUploadAuxTable.submitterCo import org.loculus.backend.service.submission.MetadataUploadAuxTable.uploadIdColumn import org.loculus.backend.service.submission.MetadataUploadAuxTable.uploadedAtColumn import org.loculus.backend.service.submission.SequenceUploadAuxTable.compressedSequenceDataColumn -import org.loculus.backend.service.submission.SequenceUploadAuxTable.segmentNameColumn +import org.loculus.backend.service.submission.SequenceUploadAuxTable.metadataSubmissionIdColumn import org.loculus.backend.service.submission.SequenceUploadAuxTable.sequenceSubmissionIdColumn import org.loculus.backend.service.submission.SequenceUploadAuxTable.sequenceUploadIdColumn import org.loculus.backend.utils.DatabaseConstants import org.loculus.backend.utils.FastaEntry import org.loculus.backend.utils.MetadataEntry -import org.loculus.backend.utils.ParseFastaHeader import org.loculus.backend.utils.RevisionEntry import org.loculus.backend.utils.chunkedForDatabase import org.loculus.backend.utils.getNextSequenceNumbers @@ -54,7 +53,6 @@ private const val METADATA_BATCH_SIZE = DatabaseConstants.POSTGRESQL_PARAMETER_L @Service @Transactional class UploadDatabaseService( - private val parseFastaHeader: ParseFastaHeader, private val compressor: CompressionService, private val accessionPreconditionValidator: AccessionPreconditionValidator, private val dataUseTermsDatabaseService: DataUseTermsDatabaseService, @@ -114,13 +112,12 @@ class 
UploadDatabaseService( ) { uploadedSequencesBatch.chunkedForDatabase({ batch -> SequenceUploadAuxTable.batchInsert(batch) { - val (submissionId, segmentName) = parseFastaHeader.parse(it.sampleName, submittedOrganism) - this[sequenceSubmissionIdColumn] = submissionId - this[segmentNameColumn] = segmentName + this[sequenceSubmissionIdColumn] = it.sampleName + this[metadataSubmissionIdColumn] = null this[sequenceUploadIdColumn] = uploadId this[compressedSequenceDataColumn] = compressor.compressNucleotideSequence( it.sequence, - segmentName, + it.sampleName, submittedOrganism, ) } @@ -176,9 +173,9 @@ class UploadDatabaseService( 'unalignedNucleotideSequences', COALESCE( jsonb_object_agg( - sequence_upload_aux_table.segment_name, + sequence_upload_aux_table.sequence_submission_id, sequence_upload_aux_table.compressed_sequence_data::jsonb - ) FILTER (WHERE sequence_upload_aux_table.segment_name IS NOT NULL), + ) FILTER (WHERE sequence_upload_aux_table.sequence_submission_id IS NOT NULL), '{}'::jsonb ) ) @@ -187,7 +184,7 @@ class UploadDatabaseService( LEFT JOIN sequence_upload_aux_table ON metadata_upload_aux_table.upload_id = sequence_upload_aux_table.upload_id - AND metadata_upload_aux_table.submission_id = sequence_upload_aux_table.submission_id + AND metadata_upload_aux_table.submission_id = sequence_upload_aux_table.metadata_submission_id WHERE metadata_upload_aux_table.upload_id = ? 
GROUP BY metadata_upload_aux_table.upload_id, diff --git a/backend/src/main/kotlin/org/loculus/backend/utils/ParseFastaHeader.kt b/backend/src/main/kotlin/org/loculus/backend/utils/ParseFastaHeader.kt deleted file mode 100644 index 4815e61e2b..0000000000 --- a/backend/src/main/kotlin/org/loculus/backend/utils/ParseFastaHeader.kt +++ /dev/null @@ -1,40 +0,0 @@ -package org.loculus.backend.utils - -import org.loculus.backend.api.Organism -import org.loculus.backend.config.BackendConfig -import org.loculus.backend.controller.BadRequestException -import org.loculus.backend.model.HEADER_TO_CONNECT_METADATA_AND_SEQUENCES -import org.loculus.backend.model.SegmentName -import org.loculus.backend.model.SubmissionId -import org.springframework.stereotype.Service - -@Service -class ParseFastaHeader(private val backendConfig: BackendConfig) { - fun parse(submissionId: String, organism: Organism): Pair { - val referenceGenome = backendConfig.getInstanceConfig(organism).referenceGenome - - if (referenceGenome.nucleotideSequences.size == 1) { - return Pair(submissionId, "main") - } - - val validSegmentIds = referenceGenome.nucleotideSequences.map { it.name } - - val lastDelimiter = submissionId.lastIndexOf("_") - if (lastDelimiter == -1) { - throw BadRequestException( - "The FASTA header $submissionId does not contain the segment name. Please provide the" + - " segment name in the format <$HEADER_TO_CONNECT_METADATA_AND_SEQUENCES>_", - ) - } - val isolateId = submissionId.substring(0, lastDelimiter) - val segmentId = submissionId.substring(lastDelimiter + 1) - if (!validSegmentIds.contains(segmentId)) { - throw BadRequestException( - "The FASTA header $submissionId ends with the segment name $segmentId, which is not valid. 
" + - "Valid segment names: ${validSegmentIds.joinToString(", ")}", - ) - } - - return Pair(isolateId, segmentId) - } -} diff --git a/backend/src/main/resources/db/migration/V1.16__rename_aux_table_columns.sql b/backend/src/main/resources/db/migration/V1.16__rename_aux_table_columns.sql new file mode 100644 index 0000000000..a785a6a2f8 --- /dev/null +++ b/backend/src/main/resources/db/migration/V1.16__rename_aux_table_columns.sql @@ -0,0 +1,20 @@ +alter table sequence_upload_aux_table drop CONSTRAINT sequence_upload_aux_table_pkey; + +alter table sequence_upload_aux_table add column sequence_submission_id text; +alter table sequence_upload_aux_table add column metadata_submission_id text; + +update sequence_upload_aux_table +set metadata_submission_id = submission_id +where metadata_submission_id is null; + +update sequence_upload_aux_table +set sequence_submission_id = case + when segment_name is NULL or segment_name = '' then submission_id + else submission_id || '_' || segment_name +end +where sequence_submission_id is NULL; + +alter table sequence_upload_aux_table drop column segment_name; +alter table sequence_upload_aux_table drop column submission_id; + +alter table sequence_upload_aux_table add PRIMARY KEY (upload_id, sequence_submission_id); diff --git a/backend/src/test/kotlin/org/loculus/backend/controller/submission/PreparedOriginalData.kt b/backend/src/test/kotlin/org/loculus/backend/controller/submission/PreparedOriginalData.kt index 20f05380bc..525cab15b0 100644 --- a/backend/src/test/kotlin/org/loculus/backend/controller/submission/PreparedOriginalData.kt +++ b/backend/src/test/kotlin/org/loculus/backend/controller/submission/PreparedOriginalData.kt @@ -11,7 +11,7 @@ val defaultOriginalData = OriginalData( "country" to "Switzerland", "division" to "Bern", ), - mapOf("main" to "ACTG"), + mapOf("custom0" to "ACTG"), ) val emptyOriginalData = OriginalData(metadata = emptyMap(), unalignedNucleotideSequences = emptyMap()) diff --git 
a/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointSingleSegmentedTest.kt b/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointSingleSegmentedTest.kt index 45c0595cdc..e0b651ecf6 100644 --- a/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointSingleSegmentedTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointSingleSegmentedTest.kt @@ -60,7 +60,7 @@ class SubmitEndpointSingleSegmentedTest( .data .unalignedNucleotideSequences - assertThat(unalignedNucleotideSequences, hasEntry(DEFAULT_SEQUENCE_NAME, "AC")) + assertThat(unalignedNucleotideSequences, hasEntry("header1", "AC")) } @Test diff --git a/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointTest.kt b/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointTest.kt index a533c503e5..c57647ce0d 100644 --- a/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitEndpointTest.kt @@ -154,43 +154,43 @@ class SubmitEndpointTest( } @Test - fun `GIVEN submission without data use terms THEN returns an error`() { - submissionControllerClient.submitWithoutDataUseTerms( - DefaultFiles.metadataFile, - DefaultFiles.sequencesFileMultiSegmented, - organism = OTHER_ORGANISM, - groupId = groupId, - ) - .andExpect(status().isBadRequest) - } + fun `GIVEN input data with ambiguous submissionIds THEN data is rejected`() { + val expectedDetail = "Sequence file contains 1 ids that could be matched to multiple metadata keys" + + ", e.g. 
Sequence key: header1_2 matches [header1_2, header1]" - @Test - fun `GIVEN fasta data with unknown segment THEN data is not accepted`() { submissionControllerClient.submit( SubmitFiles.metadataFileWith( content = """ submissionId firstColumn - commonHeader someValue + header1 someValue + header1_2 someValue """.trimIndent(), ), SubmitFiles.sequenceFileWith( content = """ - >commonHeader_nonExistingSegmentName + >header1_2 + AC + >header1 AC """.trimIndent(), ), + groupId = groupId, + organism = OTHER_ORGANISM, + ) + .andExpect(status().isUnprocessableEntity) + .andExpect(jsonPath("\$.title").value("Unprocessable Entity")) + .andExpect(jsonPath("\$.detail", containsString(expectedDetail))) + } + + @Test + fun `GIVEN submission without data use terms THEN returns an error`() { + submissionControllerClient.submitWithoutDataUseTerms( + DefaultFiles.metadataFile, + DefaultFiles.sequencesFileMultiSegmented, organism = OTHER_ORGANISM, groupId = groupId, ) .andExpect(status().isBadRequest) - .andExpect(content().contentType(APPLICATION_PROBLEM_JSON)) - .andExpect( - jsonPath( - "\$.detail", - ).value( - "The FASTA header commonHeader_nonExistingSegmentName ends with the segment name nonExistingSegmentName, which is not valid. Valid segment names: notOnlySegment, secondSegment", - ), - ) } @Test @@ -406,9 +406,15 @@ class SubmitEndpointTest( status().isBadRequest, "Bad Request", "${metadataFileTypes.displayName} has wrong extension. 
Must be " + - ".${metadataFileTypes.validExtensions.joinToString(", .")} for uncompressed submissions or " + ".${ - metadataFileTypes.getCompressedExtensions().filterKeys { it != CompressionAlgorithm.NONE } + metadataFileTypes.validExtensions.joinToString( + ", .", + ) + } for uncompressed submissions or " + + ".${ + metadataFileTypes.getCompressedExtensions().filterKeys { + it != CompressionAlgorithm.NONE + } .flatMap { it.value }.joinToString(", .") } for compressed submissions", DEFAULT_ORGANISM, @@ -421,9 +427,15 @@ class SubmitEndpointTest( status().isBadRequest, "Bad Request", "${sequenceFileTypes.displayName} has wrong extension. Must be " + - ".${sequenceFileTypes.validExtensions.joinToString(", .")} for uncompressed submissions or " + ".${ - sequenceFileTypes.getCompressedExtensions().filterKeys { it != CompressionAlgorithm.NONE } + sequenceFileTypes.validExtensions.joinToString( + ", .", + ) + } for uncompressed submissions or " + + ".${ + sequenceFileTypes.getCompressedExtensions().filterKeys { + it != CompressionAlgorithm.NONE + } .flatMap { it.value }.joinToString(", .") } for compressed submissions", DEFAULT_ORGANISM, @@ -536,27 +548,6 @@ class SubmitEndpointTest( DEFAULT_ORGANISM, DataUseTerms.Open, ), - Arguments.of( - "FASTA header misses segment name", - SubmitFiles.metadataFileWith( - content = """ - submissionId firstColumn - commonHeader someValue - """.trimIndent(), - ), - SubmitFiles.sequenceFileWith( - content = """ - >commonHeader - AC - """.trimIndent(), - ), - status().isBadRequest, - "Bad Request", - "The FASTA header commonHeader does not contain the segment name. 
Please provide the segment " + - "name in the format _", - OTHER_ORGANISM, - DataUseTerms.Open, - ), Arguments.of( "restricted use data with until date in the past", DefaultFiles.metadataFile, diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 13f9eef580..1148472cbe 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -230,7 +230,87 @@ def run_sort( return alerts -def enrich_with_nextclade( # noqa: C901, PLR0912, PLR0914, PLR0915 +def assign_segment( + input_unaligned_sequences: dict[str, NucleotideSequence | None], + unaligned_nucleotide_sequences: dict[SegmentName, NucleotideSequence | None], + errors: list[ProcessingAnnotation], + config: Config, +): + valid_segments = set() + duplicate_segments = set() + if not config.nucleotideSequences: + return ( + unaligned_nucleotide_sequences, + errors, + ) + if not config.multi_segment: + if len(input_unaligned_sequences) > 1: + errors.append( + ProcessingAnnotation.from_single( + ProcessingAnnotationAlignment, + AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + message=( + f"Multiple sequences: {list(input_unaligned_sequences.keys())} found in the" + f" input data, but organism: {config.organism} is single-segmented. " + "Please check that your metadata and sequences are annotated correctly." + "Each metadata entry should have a single corresponding fasta sequence " + "entry with the same submissionId." 
+ ), + ) + ) + else: + _, value = next(iter(input_unaligned_sequences.items())) + unaligned_nucleotide_sequences["main"] = value + return ( + unaligned_nucleotide_sequences, + errors, + ) + for segment in config.nucleotideSequences: + unaligned_segment = [ + data + for data in input_unaligned_sequences + if re.match(segment + "$", data.split("_")[-1], re.IGNORECASE) + or re.match( + segment + "$", data, re.IGNORECASE + ) # backward compatibility allow only segment name in submission dict + ] + if len(unaligned_segment) > 1: + duplicate_segments.update(unaligned_segment) + errors.append( + ProcessingAnnotation.from_single( + ProcessingAnnotationAlignment, + AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + message=( + f"Found multiple sequences with the same segment name: {segment}. " + "Each metadata entry can have multiple corresponding fasta sequence " + "entries with format _." + ), + ) + ) + elif len(unaligned_segment) == 1: + valid_segments.add(unaligned_segment[0]) + unaligned_nucleotide_sequences[segment] = input_unaligned_sequences[ + unaligned_segment[0] + ] + remaining_segments = set(input_unaligned_sequences.keys()) - valid_segments - duplicate_segments + if len(remaining_segments) > 0: + errors.append( + ProcessingAnnotation.from_single( + ProcessingAnnotationAlignment, + AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + message=( + f"Found sequences in the input data with segments that are not in the config: " + f"{', '.join(remaining_segments)}. " + "Each metadata entry can have multiple corresponding fasta sequence " + "entries with format _ valid segments are: " + f"{', '.join(config.nucleotideSequences)}." 
+ ), + ) + ) + return (unaligned_nucleotide_sequences, errors) + + +def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915 unprocessed: Sequence[UnprocessedEntry], dataset_dir: str, config: Config ) -> dict[AccessionVersion, UnprocessedAfterNextclade]: """ @@ -268,45 +348,15 @@ def enrich_with_nextclade( # noqa: C901, PLR0912, PLR0914, PLR0915 aligned_nucleotide_sequences[id] = {} alerts.warnings[id] = [] alerts.errors[id] = [] - num_valid_segments = 0 - num_duplicate_segments = 0 - for segment in config.nucleotideSequences: - unaligned_segment = [ - data - for data in entry.data.unalignedNucleotideSequences - if re.match(segment + "$", data, re.IGNORECASE) - ] - if len(unaligned_segment) > 1: - num_duplicate_segments += len(unaligned_segment) - alerts.errors[id].append( - ProcessingAnnotation.from_single( - segment, - AnnotationSourceType.NUCLEOTIDE_SEQUENCE, - message="Found multiple sequences with the same segment name.", - ), - ) - elif len(unaligned_segment) == 1: - num_valid_segments += 1 - unaligned_nucleotide_sequences[id][segment] = ( - entry.data.unalignedNucleotideSequences[unaligned_segment[0]] - ) - aligned_nucleotide_sequences[id][segment] = None - if ( - len(entry.data.unalignedNucleotideSequences) - - num_valid_segments - - num_duplicate_segments - > 0 - ): - alerts.errors[id].append( - ProcessingAnnotation.from_single( - ProcessingAnnotationAlignment, - AnnotationSourceType.NUCLEOTIDE_SEQUENCE, - message=( - "Found unknown segments in the input data - " - "check your segments are annotated correctly." 
- ), - ), - ) + ( + unaligned_nucleotide_sequences[id], + alerts.errors[id], + ) = assign_segment( + input_unaligned_sequences=entry.data.unalignedNucleotideSequences, + unaligned_nucleotide_sequences=unaligned_nucleotide_sequences[id], + errors=alerts.errors[id], + config=config, + ) nextclade_metadata: defaultdict[ AccessionVersion, defaultdict[SegmentName, dict[str, Any] | None] @@ -488,7 +538,7 @@ def add_input_metadata( nextclade_prefix = "nextclade." if input_path.startswith(nextclade_prefix): segment = str(spec.args["segment"]) if spec.args and "segment" in spec.args else "main" - if not unprocessed.nextcladeMetadata: + if not unprocessed.nextcladeMetadata and unprocessed.unalignedNucleotideSequences: # This field should never be empty message = ( "An unknown internal error occurred while aligning sequences, " @@ -501,7 +551,7 @@ def add_input_metadata( ) return None sub_path = input_path[len(nextclade_prefix) :] - if segment in unprocessed.nextcladeMetadata: + if unprocessed.nextcladeMetadata and segment in unprocessed.nextcladeMetadata: if not unprocessed.nextcladeMetadata[segment]: message = ( "Nucleotide sequence failed to align" @@ -662,14 +712,18 @@ def process_single( # noqa: C901 submitter = unprocessed.inputMetadata["submitter"] group_id = int(str(unprocessed.inputMetadata["group_id"])) - unaligned_nucleotide_sequences = unprocessed.unalignedNucleotideSequences else: submitter = unprocessed.submitter group_id = unprocessed.group_id - unaligned_nucleotide_sequences = unprocessed.unalignedNucleotideSequences + unprocessed.unalignedNucleotideSequences, errors = assign_segment( + input_unaligned_sequences=unprocessed.unalignedNucleotideSequences, + unaligned_nucleotide_sequences={}, + errors=errors, + config=config, + ) for segment in config.nucleotideSequences: - sequence = unaligned_nucleotide_sequences.get(segment, None) + sequence = unprocessed.unalignedNucleotideSequences.get(segment, None) key = "length" if segment == "main" else "length_" + 
segment if key in config.processing_spec: output_metadata[key] = len(sequence) if sequence else 0 diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py index 5c4fd621f1..ba10a24c3d 100644 --- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py +++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py @@ -213,7 +213,7 @@ def invalid_sequence() -> str: expected_warnings=[], expected_processed_alignment=ProcessedAlignment( unalignedNucleotideSequences={"main": invalid_sequence()}, - alignedNucleotideSequences={"main": None}, + alignedNucleotideSequences={}, nucleotideInsertions={}, alignedAminoAcidSequences={}, aminoAcidInsertions={}, @@ -420,7 +420,7 @@ def invalid_sequence() -> str: unalignedNucleotideSequences={ "ebola-sudan": invalid_sequence(), }, - alignedNucleotideSequences={"ebola-sudan": None}, + alignedNucleotideSequences={}, nucleotideInsertions={}, alignedAminoAcidSequences={}, aminoAcidInsertions={}, @@ -459,7 +459,6 @@ def invalid_sequence() -> str: "ebola-zaire": sequence_with_mutation("ebola-zaire"), }, alignedNucleotideSequences={ - "ebola-sudan": None, "ebola-zaire": sequence_with_mutation("ebola-zaire"), }, nucleotideInsertions={}, @@ -508,7 +507,7 @@ def invalid_sequence() -> str: unalignedNucleotideSequences={ "ebola-sudan": invalid_sequence(), }, - alignedNucleotideSequences={"ebola-sudan": None}, + alignedNucleotideSequences={}, nucleotideInsertions={}, alignedAminoAcidSequences={}, aminoAcidInsertions={}, @@ -547,7 +546,6 @@ def invalid_sequence() -> str: "ebola-zaire": sequence_with_mutation("ebola-zaire"), }, alignedNucleotideSequences={ - "ebola-sudan": None, "ebola-zaire": sequence_with_mutation("ebola-zaire"), }, nucleotideInsertions={}, @@ -560,6 +558,196 @@ def invalid_sequence() -> str: ), ] +segment_validation_tests_single_segment = [ + Case( + name="accept any key for single segment", + input_metadata={}, + 
input_sequence={"randomKey": sequence_with_mutation("single")}, + accession_id="1", + expected_metadata={ + "completeness": 1.0, + "totalInsertedNucs": 0, + "totalSnps": 1, + "totalDeletedNucs": 0, + "length": len(consensus_sequence("single")), + }, + expected_errors=[], + expected_warnings=[], + expected_processed_alignment=ProcessedAlignment( + unalignedNucleotideSequences={"main": sequence_with_mutation("single")}, + alignedNucleotideSequences={"main": sequence_with_mutation("single")}, + nucleotideInsertions={}, + alignedAminoAcidSequences={ + "NPEbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "NP"), + "VP35EbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "VP35"), + }, + aminoAcidInsertions={}, + ), + ), + Case( + name="do not accept multiple segments for single segment", + input_metadata={}, + input_sequence={ + "main": sequence_with_mutation("single"), + "randomKey": sequence_with_mutation("single"), + }, + accession_id="2", + expected_metadata={ + "completeness": None, + "totalInsertedNucs": None, + "totalSnps": None, + "totalDeletedNucs": None, + "length": 0, + }, + expected_errors=[ + ProcessingAnnotationHelper( + [ProcessingAnnotationAlignment], + [ProcessingAnnotationAlignment], + "Multiple sequences: ['main', 'randomKey'] found in the" + " input data, but organism: ebola-sudan-test is single-segmented. " + "Please check that your metadata and sequences are annotated correctly." 
+ "Each metadata entry should have a single corresponding fasta sequence " + "entry with the same submissionId.", + AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + ), + ], + expected_warnings=[], + expected_processed_alignment=ProcessedAlignment( + unalignedNucleotideSequences={}, + alignedNucleotideSequences={}, + nucleotideInsertions={}, + alignedAminoAcidSequences={}, + aminoAcidInsertions={}, + ), + ), +] + +segment_validation_tests_multi_segments = [ + Case( + name="accept any prefix for multi-segment", + input_metadata={}, + input_sequence={ + "prefix_ebola-sudan": sequence_with_mutation("ebola-sudan"), + "other_prefix_ebola-zaire": sequence_with_mutation("ebola-zaire"), + }, + accession_id="1", + expected_metadata={ + "totalInsertedNucs_ebola-sudan": 0, + "totalSnps_ebola-sudan": 1, + "totalDeletedNucs_ebola-sudan": 0, + "length_ebola-sudan": len(consensus_sequence("ebola-sudan")), + "totalInsertedNucs_ebola-zaire": 0, + "totalSnps_ebola-zaire": 1, + "totalDeletedNucs_ebola-zaire": 0, + "length_ebola-zaire": len(consensus_sequence("ebola-zaire")), + }, + expected_errors=[], + expected_warnings=[], + expected_processed_alignment=ProcessedAlignment( + unalignedNucleotideSequences={ + "ebola-sudan": sequence_with_mutation("ebola-sudan"), + "ebola-zaire": sequence_with_mutation("ebola-zaire"), + }, + alignedNucleotideSequences={ + "ebola-sudan": sequence_with_mutation("ebola-sudan"), + "ebola-zaire": sequence_with_mutation("ebola-zaire"), + }, + nucleotideInsertions={}, + alignedAminoAcidSequences={ + "NPEbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "NP"), + "VP35EbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "VP35"), + "VP24EbolaZaire": ebola_zaire_aa(sequence_with_mutation("ebola-zaire"), "VP24"), + "LEbolaZaire": ebola_zaire_aa(sequence_with_mutation("ebola-zaire"), "L"), + }, + aminoAcidInsertions={}, + ), + ), + Case( + name="don't allow multiple segments with the same name", + input_metadata={}, + input_sequence={ + 
"ebola-sudan": invalid_sequence(), + "duplicate_ebola-sudan": invalid_sequence(), + }, + accession_id="1", + expected_metadata={ + "totalInsertedNucs_ebola-sudan": None, + "totalSnps_ebola-sudan": None, + "totalDeletedNucs_ebola-sudan": None, + "length_ebola-sudan": 0, + "totalInsertedNucs_ebola-zaire": None, + "totalSnps_ebola-zaire": None, + "totalDeletedNucs_ebola-zaire": None, + "length_ebola-zaire": 0, + }, + expected_errors=[ + ProcessingAnnotationHelper( + [ProcessingAnnotationAlignment], + [ProcessingAnnotationAlignment], + "Found multiple sequences with the same segment name: ebola-sudan. " + "Each metadata entry can have multiple corresponding fasta sequence " + "entries with format _.", + AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + ), + ProcessingAnnotationHelper( + [ProcessingAnnotationAlignment], + [ProcessingAnnotationAlignment], + "No segment aligned.", + AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + ), + ], + expected_warnings=[], + expected_processed_alignment=ProcessedAlignment( + unalignedNucleotideSequences={}, + alignedNucleotideSequences={}, + nucleotideInsertions={}, + alignedAminoAcidSequences={}, + aminoAcidInsertions={}, + ), + ), + Case( + name="don't allow unknown segments", + input_metadata={}, + input_sequence={ + "ebola-sudan": sequence_with_mutation("ebola-sudan"), + "unknown_segment": invalid_sequence(), + }, + accession_id="2", + expected_metadata={ + "totalInsertedNucs_ebola-sudan": 0, + "totalSnps_ebola-sudan": 1, + "totalDeletedNucs_ebola-sudan": 0, + "length_ebola-sudan": len(consensus_sequence("ebola-sudan")), + "totalInsertedNucs_ebola-zaire": None, + "totalSnps_ebola-zaire": None, + "totalDeletedNucs_ebola-zaire": None, + "length_ebola-zaire": 0, + }, + expected_errors=[ + ProcessingAnnotationHelper( + [ProcessingAnnotationAlignment], + [ProcessingAnnotationAlignment], + "Found sequences in the input data with segments that are not in the config: " + "unknown_segment. 
Each metadata entry can have multiple corresponding fasta sequence " + "entries with format <submissionId>_<segmentName> valid segments are: " + "ebola-sudan, ebola-zaire.", + AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + ) + ], + expected_warnings=[], + expected_processed_alignment=ProcessedAlignment( + unalignedNucleotideSequences={"ebola-sudan": sequence_with_mutation("ebola-sudan")}, + alignedNucleotideSequences={"ebola-sudan": sequence_with_mutation("ebola-sudan")}, + nucleotideInsertions={}, + alignedAminoAcidSequences={ + "NPEbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "NP"), + "VP35EbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "VP35"), + }, + aminoAcidInsertions={}, + ), + ), +] + def process_single_entry( test_case: ProcessingTestCase, config: Config, dataset_dir: str = "temp" @@ -569,7 +757,9 @@ def process_single_entry( @pytest.mark.parametrize( - "test_case_def", single_segment_case_definitions, ids=lambda tc: f"single segment {tc.name}" + "test_case_def", + single_segment_case_definitions + segment_validation_tests_single_segment, + ids=lambda tc: f"single segment {tc.name}", ) def test_preprocessing_single_segment(test_case_def: Case): config = get_config(SINGLE_SEGMENT_CONFIG, ignore_args=True) @@ -583,7 +773,9 @@ def test_preprocessing_single_segment(test_case_def: Case): @pytest.mark.parametrize( "test_case_def", - multi_segment_case_definitions + multi_segment_case_definitions_all_requirement, + multi_segment_case_definitions + + segment_validation_tests_multi_segments + + multi_segment_case_definitions_all_requirement, ids=lambda tc: f"multi segment {tc.name}", ) def test_preprocessing_multi_segment_all_requirement(test_case_def: Case): @@ -598,7 +790,9 @@ def test_preprocessing_multi_segment_all_requirement(test_case_def: Case): @pytest.mark.parametrize( "test_case_def", - multi_segment_case_definitions + multi_segment_case_definitions_any_requirement, + multi_segment_case_definitions + + segment_validation_tests_multi_segments + + 
multi_segment_case_definitions_any_requirement, ids=lambda tc: f"multi segment {tc.name}", ) def test_preprocessing_multi_segment_any_requirement(test_case_def: Case): diff --git a/preprocessing/specification.md b/preprocessing/specification.md index 00d4ffb645..8b9df92140 100644 --- a/preprocessing/specification.md +++ b/preprocessing/specification.md @@ -43,6 +43,8 @@ In the unprocessed NDJSON, each line contains a sequence entry represented as a {"accession": 2, "version": 1, "data": {"metadata": {...}, "unalignedNucleotideSequences": {...}, "files": {...}}, "submitter": john_smith, ...} ``` +The `unalignedNucleotideSequences` field is a dictionary from the fasta header (of the form `<submissionId>_<segmentName>` or `<submissionId>`, where `submissionId` is the metadata submission Id) of the submitted entry to the sequence. + The `metadata` field contains a flat JSON object in which all values are strings. The fields and values correspond to the columns and values as provided by the submitter. Fields present in the metadata file header but left empty by the submitter will be returned as "". Only columns present in the submission metadata file will be keys in the JSON object - independent of the metadata schema. The `files` field contains a JSON object in which the keys are the names of the file categories and the values are lists of objects containing file IDs, names and URLs. This is only available if the Loculus instance and organism are configured to support files. The URLs are S3 presigned, read-only URLs that are only valid for a limited amount of time. 
diff --git a/website/src/components/Edit/SequencesForm.tsx b/website/src/components/Edit/SequencesForm.tsx index 3af8c91a2b..1ff31794c3 100644 --- a/website/src/components/Edit/SequencesForm.tsx +++ b/website/src/components/Edit/SequencesForm.tsx @@ -52,14 +52,14 @@ export class EditableSequences { */ static fromInitialData(initialData: SequenceEntryToEdit, segmentNames: string[]): EditableSequences { const emptyRows = this.emptyRows(segmentNames); - const existingDataRows = Object.entries(initialData.originalData.unalignedNucleotideSequences).map( - ([key, value]) => ({ + const existingDataRows = Object.entries(initialData.processedData.unalignedNucleotideSequences) + .filter(([_, value]) => value !== null) + .map(([key, value]) => ({ key, - initialValue: value, - value: value, + initialValue: value!.toString(), + value: value!.toString(), ...mapErrorsAndWarnings(initialData, key, 'NucleotideSequence'), - }), - ); + })); const mergedRows: Row[] = []; // merge in this way to retain the order of segment names as they were given. emptyRows.forEach((row) => {