Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
b68036b
feat(prepro): refactor - more nextclade functions to their own file, …
anna-parker Nov 21, 2025
de5d87a
format
anna-parker Nov 21, 2025
d714206
make order of functions a bit cleaner
anna-parker Nov 21, 2025
db531bd
make diff cleaner
anna-parker Nov 21, 2025
2d482db
format
anna-parker Nov 24, 2025
892a226
pass less through function
corneliusroemer Nov 24, 2025
8ae8738
feat: update edit page
anna-parker Nov 6, 2025
ee34d48
feat: clean up edit page PR a bit (#5396)
anna-parker Nov 10, 2025
03e383c
feat: add some suggestions
anna-parker Nov 19, 2025
ea37faf
feat!(backend): refactor multi-segment submission (2/n) (#5398)
anna-parker Nov 20, 2025
8d566bf
fix backend
anna-parker Nov 20, 2025
451b944
create a separate interface for files with a fasta header (#5495)
fengelniederhammer Nov 21, 2025
7ac390f
silly formatting error
anna-parker Nov 21, 2025
2c34e84
use white space as a separator everywhere (#5501)
anna-parker Nov 21, 2025
c27199b
feat: add validation of map, update integration tests and add prepro …
anna-parker Nov 21, 2025
ad97da8
feat: rename sequenceNameToFastaHeaderMap to sequenceNameToFastaId
anna-parker Nov 21, 2025
dd7e304
make fasta id separator a constant (#5507)
anna-parker Nov 21, 2025
50645b4
wupps
anna-parker Nov 24, 2025
d1f900e
fix integration tests
anna-parker Nov 24, 2025
190bbc0
ingest from higher level EV taxon
anna-parker Nov 25, 2025
20731aa
merge conflict
anna-parker Nov 25, 2025
0139905
feat(prepro): batch segment assignment
anna-parker Nov 25, 2025
ae18b3b
Assign genotype in prepro (#5551)
anna-parker Nov 27, 2025
470b78b
refactorings of prepro multi-segment (#5552)
corneliusroemer Nov 27, 2025
6d14d8a
Separate test datasets, don't copy in tests
corneliusroemer Nov 27, 2025
cf9d51f
Integration test fix (#5560)
anna-parker Nov 27, 2025
9cd1d30
feat(prepro, config): use new EV datasets and fix gene alignment for …
anna-parker Nov 28, 2025
0f57907
merge conflict
anna-parker Nov 29, 2025
a6edb8e
feat(docs, website): multi path - update submission docs and template…
anna-parker Dec 1, 2025
3b78a30
feat(prepro): deduplicate segment assignment code and add tests (#5554)
anna-parker Dec 1, 2025
1eb679b
Apply suggestions from code review
anna-parker Dec 1, 2025
8f649d8
weird merge conflict
anna-parker Dec 1, 2025
3ce662c
Add maxSequencesPerEntry option and validate no duplicate fastaIds (#…
corneliusroemer Dec 2, 2025
9889a83
feat(config): update values.schema.json for multi path (#5582)
anna-parker Dec 2, 2025
475c841
fix(prepro): nextclade dataset tag conversion error (#5614)
anna-parker Dec 2, 2025
18b8b67
work on integration test comments in #5382 (#5617)
fengelniederhammer Dec 3, 2025
21a9df8
refactor(backend): rename addFastaId to requireConsensusSequence
anna-parker Dec 3, 2025
a27144e
refactor: use minimizer_url instead of minimizer_index in both ingest…
anna-parker Dec 3, 2025
6d2bb1e
add back genes missing from readme
anna-parker Dec 3, 2025
3e84bb8
merge conflict: update isMultiSegmentedOrganism to use getReferenceGe…
anna-parker Dec 3, 2025
2213cec
format
anna-parker Dec 3, 2025
35ff38f
format
anna-parker Dec 4, 2025
9a7c376
merge conflict
anna-parker Dec 4, 2025
3c77286
feat(website): edit page - do not include fasta description in metada…
anna-parker Dec 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions backend/AGENTS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
Kotlin dependent packages have already been installed for you.

To run tests:
Run tests like this (if you have Docker set up properly):

./gradlew test --console=plain

If that doesn't work due to Docker issues because you're running inside a cloud environment, try this:

USE_NONDOCKER_INFRA=true ./gradlew test --console=plain

Expand All @@ -15,4 +19,4 @@ Always ensure the tests and lint pass before committing.


Use conventional commits as titles for PRs, e.g. feat(deployment):xx, fix!(website):xx, chore(backend):xx.
Components include: website, backend, deployment, preprocessing, ingest, deposition.
Components include: website, backend, deployment, preprocessing, ingest, deposition.
10 changes: 5 additions & 5 deletions backend/docs/db/schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,8 @@ CREATE TABLE public.metadata_upload_aux_table (
group_id integer,
uploaded_at timestamp without time zone NOT NULL,
metadata jsonb NOT NULL,
files jsonb
files jsonb,
fasta_ids text[]
);


Expand Down Expand Up @@ -538,9 +539,8 @@ ALTER VIEW public.sequence_entries_view OWNER TO postgres;

CREATE TABLE public.sequence_upload_aux_table (
upload_id text NOT NULL,
submission_id text NOT NULL,
segment_name text NOT NULL,
compressed_sequence_data text NOT NULL
compressed_sequence_data text NOT NULL,
fasta_id text NOT NULL
);


Expand Down Expand Up @@ -753,7 +753,7 @@ ALTER TABLE ONLY public.sequence_entries_preprocessed_data
--

ALTER TABLE ONLY public.sequence_upload_aux_table
ADD CONSTRAINT sequence_upload_aux_table_pkey PRIMARY KEY (upload_id, submission_id, segment_name);
ADD CONSTRAINT sequence_upload_aux_table_pkey PRIMARY KEY (upload_id, fasta_id);


--
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import com.fasterxml.jackson.databind.JsonDeserializer
import com.fasterxml.jackson.databind.JsonNode
import com.fasterxml.jackson.databind.annotation.JsonDeserialize
import io.swagger.v3.oas.annotations.media.Schema
import org.loculus.backend.model.FastaId
import org.loculus.backend.model.SubmissionId
import org.loculus.backend.service.files.FileId
import org.loculus.backend.utils.Accession
Expand Down Expand Up @@ -166,6 +167,11 @@ data class ProcessedData<SequenceType>(
description = "The key is the gene name, the value is a list of amino acid insertions",
)
val aminoAcidInsertions: Map<GeneName, List<Insertion>>,
@Schema(
example = """{"segment1": "fastaHeader1", "segment2": "fastaHeader2"}""",
description = "The key is the segment name, the value is the fastaHeader of the original Data",
)
val sequenceNameToFastaId: Map<SegmentName, String> = emptyMap(),
@Schema(
example = """{"raw_reads": [{"fileId": "s0m3-uUiDd", "name": "data.fastaq"}], "sequencing_logs": []}""",
description = "The key is the file category name, the value is a list of files, with ID and name.",
Expand Down Expand Up @@ -300,9 +306,9 @@ data class OriginalDataInternal<SequenceType, FilesType>(
val metadata: Map<String, String>,
@Schema(
example = "{\"segment1\": \"ACTG\", \"segment2\": \"GTCA\"}",
description = "The key is the segment name, the value is the nucleotide sequence",
description = "The key is the fastaID, the value is the nucleotide sequence",
)
val unalignedNucleotideSequences: Map<SegmentName, SequenceType?>,
val unalignedNucleotideSequences: Map<FastaId, SequenceType?>,
@Schema(
example = """{"raw_reads": [{"fileId": "f1le-uuId-asdf", "name": "myfile.fastaq"]}""",
description = "A map from file categories, to lists of files. The files can also have URLs.",
Expand Down
4 changes: 3 additions & 1 deletion backend/src/main/kotlin/org/loculus/backend/config/Config.kt
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,13 @@ data class Schema(
val externalMetadata: List<ExternalMetadata> = emptyList(),
val earliestReleaseDate: EarliestReleaseDate = EarliestReleaseDate(false, emptyList()),
val submissionDataTypes: SubmissionDataTypes = SubmissionDataTypes(),
val files: List<FileCategory> = emptyList(),
val files: List<FileCategory> = emptyList(), // Allowed file categories for output files
)

/**
 * Per-organism settings describing what a submission may contain.
 *
 * @property consensusSequences defaults to `true`.
 *   NOTE(review): presumably controls whether a sequence file is required for the organism — confirm
 *   against `requiresConsensusSequenceFile`.
 * @property maxSequencesPerEntry upper bound on the number of sequences per entry; `null` means unlimited.
 * @property files file-submission settings; the default is `FilesSubmissionDataType(false, emptyList())`.
 *   NOTE(review): the two positional arguments look like "enabled" and "categories" — confirm against
 *   the `FilesSubmissionDataType` declaration.
 */
data class SubmissionDataTypes(
val consensusSequences: Boolean = true,
val maxSequencesPerEntry: Int? = null, // null means unlimited sequences per entry
// Allowed file categories for submission files
val files: FilesSubmissionDataType = FilesSubmissionDataType(false, emptyList()),
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ data class ReferenceGenome(val nucleotideSequences: List<ReferenceSequence>, val
?.sequence

/**
 * Shortens [sequence] to at most its first 10 characters.
 *
 * Sequences longer than 10 characters are cut to their first 10 characters and suffixed with "...";
 * sequences of 10 characters or fewer (including the empty string) are returned unchanged.
 */
private fun shortenSequence(sequence: String) = when {
    // A single guard is enough: a second branch with the same condition could never be reached.
    sequence.length > 10 -> sequence.take(10) + "..."
    else -> sequence
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
package org.loculus.backend.controller

import org.loculus.backend.model.HEADER_TO_CONNECT_METADATA_AND_SEQUENCES
import org.loculus.backend.model.FASTA_IDS_HEADER
import org.loculus.backend.model.FASTA_IDS_SEPARATOR
import org.loculus.backend.model.METADATA_ID_HEADER

// Human-readable description of the response returned for a submission.
// Each sentence appears exactly once and refers to the metadata id column through METADATA_ID_HEADER.
const val SUBMIT_RESPONSE_DESCRIPTION = """
Returns a list of accession, version and submissionId of the submitted sequence entries.
The submissionId is the (locally unique) '$METADATA_ID_HEADER' provided by the submitter in the metadata file.
The version will be 1 for every sequence.
The accession is the (globally unique) id that the system assigned to the sequence entry.
You can use this response to associate the user provided $METADATA_ID_HEADER with the system assigned accession.
"""

const val SUBMIT_ERROR_RESPONSE = """
Expand All @@ -18,16 +20,18 @@ const val METADATA_FILE_DESCRIPTION = """
A TSV (tab separated values) file containing the metadata of the submitted sequence entries.
The file may be compressed with zstd, xz, zip, gzip, lzma, bzip2 (with common extensions).
It must contain the column names.
The field '$HEADER_TO_CONNECT_METADATA_AND_SEQUENCES' is required and must be unique within the provided dataset.
The field '$METADATA_ID_HEADER' is required and must be unique within the provided dataset.
It is used to associate metadata to the sequences in the sequences fasta file.
"""

// Human-readable description of the sequence (fasta) file expected with a submission.
// Single-segment organisms: fasta headers must equal the metadata id column.
// Multi-segment organisms: fasta headers are listed in the FASTA_IDS_HEADER metadata column,
// separated by FASTA_IDS_SEPARATOR.
const val SEQUENCE_FILE_DESCRIPTION = """
A fasta file containing the unaligned nucleotide sequences of the submitted sequences.
The file may be compressed with zstd, xz, zip, gzip, lzma, bzip2 (with common extensions).
If the underlying organism has a single segment,
the headers of the fasta file must match the '$METADATA_ID_HEADER' field in the metadata file.
If the underlying organism has multiple segments,
the headers of the fasta file must be added in a '$FASTA_IDS_SEPARATOR'-separated list to the '$FASTA_IDS_HEADER'
field in the metadata file.
"""

const val FILE_MAPPING_DESCRIPTION = """
Expand Down Expand Up @@ -114,7 +118,7 @@ The version will increase by one in respect to the original accession version.

// Human-readable description of the metadata file expected when revising existing entries.
// In addition to the metadata id column it requires an 'accession' column.
const val REVISED_METADATA_FILE_DESCRIPTION = """
A TSV (tab separated values) file containing the metadata of the revised data.
The first row must contain the column names. The column '$METADATA_ID_HEADER' is required and must be unique within the
provided dataset. It is used to associate metadata to the sequences in the sequences fasta file.
Additionally, the column 'accession' is required and must match the accession of the original sequence entry.
"""
Expand Down
81 changes: 47 additions & 34 deletions backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,7 @@ import org.loculus.backend.config.BackendConfig
import org.loculus.backend.controller.BadRequestException
import org.loculus.backend.controller.DuplicateKeyException
import org.loculus.backend.controller.UnprocessableEntityException
import org.loculus.backend.service.datauseterms.DataUseTermsPreconditionValidator
import org.loculus.backend.service.files.FilesDatabaseService
import org.loculus.backend.service.files.S3Service
import org.loculus.backend.service.groupmanagement.GroupManagementPreconditionValidator
import org.loculus.backend.service.submission.CompressionAlgorithm
import org.loculus.backend.service.submission.SubmissionIdFilesMappingPreconditionValidator
import org.loculus.backend.service.submission.UploadDatabaseService
Expand All @@ -31,13 +28,16 @@ import java.io.BufferedInputStream
import java.io.File
import java.io.InputStream

// Older names for the metadata id column headers. They hold the same values as
// METADATA_ID_HEADER and METADATA_ID_HEADER_ALTERNATE_FOR_BACKCOMPAT below.
const val HEADER_TO_CONNECT_METADATA_AND_SEQUENCES = "id"
const val HEADER_TO_CONNECT_METADATA_AND_SEQUENCES_ALTERNATE_FOR_BACKCOMPAT = "submissionId"
// Metadata column holding the submitter-provided id of an entry.
const val METADATA_ID_HEADER = "id"
// Alternate header for the same column.
// NOTE(review): presumably still accepted for backward compatibility — confirm against the metadata reader.
const val METADATA_ID_HEADER_ALTERNATE_FOR_BACKCOMPAT = "submissionId"
// Metadata column listing the FASTA ids that belong to an entry.
const val FASTA_IDS_HEADER = "fastaIds"
// Separator between the individual FASTA ids inside the FASTA_IDS_HEADER column (a single space).
const val FASTA_IDS_SEPARATOR = " "

// NOTE(review): presumably the metadata column carrying the accession of an existing entry in
// revision uploads — confirm against the revision metadata reader.
const val ACCESSION_HEADER = "accession"
// File-level logger.
private val log = KotlinLogging.logger { }

// Plain String aliases: they document intent at use sites but add no compile-time distinction.
typealias SubmissionId = String
typealias FastaId = String
typealias SegmentName = String

// SQLSTATE code for a unique-constraint violation (PostgreSQL `unique_violation`).
const val UNIQUE_CONSTRAINT_VIOLATION_SQL_STATE = "23505"
Expand Down Expand Up @@ -85,7 +85,6 @@ class SubmitModel(
private val submissionIdFilesMappingPreconditionValidator: SubmissionIdFilesMappingPreconditionValidator,
private val dateProvider: DateProvider,
private val backendConfig: BackendConfig,
private val s3Service: S3Service,
) {

companion object AcceptedFileTypes {
Expand Down Expand Up @@ -126,8 +125,13 @@ class SubmitModel(
val metadataSubmissionIds = uploadDatabaseService.getMetadataUploadSubmissionIds(uploadId).toSet()
if (requiresConsensusSequenceFile(submissionParams.organism)) {
log.debug { "Validating submission with uploadId $uploadId" }
val sequenceSubmissionIds = uploadDatabaseService.getSequenceUploadSubmissionIds(uploadId).toSet()
validateSubmissionIdSetsForConsensusSequences(metadataSubmissionIds, sequenceSubmissionIds)
val metadataFastaIds = uploadDatabaseService.getFastaIdsForMetadata(uploadId).flatten()
val metadataFastaIdsSet = metadataFastaIds.toSet()
if (metadataFastaIdsSet.size < metadataFastaIds.size) {
throw UnprocessableEntityException("Metadata file contains duplicate fastaIds.")
}
val sequenceFastaIds = uploadDatabaseService.getSequenceUploadSubmissionIds(uploadId).toSet()
validateSubmissionIdSetsForConsensusSequences(metadataFastaIdsSet, sequenceFastaIds)
}

if (submissionParams is SubmissionParams.RevisionSubmissionParams) {
Expand Down Expand Up @@ -167,6 +171,7 @@ class SubmitModel(
metadataFileTypes,
metadataTempFileToDelete,
)
val requireConsensusSequence = requiresConsensusSequenceFile(submissionParams.organism)
try {
uploadMetadata(uploadId, submissionParams, metadataStream, batchSize)
} finally {
Expand All @@ -175,30 +180,30 @@ class SubmitModel(

val sequenceFile = submissionParams.sequenceFile
if (sequenceFile == null) {
if (requiresConsensusSequenceFile(submissionParams.organism)) {
if (requireConsensusSequence) {
throw BadRequestException(
"Submissions for organism ${submissionParams.organism.name} require a sequence file.",
)
}
} else {
if (!requiresConsensusSequenceFile(submissionParams.organism)) {
throw BadRequestException(
"Sequence uploads are not allowed for organism ${submissionParams.organism.name}.",
)
}
return
}
if (!requireConsensusSequence) {
throw BadRequestException(
"Sequence uploads are not allowed for organism ${submissionParams.organism.name}.",
)
}

val sequenceTempFileToDelete = MaybeFile()
try {
val sequenceStream = getStreamFromFile(
sequenceFile,
uploadId,
sequenceFileTypes,
sequenceTempFileToDelete,
)
uploadSequences(uploadId, sequenceStream, batchSize, submissionParams.organism)
} finally {
sequenceTempFileToDelete.delete()
}
val sequenceTempFileToDelete = MaybeFile()
try {
val sequenceStream = getStreamFromFile(
sequenceFile,
uploadId,
sequenceFileTypes,
sequenceTempFileToDelete,
)
uploadSequences(uploadId, sequenceStream, batchSize, submissionParams.organism)
} finally {
sequenceTempFileToDelete.delete()
}
}

Expand Down Expand Up @@ -250,10 +255,15 @@ class SubmitModel(
"from $submissionParams.submitter with UploadId $uploadId"
}
val now = dateProvider.getCurrentDateTime()
val maxSequencesPerEntry = backendConfig.getInstanceConfig(submissionParams.organism)
.schema
.submissionDataTypes
.maxSequencesPerEntry

try {
when (submissionParams) {
is SubmissionParams.OriginalSubmissionParams -> {
metadataEntryStreamAsSequence(metadataStream)
metadataEntryStreamAsSequence(metadataStream, maxSequencesPerEntry)
.chunked(batchSize)
.forEach { batch ->
uploadDatabaseService.batchInsertMetadataInAuxTable(
Expand All @@ -269,7 +279,7 @@ class SubmitModel(
}

is SubmissionParams.RevisionSubmissionParams -> {
revisionEntryStreamAsSequence(metadataStream)
revisionEntryStreamAsSequence(metadataStream, maxSequencesPerEntry)
.chunked(batchSize)
.forEach { batch ->
uploadDatabaseService.batchInsertRevisedMetadataInAuxTable(
Expand Down Expand Up @@ -344,14 +354,17 @@ class SubmitModel(

if (metadataKeysNotInSequences.isNotEmpty() || sequenceKeysNotInMetadata.isNotEmpty()) {
val metadataNotPresentErrorText = if (metadataKeysNotInSequences.isNotEmpty()) {
"Metadata file contains ${metadataKeysNotInSequences.size} ids that are not present " +
"in the sequence file: " + metadataKeysNotInSequences.toList().joinToString(limit = 10) + "; "
"Metadata file contains ${metadataKeysNotInSequences.size} FASTA ids that are not present " +
"in the sequence file: " + metadataKeysNotInSequences.toList().joinToString(limit = 10) {
"'$it'"
} + ". "
} else {
""
}
val sequenceNotPresentErrorText = if (sequenceKeysNotInMetadata.isNotEmpty()) {
"Sequence file contains ${sequenceKeysNotInMetadata.size} ids that are not present " +
"in the metadata file: " + sequenceKeysNotInMetadata.toList().joinToString(limit = 10)
"Sequence file contains ${sequenceKeysNotInMetadata.size} FASTA ids that are not present " +
"in the metadata file: " +
sequenceKeysNotInMetadata.toList().joinToString(limit = 10) { "'$it'" }
} else {
""
}
Expand All @@ -364,7 +377,7 @@ class SubmitModel(
if (filesKeysNotInMetadata.isNotEmpty()) {
throw UnprocessableEntityException(
"File upload contains ${filesKeysNotInMetadata.size} submissionIds that are not present in the " +
"metadata file: " + filesKeysNotInMetadata.toList().joinToString(limit = 10),
"metadata file: " + filesKeysNotInMetadata.toList().joinToString(limit = 10) { "'$it'" },
)
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ class CompressionService(private val compressionDictService: CompressionDictServ
}
},
processedData.aminoAcidInsertions,
processedData.sequenceNameToFastaId,
processedData.files,
)

Expand All @@ -128,6 +129,7 @@ class CompressionService(private val compressionDictService: CompressionDictServ
}
},
processedData.aminoAcidInsertions,
processedData.sequenceNameToFastaId,
processedData.files,
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class EmptyProcessedDataProvider(private val backendConfig: BackendConfig) {
alignedAminoAcidSequences = referenceGenome.genes.map { it.name }.associateWith { null },
nucleotideInsertions = referenceGenome.nucleotideSequences.map { it.name }.associateWith { emptyList() },
aminoAcidInsertions = referenceGenome.genes.map { it.name }.associateWith { emptyList() },
sequenceNameToFastaId = referenceGenome.nucleotideSequences.map { it.name }.associateWith { "" },
files = null,
)
}
Expand Down
Loading
Loading