From 329647aaa6e6ad7d45a5ecf48d8abfaaefb2a8c5 Mon Sep 17 00:00:00 2001 From: Theo Date: Thu, 19 Feb 2026 23:04:25 +0000 Subject: [PATCH] feat(preprocessing): preserve attached files through nextclade pipeline Add file preservation to the nextclade preprocessing pipeline so that files attached to submissions are carried through to the processed output. This enables the nextclade pipeline to replace the dummy pipeline for organisms that use file uploads. Changes: - datatypes.py: Add `files` field to `UnprocessedData` - backend.py: Extract files from backend JSON response - prepro.py: Pass files through to `ProcessedData` in no-alignment path - values.yaml: Rename `dummy-organism-with-files` to `test-organism-files` with inlined metadata and nextclade preprocessing - Update file-sharing integration tests for the rename The dummy pipeline and organisms are intentionally kept for now and will be removed in a follow-up PR after validation. Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering) Co-Authored-By: Claude Co-Authored-By: Happy --- .../tests/pages/submission.page.ts | 2 +- .../tests/specs/features/file-sharing.spec.ts | 6 ++-- kubernetes/loculus/values.yaml | 34 +++++++++++++++---- .../src/loculus_preprocessing/backend.py | 1 + .../src/loculus_preprocessing/datatypes.py | 1 + .../src/loculus_preprocessing/prepro.py | 9 +++++ 6 files changed, 42 insertions(+), 11 deletions(-) diff --git a/integration-tests/tests/pages/submission.page.ts b/integration-tests/tests/pages/submission.page.ts index c1b7a9d6e0..7bfdc58e93 100644 --- a/integration-tests/tests/pages/submission.page.ts +++ b/integration-tests/tests/pages/submission.page.ts @@ -96,7 +96,7 @@ export class SingleSequenceSubmissionPage extends SubmissionPage { await this.page.getByLabel('Author affiliations').fill(authorAffiliations); } - async fillSubmissionFormDummyOrganism({ + async fillSubmissionFormFilesOrganism({ submissionId, country, date, diff --git a/integration-tests/tests/specs/features/file-sharing.spec.ts b/integration-tests/tests/specs/features/file-sharing.spec.ts index 51364e6dea..95dfc55a3a 100644 --- a/integration-tests/tests/specs/features/file-sharing.spec.ts +++ b/integration-tests/tests/specs/features/file-sharing.spec.ts @@ -7,7 +7,7 @@ import { SearchPage } from '../../pages/search.page'; import { BulkSubmissionPage, SingleSequenceSubmissionPage } from '../../pages/submission.page'; const ORGANISM_NAME = 'Test organism (with files)'; -const ORGANISM_URL_NAME = 'dummy-organism-with-files'; +const ORGANISM_URL_NAME = 'test-organism-files'; const RAW_READS = 'raw_reads'; const METADATA_HEADERS = ['submissionId', 'country', 'date']; const COUNTRY_1 = 'Norway'; @@ -26,7 +26,7 @@ test('submit single seq w/ 2 files thru single seq submission form', async ({ void groupId; const submissionPage = new SingleSequenceSubmissionPage(page); await submissionPage.navigateToSubmissionPage(ORGANISM_NAME); - await submissionPage.fillSubmissionFormDummyOrganism({ + await submissionPage.fillSubmissionFormFilesOrganism({ submissionId: ID_1, country: COUNTRY_1, date: '2023-10-15', @@ -175,7 +175,7 @@ test('single revise seq with files via edit page', async ({ page, groupId, tmpDi // Step 1: Submit and release a sequence const submissionPage = new SingleSequenceSubmissionPage(page); await submissionPage.navigateToSubmissionPage(ORGANISM_NAME); - await submissionPage.fillSubmissionFormDummyOrganism({ + await submissionPage.fillSubmissionFormFilesOrganism({ submissionId: 'single-rev', country: COUNTRY_1, date: '2023-01-01', diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index e931f6054c..8ab4ff6f49 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -1645,7 +1645,7 @@ defaultOrganisms: sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/sars-cov-2/ORF9b.fasta]]" - name: S sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/sars-cov-2/S.fasta]]" - dummy-organism-with-files: + test-organism-files: schema: image: "https://cdn.who.int/media/images/default-source/mca/mca-covid-19/coronavirus-2.tmb-1920v.jpg?sfvrsn=4dba955c_19" organismName: "Test organism (with files)" @@ -1657,21 +1657,41 @@ defaultOrganisms: - name: raw_reads files: - name: raw_reads - metadata: *dummyMetadata + metadata: + - name: date + type: date + initiallyVisible: true + header: "Collection Details" + - name: country + initiallyVisible: true + type: string + generateIndex: true + autocomplete: true + header: "Collection Details" website: tableColumns: - country - - division - date defaultOrder: descending defaultOrderBy: date preprocessing: - version: 1 - image: ghcr.io/loculus-project/preprocessing-dummy + image: ghcr.io/loculus-project/preprocessing-nextclade args: - - "--watch" - - "--disableConsensusSequences" - referenceGenomes: [] + - "prepro" + configFile: + log_level: DEBUG + batch_size: 100 + segments: + - name: main + references: + - name: singleReference + genes: [] + referenceGenomes: + - name: main + references: + - name: singleReference + sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/sars-cov-2/reference.fasta]]" not-aligned-organism: enabled: true schema: diff --git a/preprocessing/nextclade/src/loculus_preprocessing/backend.py b/preprocessing/nextclade/src/loculus_preprocessing/backend.py index cdc6fe3095..711af286a9 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/backend.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/backend.py @@ -101,6 +101,7 @@ def parse_ndjson(ndjson_data: str) -> Sequence[UnprocessedEntry]: unalignedNucleotideSequences=trimmed_unaligned_nucleotide_sequences if unaligned_nucleotide_sequences else {}, + files=json_object["data"].get("files"), ) entry = UnprocessedEntry( accessionVersion=f"{json_object['accession']}.{json_object['version']}", diff --git a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py index 692aac2304..1da3175a85 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py @@ -79,6 +79,7 @@ class UnprocessedData: submittedAt: str # timestamp # noqa: N815 metadata: InputMetadata unalignedNucleotideSequences: dict[SequenceName, NucleotideSequence | None] # noqa: N815 + files: dict[str, list[dict[str, str]]] | None = None # External files attached to submission @dataclass diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 8ccfce847c..ef4f2eae0d 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -268,6 +268,14 @@ def processed_entry_no_alignment( # noqa: PLR0913, PLR0917 nucleotide_insertions: dict[SequenceName, list[NucleotideInsertion]] = {} amino_acid_insertions: dict[GeneName, list[AminoAcidInsertion]] = {} + # Convert files format from backend (list of dicts) to FileIdAndName objects + files_output = None + if unprocessed.files: + files_output = { + category: [FileIdAndName(fileId=f["fileId"], name=f["name"]) for f in file_list] + for category, file_list in unprocessed.files.items() + } + return SubmissionData( processed_entry=ProcessedEntry( accession=accession_from_str(accession_version), @@ -280,6 +288,7 @@ def processed_entry_no_alignment( # noqa: PLR0913, PLR0917 alignedAminoAcidSequences=aligned_aminoacid_sequences, aminoAcidInsertions=amino_acid_insertions, sequenceNameToFastaId=sequenceNameToFastaId, + files=files_output, ), errors=errors, warnings=warnings,