From 329647aaa6e6ad7d45a5ecf48d8abfaaefb2a8c5 Mon Sep 17 00:00:00 2001
From: Theo <theo@theo.io>
Date: Thu, 19 Feb 2026 23:04:25 +0000
Subject: [PATCH] feat(preprocessing): preserve attached files through
 nextclade pipeline

Add file preservation to the nextclade preprocessing pipeline so that
files attached to submissions are carried through to the processed output.
This enables the nextclade pipeline to replace the dummy pipeline for
organisms that use file uploads.

Changes:
- datatypes.py: Add `files` field to `UnprocessedData`
- backend.py: Extract files from backend JSON response
- prepro.py: Pass files through to `ProcessedData` in no-alignment path
- values.yaml: Rename `dummy-organism-with-files` to `test-organism-files`
  with inlined metadata (`date`, `country`), nextclade preprocessing and a
  `main` reference genome; drop `division` from the table columns
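- Update file-sharing integration tests for the rename, including renaming
  the `fillSubmissionFormDummyOrganism` page helper to
  `fillSubmissionFormFilesOrganism`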

The dummy pipeline and organisms are intentionally kept for now and will
be removed in a follow-up PR after validation.

Generated with [Claude Code](https://claude.ai/code)
via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
---
 .../tests/pages/submission.page.ts            |  2 +-
 .../tests/specs/features/file-sharing.spec.ts |  6 ++--
 kubernetes/loculus/values.yaml                | 34 +++++++++++++++----
 .../src/loculus_preprocessing/backend.py      |  1 +
 .../src/loculus_preprocessing/datatypes.py    |  1 +
 .../src/loculus_preprocessing/prepro.py       |  9 +++++
 6 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/integration-tests/tests/pages/submission.page.ts b/integration-tests/tests/pages/submission.page.ts
index c1b7a9d6e0..7bfdc58e93 100644
--- a/integration-tests/tests/pages/submission.page.ts
+++ b/integration-tests/tests/pages/submission.page.ts
@@ -96,7 +96,7 @@ export class SingleSequenceSubmissionPage extends SubmissionPage {
         await this.page.getByLabel('Author affiliations').fill(authorAffiliations);
     }
 
-    async fillSubmissionFormDummyOrganism({
+    async fillSubmissionFormFilesOrganism({
         submissionId,
         country,
         date,
diff --git a/integration-tests/tests/specs/features/file-sharing.spec.ts b/integration-tests/tests/specs/features/file-sharing.spec.ts
index 51364e6dea..95dfc55a3a 100644
--- a/integration-tests/tests/specs/features/file-sharing.spec.ts
+++ b/integration-tests/tests/specs/features/file-sharing.spec.ts
@@ -7,7 +7,7 @@ import { SearchPage } from '../../pages/search.page';
 import { BulkSubmissionPage, SingleSequenceSubmissionPage } from '../../pages/submission.page';
 
 const ORGANISM_NAME = 'Test organism (with files)';
-const ORGANISM_URL_NAME = 'dummy-organism-with-files';
+const ORGANISM_URL_NAME = 'test-organism-files';
 const RAW_READS = 'raw_reads';
 const METADATA_HEADERS = ['submissionId', 'country', 'date'];
 const COUNTRY_1 = 'Norway';
@@ -26,7 +26,7 @@ test('submit single seq w/ 2 files thru single seq submission form', async ({
     void groupId;
     const submissionPage = new SingleSequenceSubmissionPage(page);
     await submissionPage.navigateToSubmissionPage(ORGANISM_NAME);
-    await submissionPage.fillSubmissionFormDummyOrganism({
+    await submissionPage.fillSubmissionFormFilesOrganism({
         submissionId: ID_1,
         country: COUNTRY_1,
         date: '2023-10-15',
@@ -175,7 +175,7 @@ test('single revise seq with files via edit page', async ({ page, groupId, tmpDi
     // Step 1: Submit and release a sequence
     const submissionPage = new SingleSequenceSubmissionPage(page);
     await submissionPage.navigateToSubmissionPage(ORGANISM_NAME);
-    await submissionPage.fillSubmissionFormDummyOrganism({
+    await submissionPage.fillSubmissionFormFilesOrganism({
         submissionId: 'single-rev',
         country: COUNTRY_1,
         date: '2023-01-01',
diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml
index e931f6054c..8ab4ff6f49 100644
--- a/kubernetes/loculus/values.yaml
+++ b/kubernetes/loculus/values.yaml
@@ -1645,7 +1645,7 @@ defaultOrganisms:
                 sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/sars-cov-2/ORF9b.fasta]]"
               - name: S
                 sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/sars-cov-2/S.fasta]]"
-  dummy-organism-with-files:
+  test-organism-files:
     schema:
       image: "https://cdn.who.int/media/images/default-source/mca/mca-covid-19/coronavirus-2.tmb-1920v.jpg?sfvrsn=4dba955c_19"
       organismName: "Test organism (with files)"
@@ -1657,21 +1657,41 @@ defaultOrganisms:
             - name: raw_reads
       files:
         - name: raw_reads
-      metadata: *dummyMetadata
+      metadata:
+        - name: date
+          type: date
+          initiallyVisible: true
+          header: "Collection Details"
+        - name: country
+          initiallyVisible: true
+          type: string
+          generateIndex: true
+          autocomplete: true
+          header: "Collection Details"
       website:
         tableColumns:
           - country
-          - division
           - date
         defaultOrder: descending
         defaultOrderBy: date
     preprocessing:
       - version: 1
-        image: ghcr.io/loculus-project/preprocessing-dummy
+        image: ghcr.io/loculus-project/preprocessing-nextclade
         args:
-          - "--watch"
-          - "--disableConsensusSequences"
-    referenceGenomes: []
+          - "prepro"
+        configFile:
+          log_level: DEBUG
+          batch_size: 100
+          segments:
+            - name: main
+              references:
+              - name: singleReference
+                genes: []
+    referenceGenomes:
+      - name: main
+        references:
+          - name: singleReference
+            sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/sars-cov-2/reference.fasta]]"
   not-aligned-organism:
     enabled: true
     schema:
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/backend.py b/preprocessing/nextclade/src/loculus_preprocessing/backend.py
index cdc6fe3095..711af286a9 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/backend.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/backend.py
@@ -101,6 +101,7 @@ def parse_ndjson(ndjson_data: str) -> Sequence[UnprocessedEntry]:
             unalignedNucleotideSequences=trimmed_unaligned_nucleotide_sequences
             if unaligned_nucleotide_sequences
             else {},
+            files=json_object["data"].get("files"),
         )
         entry = UnprocessedEntry(
             accessionVersion=f"{json_object['accession']}.{json_object['version']}",
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
index 692aac2304..1da3175a85 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
@@ -79,6 +79,7 @@ class UnprocessedData:
     submittedAt: str  # timestamp  # noqa: N815
     metadata: InputMetadata
     unalignedNucleotideSequences: dict[SequenceName, NucleotideSequence | None]  # noqa: N815
+    files: dict[str, list[dict[str, str]]] | None = None  # External files attached to submission
 
 
 @dataclass
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
index 8ccfce847c..ef4f2eae0d 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
@@ -268,6 +268,14 @@ def processed_entry_no_alignment(  # noqa: PLR0913, PLR0917
     nucleotide_insertions: dict[SequenceName, list[NucleotideInsertion]] = {}
     amino_acid_insertions: dict[GeneName, list[AminoAcidInsertion]] = {}
 
+    # Convert files format from backend (list of dicts) to FileIdAndName objects
+    files_output = None
+    if unprocessed.files:
+        files_output = {
+            category: [FileIdAndName(fileId=f["fileId"], name=f["name"]) for f in file_list]
+            for category, file_list in unprocessed.files.items()
+        }
+
     return SubmissionData(
         processed_entry=ProcessedEntry(
             accession=accession_from_str(accession_version),
@@ -280,6 +288,7 @@ def processed_entry_no_alignment(  # noqa: PLR0913, PLR0917
                 alignedAminoAcidSequences=aligned_aminoacid_sequences,
                 aminoAcidInsertions=amino_acid_insertions,
                 sequenceNameToFastaId=sequenceNameToFastaId,
+                files=files_output,
             ),
             errors=errors,
             warnings=warnings,