From 8665745261c2b3b14da4128bea4ada9ab9f7743a Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Fri, 20 Mar 2026 12:40:31 +0100 Subject: [PATCH 01/15] feat(prepro): add custom lineage functions --- .../processing_functions.py | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index aa8a4e70ab..3515859fda 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -1194,6 +1194,119 @@ def is_above_threshold( ) return ProcessingResult(datum=(input > threshold), warnings=[], errors=[]) + @staticmethod + def is_variant( + input_data: InputMetadata, output_field: str, input_fields: list[str], args: FunctionArgs + ) -> ProcessingResult: + """Flag if number of mutations is above mutation rate (specified in args) times length""" + if "mu" not in args: + return ProcessingResult( + datum=None, + warnings=[], + errors=[ + ProcessingAnnotation.from_fields( + input_fields, + [output_field], + AnnotationSourceType.METADATA, + message=( + f"Field {output_field} is missing mu argument." + " Please report this error to the administrator." + ), + ) + ], + ) + length_datum = input_data["length"] + num_mutations_datum = input_data["numMutations"] + if not length_datum or not num_mutations_datum: + return ProcessingResult(datum=None, warnings=[], errors=[]) + try: + mu = float(args["mu"]) # type: ignore + length = float(length_datum) + threshold = mu * length + is_above_threshold_result = ProcessingFunctions.is_above_threshold( + input_data={"input": num_mutations_datum}, + output_field=output_field, + input_fields=input_fields, + args={"threshold": threshold}, + ) + except (ValueError, TypeError): + return ProcessingResult( + datum=None, + warnings=[], + errors=[ + ProcessingAnnotation.from_fields( + input_fields, + [output_field], + AnnotationSourceType.METADATA, + message=(f"Field {output_field} has non-numeric threshold value."), + ) + ], + ) + return ProcessingResult( + datum=is_above_threshold_result.datum, + warnings=is_above_threshold_result.warnings, + errors=is_above_threshold_result.errors, + ) + + @staticmethod + def assign_custom_lineage( + input_data: InputMetadata, output_field: str, input_fields: list[str], args: FunctionArgs + ) -> ProcessingResult: + """Assign flu lineage based on seg4 and seg6""" + logger.debug( + f"Starting custom lineage assignment with input_data: {input_data} and args: {args}" + ) + if not input_data: + return ProcessingResult(datum=None, warnings=[], errors=[]) + ha_subtype = input_data.get("subtype_seg4") + na_subtype = input_data.get("subtype_seg6") + references: dict[str, str | None] = {} + variant: dict[str, bool | None] = {} + for i in range(1, 9): + segment = f"seg{i}" + reference_field = f"reference_seg{i}" + variant_field = f"variant_seg{i}" + if reference_field in input_data: + references[segment] = input_data.get(reference_field) + variant[segment] = ( + bool(input_data.get(variant_field)) if variant_field in input_data else None + ) + try: + if not ha_subtype or not na_subtype: + return ProcessingResult(datum=None, warnings=[], errors=[]) + lineage = f"{ha_subtype}{na_subtype}" + if references.get("seg4") == "h1n1pdm" and references.get("seg6") == "h1n1pdm": + lineage = "H1N1pdm" + logger.debug( + f"Determined preliminary lineage {lineage} based on segments seg4 and seg6" + ) + if lineage in {"H1N1", "H3N2", "H2N2", "H1N1pdm"}: + logger.debug( + f"Lineage {lineage} is a human lineage, checking for reassortment and variants" + ) + # only assign human lineages + if len(set(references.values())) > 1: + lineage += " reassortant" + if any(v for v in variant.values() if v): + lineage += " (variant)" + return ProcessingResult(datum=lineage, warnings=[], errors=[]) + except (ValueError, TypeError): + return ProcessingResult( + datum=None, + warnings=[], + errors=[ + ProcessingAnnotation.from_fields( + input_fields, + [output_field], + AnnotationSourceType.METADATA, + message=( + f"Internal error processing custom lineage for field {output_field}." + ), + ) + ], + ) + return ProcessingResult(datum=None, warnings=[], errors=[]) + @staticmethod def build_display_name( # noqa: C901 input_data: InputMetadata, From e8d9c02df42fac347c0f20f75e2c9b4c11ddf052 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Fri, 20 Mar 2026 14:39:32 +0100 Subject: [PATCH 02/15] test on preview --- kubernetes/loculus/values.yaml | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index b9fb75c7b7..ffe069b3d5 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -1379,21 +1379,6 @@ defaultOrganismConfig: &defaultOrganismConfig label: Length preprocessing: inputs: {input: nextclade.coverage} - - name: variant - isSequenceFilter: true - perSegment: true - header: "Alignment and QC metrics" - displayName: "Variant" - type: boolean - noInput: true - autocomplete: true - initiallyVisible: false - includeInDownloadsByDefault: false - preprocessing: - function: is_above_threshold - args: - threshold: 50 - inputs: {input: "nextclade.privateNucMutations.totalPrivateSubstitutions"} website: &website tableColumns: - sampleCollectionDate @@ -1548,6 +1533,21 @@ defaultOrganisms: includeInDownloadsByDefault: false preprocessing: inputs: {input: "nextclade.cladeFounderInfo.aaMutations.*.privateSubstitutions"} + - name: variant + isSequenceFilter: true + perSegment: true + header: "Alignment and QC metrics" + displayName: "Variant" + type: boolean + noInput: true + autocomplete: true + initiallyVisible: false + includeInDownloadsByDefault: false + preprocessing: + function: is_variant + args: + mu: 0.002 + inputs: {numMutations: "nextclade.privateNucMutations.totalPrivateSubstitutions", length: processed.length} website: <<: *website tableColumns: From 67227c68a9f566acb8cbca73a0e2a64cb29e805b Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Fri, 20 Mar 2026 15:09:53 +0100 Subject: [PATCH 03/15] add values for multi seg, multi ref --- kubernetes/loculus/values.yaml | 57 ++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index ffe069b3d5..f6264acec8 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -1897,6 +1897,63 @@ defaultOrganisms: lineageSystem: cchfS preprocessing: inputs: {input: nextclade.clade} + - name: variant_L + isSequenceFilter: true + header: "Clade & Lineage" + oneHeader: true + displayName: "Variant L" + type: boolean + noInput: true + autocomplete: true + initiallyVisible: false + includeInDownloadsByDefault: false + customDisplay: + type: variantReference + displayGroup: reference_L + label: Closest reference L + preprocessing: + function: is_variant + args: + mu: 0.004 + inputs: {input: "nextclade.totalSubstitutions"} + - name: variant_M + isSequenceFilter: true + header: "Clade & Lineage" + oneHeader: true + displayName: "Variant M" + type: boolean + noInput: true + autocomplete: true + initiallyVisible: false + includeInDownloadsByDefault: false + customDisplay: + type: variantReference + displayGroup: reference_M + label: Closest reference M + preprocessing: + function: is_variant + args: + mu: 0.008 + inputs: {input: "nextclade.totalSubstitutions"} + - name: variant_S + isSequenceFilter: true + header: "Clade & Lineage" + oneHeader: true + displayName: "Variant S" + type: boolean + noInput: true + autocomplete: true + initiallyVisible: false + includeInDownloadsByDefault: false + customDisplay: + type: variantReference + displayGroup: reference_S + label: Closest reference S + preprocessing: + function: is_variant + args: + mu: 0.004 + inputs: {input: "nextclade.totalSubstitutions"} website: <<: *website tableColumns: From 514f4858bf5e158f7934bb5546ad8c9efe25d580 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Fri, 20 Mar 2026 15:12:32 +0100 Subject: [PATCH 04/15] format --- kubernetes/loculus/values.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index f6264acec8..4e8a2e0afe 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -1934,7 +1934,7 @@ defaultOrganisms: function: is_variant args: mu: 0.008 - inputs: {input: "nextclade.totalSubstitutions"} + inputs: {input: "nextclade.totalSubstitutions"} - name: variant_S isSequenceFilter: true header: "Clade & Lineage" @@ -1953,7 +1953,7 @@ defaultOrganisms: function: is_variant args: mu: 0.004 - inputs: {input: "nextclade.totalSubstitutions"} + inputs: {input: "nextclade.totalSubstitutions"} website: <<: *website tableColumns: From f1faaf90434aae71aa1b7da5c51e99c869076c27 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Fri, 20 Mar 2026 15:34:24 +0100 Subject: [PATCH 05/15] improve --- kubernetes/loculus/values.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index 4e8a2e0afe..9612b6f526 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -1915,7 +1915,7 @@ defaultOrganisms: function: is_variant args: mu: 0.004 - inputs: {input: "nextclade.totalSubstitutions"} + inputs: {numMutations: "nextclade.totalSubstitutions", length: processed.length_L} - name: variant_M isSequenceFilter: true header: "Clade & Lineage" @@ -1934,7 +1934,7 @@ defaultOrganisms: function: is_variant args: mu: 0.008 - inputs: {input: "nextclade.totalSubstitutions"} + inputs: {numMutations: "nextclade.totalSubstitutions", length: processed.length_M} - name: variant_S isSequenceFilter: true header: "Clade & Lineage" @@ -1953,7 +1953,7 @@ defaultOrganisms: function: is_variant args: mu: 0.004 - inputs: {input: "nextclade.totalSubstitutions"} + inputs: {numMutations: "nextclade.totalSubstitutions", length: processed.length_S} website: <<: *website tableColumns: From e66fea611e16deeb6e93cf8e3569dcc2ae941af4 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Fri, 20 Mar 2026 15:43:48 +0100 Subject: [PATCH 06/15] wupps --- kubernetes/loculus/values.yaml | 110 ++++++++++++++------------------- 1 file changed, 45 insertions(+), 65 deletions(-) diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index 9612b6f526..3e531fb759 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -1897,63 +1897,6 @@ defaultOrganisms: lineageSystem: cchfS preprocessing: inputs: {input: nextclade.clade} - - name: variant_L - isSequenceFilter: true - header: "Clade & Lineage" - oneHeader: true - displayName: "Variant L" - type: boolean - noInput: true - autocomplete: true - initiallyVisible: false - includeInDownloadsByDefault: false - customDisplay: - type: variantReference - displayGroup: reference_L - label: Closest reference L - preprocessing: - function: is_variant - args: - mu: 0.004 - inputs: {numMutations: "nextclade.totalSubstitutions", length: processed.length_L} - - name: variant_M - isSequenceFilter: true - header: "Clade & Lineage" - oneHeader: true - displayName: "Variant M" - type: boolean - noInput: true - autocomplete: true - initiallyVisible: false - includeInDownloadsByDefault: false - customDisplay: - type: variantReference - displayGroup: reference_M - label: Closest reference M - preprocessing: - function: is_variant - args: - mu: 0.008 - inputs: {numMutations: "nextclade.totalSubstitutions", length: processed.length_M} - - name: variant_S - isSequenceFilter: true - header: "Clade & Lineage" - oneHeader: true - displayName: "Variant S" - type: boolean - noInput: true - autocomplete: true - initiallyVisible: false - includeInDownloadsByDefault: false - customDisplay: - type: variantReference - displayGroup: reference_S - label: Closest reference S - preprocessing: - function: is_variant - args: - mu: 0.004 - inputs: {numMutations: "nextclade.totalSubstitutions", length: processed.length_S} website: <<: *website tableColumns: @@ -2080,12 +2023,11 @@ defaultOrganisms: header: "Host" ingest: ncbiHostName initiallyVisible: true - - name: variant + - name: variant_L isSequenceFilter: true - perSegment: true header: "Clade & Lineage" oneHeader: true - displayName: "Variant" + displayName: "Variant L" type: boolean noInput: true autocomplete: true @@ -2093,13 +2035,51 @@ defaultOrganisms: includeInDownloadsByDefault: false customDisplay: type: variantReference - displayGroup: reference - label: Closest reference + displayGroup: reference_L + label: Closest reference L preprocessing: - function: is_above_threshold + function: is_variant args: - threshold: 1000 - inputs: {input: "nextclade.totalSubstitutions"} #custom nextclade dataset does not have private mutations, so using total substitutions as a proxy for distance from reference + mu: 0.004 + inputs: {numMutations: "nextclade.totalSubstitutions", length: processed.length_L} + - name: variant_M + isSequenceFilter: true + header: "Clade & Lineage" + oneHeader: true + displayName: "Variant M" + type: boolean + noInput: true + autocomplete: true + initiallyVisible: false + includeInDownloadsByDefault: false + customDisplay: + type: variantReference + displayGroup: reference_M + label: Closest reference M + preprocessing: + function: is_variant + args: + mu: 0.008 + inputs: {numMutations: "nextclade.totalSubstitutions", length: processed.length_M} + - name: variant_S + isSequenceFilter: true + header: "Clade & Lineage" + oneHeader: true + displayName: "Variant S" + type: boolean + noInput: true + autocomplete: true + initiallyVisible: false + includeInDownloadsByDefault: false + customDisplay: + type: variantReference + displayGroup: reference_S + label: Closest reference S + preprocessing: + function: is_variant + args: + mu: 0.004 + inputs: {numMutations: "nextclade.totalSubstitutions", length: processed.length_S} - name: reference oneHeader: true header: "Clade & Lineage" From b82af9821308ff5588bb4b3e3c3bc08a301b9e98 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Sat, 21 Mar 2026 20:50:20 +0100 Subject: [PATCH 07/15] require less processed fields --- .../processing_functions.py | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index 3515859fda..76267d5f67 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -1249,7 +1249,7 @@ def is_variant( ) @staticmethod - def assign_custom_lineage( + def assign_custom_lineage( # noqa: C901 input_data: InputMetadata, output_field: str, input_fields: list[str], args: FunctionArgs ) -> ProcessingResult: """Assign flu lineage based on seg4 and seg6""" @@ -1261,6 +1261,7 @@ def assign_custom_lineage( ha_subtype = input_data.get("subtype_seg4") na_subtype = input_data.get("subtype_seg6") references: dict[str, str | None] = {} + extracted_subtypes: dict[str, str | None] = {} variant: dict[str, bool | None] = {} for i in range(1, 9): segment = f"seg{i}" @@ -1272,10 +1273,26 @@ def assign_custom_lineage( bool(input_data.get(variant_field)) if variant_field in input_data else None ) try: + for i in range(1, 9): + segment = f"seg{i}" + extracted_subtypes[segment] = ProcessingFunctions.call_function( # type: ignore + "extract_regex", + { + "pattern": args["pattern"], + "uppercase": args["uppercase"], + "capture_group": args["capture_group"], + }, + {"regex_field": references.get(segment, "")}, + "output_field", + ["segment_name"], + ).datum if not ha_subtype or not na_subtype: return ProcessingResult(datum=None, warnings=[], errors=[]) lineage = f"{ha_subtype}{na_subtype}" - if references.get("seg4") == "h1n1pdm" and references.get("seg6") == "h1n1pdm": + if ( + extracted_subtypes.get("seg4") == "H1N1PDM" + and extracted_subtypes.get("seg6") == "H1N1PDM" + ): lineage = "H1N1pdm" logger.debug( f"Determined preliminary lineage {lineage} based on segments seg4 and seg6" @@ -1285,7 +1302,7 @@ def assign_custom_lineage( f"Lineage {lineage} is a human lineage, checking for reassortment and variants" ) # only assign human lineages - if len(set(references.values())) > 1: + if len(set(extracted_subtypes.values())) > 1: lineage += " reassortant" if any(v for v in variant.values() if v): lineage += " (variant)" From 769b7894e93d77278cd80e2bd9f05e8568f15157 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Sat, 21 Mar 2026 22:12:17 +0100 Subject: [PATCH 08/15] add more logging --- .../nextclade/src/loculus_preprocessing/processing_functions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index 76267d5f67..223a4c22e5 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -1286,6 +1286,7 @@ def assign_custom_lineage( # noqa: C901 "output_field", ["segment_name"], ).datum + logger.debug(f"Extracted subtypes: {extracted_subtypes} from references: {references}") if not ha_subtype or not na_subtype: return ProcessingResult(datum=None, warnings=[], errors=[]) lineage = f"{ha_subtype}{na_subtype}" From e185ccda88516cdd4e825c6636c57bd8c0853ad4 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Sun, 22 Mar 2026 12:35:44 +0100 Subject: [PATCH 09/15] feat: add unit tests --- .../tests/test_assign_custom_lineage.py | 198 ++++++++++++++++++ .../test_metadata_processing_functions.py | 2 +- 2 files changed, 199 insertions(+), 1 deletion(-) create mode 100644 preprocessing/nextclade/tests/test_assign_custom_lineage.py diff --git a/preprocessing/nextclade/tests/test_assign_custom_lineage.py b/preprocessing/nextclade/tests/test_assign_custom_lineage.py new file mode 100644 index 0000000000..0c312a8d45 --- /dev/null +++ b/preprocessing/nextclade/tests/test_assign_custom_lineage.py @@ -0,0 +1,198 @@ +# ruff: noqa: S101 +"""Tests for ProcessingFunctions.assign_custom_lineage.""" + +import pytest + +from loculus_preprocessing.processing_functions import ProcessingFunctions + +ARGS = { + "capture_group": "info", + "pattern": "^(?:.*_)?(?P[^_]+)$", + "uppercase": True, + "is_insdc_ingest_group": True, + "submittedAt": 1774128802, + "ACCESSION_VERSION": "LOC_00020W3.1", +} + + +def make_flu_input( # noqa: PLR0913, PLR0917 + ha_subtype: str | None = "H1", + na_subtype: str | None = "N1", + seg4_ref: str = "h1_h1n1pdm", + seg6_ref: str = "n1_h1n1pdm", + other_ref: str = "h1n1pdm", + variants: dict[str, bool] | None = None, +) -> dict: + """Build a flat input_data dict as assign_custom_lineage expects.""" + data: dict = {} + for i in range(1, 9): + ref = seg4_ref if i == 4 else seg6_ref if i == 6 else other_ref + data[f"reference_seg{i}"] = ref + data[f"variant_seg{i}"] = (variants or {}).get(f"seg{i}", False) + if ha_subtype is not None: + data["subtype_seg4"] = ha_subtype + if na_subtype is not None: + data["subtype_seg6"] = na_subtype + return data + + +def call(input_data: dict) -> str | None: + return ProcessingFunctions.assign_custom_lineage( + input_data=input_data, + output_field="lineage", + input_fields=list(input_data.keys()), + args=ARGS, + ).datum + + +class TestH1N1pdm: + """H1N1pdm lineage: seg4 ref is h1_h1n1pdm, seg6 ref is n1_h1n1pdm.""" + + @staticmethod + def test_h1n1pdm_assigned_when_subtypes_match(): + """All 8 segments reference h1n1pdm lineage (with HA/NA prefixes on seg4/6).""" + input_data = make_flu_input( + ha_subtype="H1", + na_subtype="N1", + seg4_ref="h1_h1n1pdm", + seg6_ref="n1_h1n1pdm", + other_ref="h1n1pdm", + ) + assert call(input_data) == "H1N1pdm" + + @staticmethod + def test_h1n1pdm_with_variant_flag(): + input_data = make_flu_input( + ha_subtype="H1", + na_subtype="N1", + seg4_ref="h1_h1n1pdm", + seg6_ref="n1_h1n1pdm", + other_ref="h1n1pdm", + variants={"seg4": True}, + ) + assert call(input_data) == "H1N1pdm (variant)" + + @staticmethod + def test_h1n1pdm_reassortant_when_one_segment_differs(): + """If one internal segment has a different lineage, result is reassortant.""" + input_data = make_flu_input( + ha_subtype="H1", + na_subtype="N1", + seg4_ref="h1_h1n1pdm", + seg6_ref="n1_h1n1pdm", + other_ref="h1n1pdm", + ) + # Override seg2 to a different lineage + input_data["reference_seg2"] = "h3n2" + assert call(input_data) == "H1N1pdm reassortant" + + +class TestH1N1Seasonal: + """Seasonal H1N1: all references are plain h1n1 (no prefix).""" + + @staticmethod + def test_h1n1_seasonal_assigned(): + input_data = make_flu_input( + ha_subtype="H1", + na_subtype="N1", + seg4_ref="h1n1", + seg6_ref="h1n1", + other_ref="h1n1", + ) + assert call(input_data) == "H1N1" + + @staticmethod + def test_h1n1_seasonal_reassortant(): + input_data = make_flu_input( + ha_subtype="H1", + na_subtype="N1", + seg4_ref="h1n1", + seg6_ref="h1n1", + other_ref="h1n1", + ) + input_data["reference_seg3"] = "h3n2" + assert call(input_data) == "H1N1 reassortant" + + @staticmethod + def test_h1n1_seasonal_with_variant(): + input_data = make_flu_input( + ha_subtype="H1", + na_subtype="N1", + seg4_ref="h1n1", + seg6_ref="h1n1", + other_ref="h1n1", + variants={"seg1": True}, + ) + assert call(input_data) == "H1N1 (variant)" + + +class TestH3N2: + @staticmethod + def test_h3n2_assigned(): + input_data = make_flu_input( + ha_subtype="H3", + na_subtype="N2", + seg4_ref="h3_h3n2", + seg6_ref="n2_h3n2", + other_ref="h3n2", + ) + assert call(input_data) == "H3N2" + + @staticmethod + def test_h3n2_reassortant(): + input_data = make_flu_input( + ha_subtype="H3", + na_subtype="N2", + seg4_ref="h3_h3n2", + seg6_ref="n2_h3n2", + other_ref="h3n2", + ) + input_data["reference_seg1"] = "h1n1pdm" + assert call(input_data) == "H3N2 reassortant" + + +class TestNonHumanLineage: + """Non-human lineages (e.g. H5N1, H7N9) should return None.""" + + @staticmethod + def test_h5n1_returns_none(): + input_data = make_flu_input( + ha_subtype="H5", + na_subtype="N1", + seg4_ref="h5_h5n1", + seg6_ref="n1_h5n1", + other_ref="h5n1", + ) + assert call(input_data) is None + + @staticmethod + def test_h7n9_returns_none(): + input_data = make_flu_input( + ha_subtype="H7", + na_subtype="N9", + seg4_ref="h7_h7n9", + seg6_ref="n9_h7n9", + other_ref="h7n9", + ) + assert call(input_data) is None + + +class TestMissingData: + @staticmethod + def test_empty_input_returns_none(): + assert call({}) is None + + @staticmethod + def test_missing_ha_subtype_returns_none(): + input_data = make_flu_input(ha_subtype=None, na_subtype="N1") + assert call(input_data) is None + + @staticmethod + def test_missing_na_subtype_returns_none(): + input_data = make_flu_input(ha_subtype="H1", na_subtype=None) + assert call(input_data) is None + + @staticmethod + def test_both_subtypes_missing_returns_none(): + input_data = make_flu_input(ha_subtype=None, na_subtype=None) + assert call(input_data) is None diff --git a/preprocessing/nextclade/tests/test_metadata_processing_functions.py b/preprocessing/nextclade/tests/test_metadata_processing_functions.py index 534b3db067..a84659c7da 100644 --- a/preprocessing/nextclade/tests/test_metadata_processing_functions.py +++ b/preprocessing/nextclade/tests/test_metadata_processing_functions.py @@ -1033,7 +1033,7 @@ def test_concatenate() -> None: assert res_fallback_explicit_null.datum == "0/unknown/version.1/unknown" -def test_display_name_construction() -> None: +def test_display_name_construction() -> None: # noqa: PLR0915 submission_id = "mySample" submission_id_formatted = "hDENV1/Germany/myExtractedSample/2025" submission_id_formatted_unexpected = "hDENV1/myExtractedSample/2025" From bb470bdc7c5dd8aa951e7f7269e6fadcc99c7ab1 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Sun, 22 Mar 2026 12:37:54 +0100 Subject: [PATCH 10/15] improve docs --- .../src/loculus_preprocessing/processing_functions.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index 223a4c22e5..3d61fc720d 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -1252,7 +1252,11 @@ def is_variant( def assign_custom_lineage( # noqa: C901 input_data: InputMetadata, output_field: str, input_fields: list[str], args: FunctionArgs ) -> ProcessingResult: - """Assign flu lineage based on seg4 and seg6""" + """ + Assign flu lineage based on seg4 and seg6. + Add reassortant flag if subtypes from different lineages are detected for other segments, + add and variant flag if any segment is a variant. + """ logger.debug( f"Starting custom lineage assignment with input_data: {input_data} and args: {args}" ) From 18049ff6edcdc89b6c09ae10f3f39ba00865aa03 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Sun, 22 Mar 2026 12:40:24 +0100 Subject: [PATCH 11/15] add is_variant tests --- .../tests/test_assign_custom_lineage.py | 65 ++++++++++++++++++- 1 file changed, 62 insertions(+), 3 deletions(-) diff --git a/preprocessing/nextclade/tests/test_assign_custom_lineage.py b/preprocessing/nextclade/tests/test_assign_custom_lineage.py index 0c312a8d45..4d30271ea7 100644 --- a/preprocessing/nextclade/tests/test_assign_custom_lineage.py +++ b/preprocessing/nextclade/tests/test_assign_custom_lineage.py @@ -1,7 +1,5 @@ # ruff: noqa: S101 -"""Tests for ProcessingFunctions.assign_custom_lineage.""" - -import pytest +"""Tests for ProcessingFunctions.assign_custom_lineage and is_variant.""" from loculus_preprocessing.processing_functions import ProcessingFunctions @@ -196,3 +194,64 @@ def test_missing_na_subtype_returns_none(): def test_both_subtypes_missing_returns_none(): input_data = make_flu_input(ha_subtype=None, na_subtype=None) assert call(input_data) is None + + +def call_is_variant(length, num_mutations, mu="0.01"): + return ProcessingFunctions.is_variant( + input_data={"length": length, "numMutations": num_mutations}, + output_field="variant", + input_fields=["length", "numMutations"], + args={"mu": mu}, + ) + + +class TestIsVariant: + @staticmethod + def test_above_threshold_is_true(): + # 150 mutations, length 1000, mu=0.1 → threshold=100, 150>100 → True + result = call_is_variant(length="1000", num_mutations="150", mu="0.1") + assert result.datum is True + assert result.errors == [] + + @staticmethod + def test_below_threshold_is_false(): + # 50 mutations, length 1000, mu=0.1 → threshold=100, 50<100 → False + result = call_is_variant(length="1000", num_mutations="50", mu="0.1") + assert result.datum is False + assert result.errors == [] + + @staticmethod + def test_exactly_at_threshold_is_false(): + # 100 mutations, length 1000, mu=0.1 → threshold=100, 100 is not > 100 → False + result = call_is_variant(length="1000", num_mutations="100", mu="0.1") + assert result.datum is False + + @staticmethod + def test_missing_length_returns_none(): + result = call_is_variant(length=None, num_mutations="50") + assert result.datum is None + assert result.errors == [] + + @staticmethod + def test_missing_num_mutations_returns_none(): + result = call_is_variant(length="1000", num_mutations=None) + assert result.datum is None + assert result.errors == [] + + @staticmethod + def test_missing_mu_arg_returns_error(): + result = ProcessingFunctions.is_variant( + input_data={"length": "1000", "numMutations": "50"}, + output_field="variant", + input_fields=["length", "numMutations"], + args={}, + ) + assert result.datum is None + assert len(result.errors) == 1 + assert "missing mu argument" in result.errors[0].message + + @staticmethod + def test_non_numeric_inputs_return_error(): + result = call_is_variant(length="not_a_number", num_mutations="50") + assert result.datum is None + assert len(result.errors) == 1 From 905a00049fa4763bcc43484290a377fffd43d1a8 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Sun, 22 Mar 2026 12:44:35 +0100 Subject: [PATCH 12/15] Apply suggestions from code review Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com> --- .../src/loculus_preprocessing/processing_functions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index 3d61fc720d..11567381f3 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -1215,8 +1215,8 @@ def is_variant( ) ], ) - length_datum = input_data["length"] - num_mutations_datum = input_data["numMutations"] + length_datum = input_data.get("length") + num_mutations_datum = input_data.get("numMutations") if not length_datum or not num_mutations_datum: return ProcessingResult(datum=None, warnings=[], errors=[]) try: @@ -1238,7 +1238,7 @@ def is_variant( input_fields, [output_field], AnnotationSourceType.METADATA, - message=(f"Field {output_field} has non-numeric threshold value."), + message=(f"Field {output_field} has non-numeric length or numMutations value."), ) ], ) @@ -1307,7 +1307,7 @@ def assign_custom_lineage( # noqa: C901 f"Lineage {lineage} is a human lineage, checking for reassortment and variants" ) # only assign human lineages - if len(set(extracted_subtypes.values())) > 1: + if len({v for v in extracted_subtypes.values() if v is not None}) > 1: lineage += " reassortant" if any(v for v in variant.values() if v): lineage += " (variant)" From 9c59c1e2ab905f9647297429b0e031b66b660215 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Sun, 22 Mar 2026 12:43:22 +0100 Subject: [PATCH 13/15] fix types --- .../tests/test_assign_custom_lineage.py | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/preprocessing/nextclade/tests/test_assign_custom_lineage.py b/preprocessing/nextclade/tests/test_assign_custom_lineage.py index 4d30271ea7..1ead8572d9 100644 --- a/preprocessing/nextclade/tests/test_assign_custom_lineage.py +++ b/preprocessing/nextclade/tests/test_assign_custom_lineage.py @@ -3,7 +3,7 @@ from loculus_preprocessing.processing_functions import ProcessingFunctions -ARGS = { +ARGS: dict[str, list[str] | str | bool | int | float | None] = { "capture_group": "info", "pattern": "^(?:.*_)?(?P[^_]+)$", "uppercase": True, @@ -34,7 +34,7 @@ def make_flu_input( # noqa: PLR0913, PLR0917 return data -def call(input_data: dict) -> str | None: +def assign_custom_lineage(input_data: dict) -> str | int | float | bool | None: return ProcessingFunctions.assign_custom_lineage( input_data=input_data, output_field="lineage", @@ -56,7 +56,7 @@ def test_h1n1pdm_assigned_when_subtypes_match(): seg6_ref="n1_h1n1pdm", other_ref="h1n1pdm", ) - assert call(input_data) == "H1N1pdm" + assert assign_custom_lineage(input_data) == "H1N1pdm" @staticmethod def test_h1n1pdm_with_variant_flag(): @@ -68,7 +68,7 @@ def test_h1n1pdm_with_variant_flag(): other_ref="h1n1pdm", variants={"seg4": True}, ) - assert call(input_data) == "H1N1pdm (variant)" + assert assign_custom_lineage(input_data) == "H1N1pdm (variant)" @staticmethod def test_h1n1pdm_reassortant_when_one_segment_differs(): @@ -82,7 +82,7 @@ def test_h1n1pdm_reassortant_when_one_segment_differs(): ) # Override seg2 to a different lineage input_data["reference_seg2"] = "h3n2" - assert call(input_data) == "H1N1pdm reassortant" + assert assign_custom_lineage(input_data) == "H1N1pdm reassortant" class TestH1N1Seasonal: @@ -97,7 +97,7 @@ def test_h1n1_seasonal_assigned(): seg6_ref="h1n1", other_ref="h1n1", ) - assert call(input_data) == "H1N1" + assert assign_custom_lineage(input_data) == "H1N1" @staticmethod def test_h1n1_seasonal_reassortant(): @@ -109,7 +109,7 @@ def test_h1n1_seasonal_reassortant(): other_ref="h1n1", ) input_data["reference_seg3"] = "h3n2" - assert call(input_data) == "H1N1 reassortant" + assert assign_custom_lineage(input_data) == "H1N1 reassortant" @staticmethod def test_h1n1_seasonal_with_variant(): @@ -121,7 +121,7 @@ def test_h1n1_seasonal_with_variant(): other_ref="h1n1", variants={"seg1": True}, ) - assert call(input_data) == "H1N1 (variant)" + assert assign_custom_lineage(input_data) == "H1N1 (variant)" class TestH3N2: @@ -134,7 +134,7 @@ def test_h3n2_assigned(): seg6_ref="n2_h3n2", other_ref="h3n2", ) - assert call(input_data) == "H3N2" + assert assign_custom_lineage(input_data) == "H3N2" @staticmethod def test_h3n2_reassortant(): @@ -146,7 +146,7 @@ def test_h3n2_reassortant(): other_ref="h3n2", ) input_data["reference_seg1"] = "h1n1pdm" - assert call(input_data) == "H3N2 reassortant" + assert assign_custom_lineage(input_data) == "H3N2 reassortant" class TestNonHumanLineage: @@ -161,7 +161,7 @@ def test_h5n1_returns_none(): seg6_ref="n1_h5n1", other_ref="h5n1", ) - assert call(input_data) is None + assert assign_custom_lineage(input_data) is None @staticmethod def test_h7n9_returns_none(): @@ -172,31 +172,31 @@ def test_h7n9_returns_none(): seg6_ref="n9_h7n9", other_ref="h7n9", ) - assert call(input_data) is None + assert assign_custom_lineage(input_data) is None class TestMissingData: @staticmethod def test_empty_input_returns_none(): - assert call({}) is None + assert assign_custom_lineage({}) is None @staticmethod def test_missing_ha_subtype_returns_none(): input_data = make_flu_input(ha_subtype=None, na_subtype="N1") - assert call(input_data) is None + assert assign_custom_lineage(input_data) is None @staticmethod def test_missing_na_subtype_returns_none(): input_data = make_flu_input(ha_subtype="H1", na_subtype=None) - assert call(input_data) is None + assert assign_custom_lineage(input_data) is None @staticmethod def test_both_subtypes_missing_returns_none(): input_data = make_flu_input(ha_subtype=None, na_subtype=None) - assert call(input_data) is None + assert assign_custom_lineage(input_data) is None -def call_is_variant(length, num_mutations, mu="0.01"): +def assign_custom_lineage_is_variant(length, num_mutations, mu="0.01"): return ProcessingFunctions.is_variant( input_data={"length": length, "numMutations": num_mutations}, output_field="variant", @@ -209,32 +209,32 @@ class TestIsVariant: @staticmethod def test_above_threshold_is_true(): # 150 mutations, length 1000, mu=0.1 → threshold=100, 150>100 → True - result = call_is_variant(length="1000", num_mutations="150", mu="0.1") + result = assign_custom_lineage_is_variant(length="1000", num_mutations="150", mu="0.1") assert result.datum is True assert result.errors == [] @staticmethod def test_below_threshold_is_false(): # 50 mutations, length 1000, mu=0.1 → threshold=100, 50<100 → False - result = call_is_variant(length="1000", num_mutations="50", mu="0.1") + result = assign_custom_lineage_is_variant(length="1000", num_mutations="50", mu="0.1") assert result.datum is False assert result.errors == [] @staticmethod def test_exactly_at_threshold_is_false(): # 100 mutations, length 1000, mu=0.1 → threshold=100, 100 is not > 100 → False - result = call_is_variant(length="1000", num_mutations="100", mu="0.1") + result = assign_custom_lineage_is_variant(length="1000", num_mutations="100", mu="0.1") assert result.datum is False @staticmethod def test_missing_length_returns_none(): - result = call_is_variant(length=None, num_mutations="50") + result = assign_custom_lineage_is_variant(length=None, num_mutations="50") assert result.datum is None assert result.errors == [] @staticmethod def test_missing_num_mutations_returns_none(): - result = call_is_variant(length="1000", num_mutations=None) + result = assign_custom_lineage_is_variant(length="1000", num_mutations=None) assert result.datum is None assert result.errors == [] @@ -252,6 +252,6 @@ def test_missing_mu_arg_returns_error(): @staticmethod def test_non_numeric_inputs_return_error(): - result = call_is_variant(length="not_a_number", num_mutations="50") + result = assign_custom_lineage_is_variant(length="not_a_number", num_mutations="50") assert result.datum is None assert len(result.errors) == 1 From 2da239da9fe839d8d16c605abc9f48df86a9c8f5 Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Sun, 22 Mar 2026 12:45:52 +0100 Subject: [PATCH 14/15] format --- .../src/loculus_preprocessing/processing_functions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index 11567381f3..8691c6d489 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -1238,7 +1238,9 @@ def is_variant( input_fields, [output_field], AnnotationSourceType.METADATA, - message=(f"Field {output_field} has non-numeric length or numMutations value."), + message=( + f"Field {output_field} has non-numeric length or numMutations value." + ), ) ], ) From 9522172d793b2520bb67b68b814b7efc6bad13fb Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Thu, 26 Mar 2026 22:45:34 +0100 Subject: [PATCH 15/15] fix --- kubernetes/loculus/values.yaml | 1 + preprocessing/nextclade/src/loculus_preprocessing/prepro.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index 3e531fb759..02a9ee13e9 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -2120,6 +2120,7 @@ defaultOrganisms: taxon_id: 3052518 scientific_name: "Orthonairovirus haemorrhagiae" molecule_type: "genomic RNA" + alignment_requirement: ANY log_level: DEBUG nextclade_dataset_server: "https://raw.githubusercontent.com/genspectrum/nextclade-datasets/cchf-multi/data" segments: diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index cf6f34687c..8fee1f90d3 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -201,13 +201,13 @@ def add_assigned_reference( unprocessed: UnprocessedAfterNextclade, config: Config, ) -> InputData: - if not unprocessed.nextcladeMetadata: + if not unprocessed.unalignedNucleotideSequences: return InputData(datum=None) segment = spec.args.get("segment", "main") if spec.args else "main" if not isinstance(segment, str): msg = f"add_assigned_reference: segment must be str, got {type(segment)}" raise TypeError(msg) - name = get_dataset_name(segment, unprocessed.nextcladeMetadata, config) + name = get_dataset_name(segment, unprocessed.unalignedNucleotideSequences, config) if not name: return InputData(datum=None) reference = config.get_dataset_by_name(name).reference_name