diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index b9fb75c7b7..3e531fb759 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -1379,21 +1379,6 @@ defaultOrganismConfig: &defaultOrganismConfig label: Length preprocessing: inputs: {input: nextclade.coverage} - - name: variant - isSequenceFilter: true - perSegment: true - header: "Alignment and QC metrics" - displayName: "Variant" - type: boolean - noInput: true - autocomplete: true - initiallyVisible: false - includeInDownloadsByDefault: false - preprocessing: - function: is_above_threshold - args: - threshold: 50 - inputs: {input: "nextclade.privateNucMutations.totalPrivateSubstitutions"} website: &website tableColumns: - sampleCollectionDate @@ -1548,6 +1533,21 @@ defaultOrganisms: includeInDownloadsByDefault: false preprocessing: inputs: {input: "nextclade.cladeFounderInfo.aaMutations.*.privateSubstitutions"} + - name: variant + isSequenceFilter: true + perSegment: true + header: "Alignment and QC metrics" + displayName: "Variant" + type: boolean + noInput: true + autocomplete: true + initiallyVisible: false + includeInDownloadsByDefault: false + preprocessing: + function: is_variant + args: + mu: 0.002 + inputs: {numMutations: "nextclade.privateNucMutations.totalPrivateSubstitutions", length: processed.length} website: <<: *website tableColumns: @@ -2023,12 +2023,11 @@ defaultOrganisms: header: "Host" ingest: ncbiHostName initiallyVisible: true - - name: variant + - name: variant_L isSequenceFilter: true - perSegment: true header: "Clade & Lineage" oneHeader: true - displayName: "Variant" + displayName: "Variant L" type: boolean noInput: true autocomplete: true @@ -2036,13 +2035,51 @@ defaultOrganisms: includeInDownloadsByDefault: false customDisplay: type: variantReference - displayGroup: reference - label: Closest reference + displayGroup: reference_L + label: Closest reference L + preprocessing: + function: is_variant + args: + mu: 0.004 + inputs: {numMutations: "nextclade.totalSubstitutions", length: processed.length_L} + - name: variant_M + isSequenceFilter: true + header: "Clade & Lineage" + oneHeader: true + displayName: "Variant M" + type: boolean + noInput: true + autocomplete: true + initiallyVisible: false + includeInDownloadsByDefault: false + customDisplay: + type: variantReference + displayGroup: reference_M + label: Closest reference M + preprocessing: + function: is_variant + args: + mu: 0.008 + inputs: {numMutations: "nextclade.totalSubstitutions", length: processed.length_M} + - name: variant_S + isSequenceFilter: true + header: "Clade & Lineage" + oneHeader: true + displayName: "Variant S" + type: boolean + noInput: true + autocomplete: true + initiallyVisible: false + includeInDownloadsByDefault: false + customDisplay: + type: variantReference + displayGroup: reference_S + label: Closest reference S preprocessing: - function: is_above_threshold + function: is_variant args: - threshold: 1000 - inputs: {input: "nextclade.totalSubstitutions"} #custom nextclade dataset does not have private mutations, so using total substitutions as a proxy for distance from reference + mu: 0.004 + inputs: {numMutations: "nextclade.totalSubstitutions", length: processed.length_S} - name: reference oneHeader: true header: "Clade & Lineage" diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index aa8a4e70ab..8691c6d489 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -1194,6 +1194,143 @@ def is_above_threshold( ) return ProcessingResult(datum=(input > threshold), warnings=[], errors=[]) + @staticmethod + def is_variant( + input_data: InputMetadata, output_field: str, input_fields: list[str], args: FunctionArgs + ) -> ProcessingResult: + """Flag if number of mutations is above mutation rate (specified in args) times length""" + if "mu" not in args: + return ProcessingResult( + datum=None, + warnings=[], + errors=[ + ProcessingAnnotation.from_fields( + input_fields, + [output_field], + AnnotationSourceType.METADATA, + message=( + f"Field {output_field} is missing mu argument." + " Please report this error to the administrator." + ), + ) + ], + ) + length_datum = input_data.get("length") + num_mutations_datum = input_data.get("numMutations") + if not length_datum or not num_mutations_datum: + return ProcessingResult(datum=None, warnings=[], errors=[]) + try: + mu = float(args["mu"]) # type: ignore + length = float(length_datum) + threshold = mu * length + is_above_threshold_result = ProcessingFunctions.is_above_threshold( + input_data={"input": num_mutations_datum}, + output_field=output_field, + input_fields=input_fields, + args={"threshold": threshold}, + ) + except (ValueError, TypeError): + return ProcessingResult( + datum=None, + warnings=[], + errors=[ + ProcessingAnnotation.from_fields( + input_fields, + [output_field], + AnnotationSourceType.METADATA, + message=( + f"Field {output_field} has non-numeric length or numMutations value." + ), + ) + ], + ) + return ProcessingResult( + datum=is_above_threshold_result.datum, + warnings=is_above_threshold_result.warnings, + errors=is_above_threshold_result.errors, + ) + + @staticmethod + def assign_custom_lineage( # noqa: C901 + input_data: InputMetadata, output_field: str, input_fields: list[str], args: FunctionArgs + ) -> ProcessingResult: + """ + Assign flu lineage based on seg4 and seg6. + Add reassortant flag if subtypes from different lineages are detected for other segments, + add and variant flag if any segment is a variant. + """ + logger.debug( + f"Starting custom lineage assignment with input_data: {input_data} and args: {args}" + ) + if not input_data: + return ProcessingResult(datum=None, warnings=[], errors=[]) + ha_subtype = input_data.get("subtype_seg4") + na_subtype = input_data.get("subtype_seg6") + references: dict[str, str | None] = {} + extracted_subtypes: dict[str, str | None] = {} + variant: dict[str, bool | None] = {} + for i in range(1, 9): + segment = f"seg{i}" + reference_field = f"reference_seg{i}" + variant_field = f"variant_seg{i}" + if reference_field in input_data: + references[segment] = input_data.get(reference_field) + variant[segment] = ( + bool(input_data.get(variant_field)) if variant_field in input_data else None + ) + try: + for i in range(1, 9): + segment = f"seg{i}" + extracted_subtypes[segment] = ProcessingFunctions.call_function( # type: ignore + "extract_regex", + { + "pattern": args["pattern"], + "uppercase": args["uppercase"], + "capture_group": args["capture_group"], + }, + {"regex_field": references.get(segment, "")}, + "output_field", + ["segment_name"], + ).datum + logger.debug(f"Extracted subtypes: {extracted_subtypes} from references: {references}") + if not ha_subtype or not na_subtype: + return ProcessingResult(datum=None, warnings=[], errors=[]) + lineage = f"{ha_subtype}{na_subtype}" + if ( + extracted_subtypes.get("seg4") == "H1N1PDM" + and extracted_subtypes.get("seg6") == "H1N1PDM" + ): + lineage = "H1N1pdm" + logger.debug( + f"Determined preliminary lineage {lineage} based on segments seg4 and seg6" + ) + if lineage in {"H1N1", "H3N2", "H2N2", "H1N1pdm"}: + logger.debug( + f"Lineage {lineage} is a human lineage, checking for reassortment and variants" + ) + # only assign human lineages + if len({v for v in extracted_subtypes.values() if v is not None}) > 1: + lineage += " reassortant" + if any(v for v in variant.values() if v): + lineage += " (variant)" + return ProcessingResult(datum=lineage, warnings=[], errors=[]) + except (ValueError, TypeError): + return ProcessingResult( + datum=None, + warnings=[], + errors=[ + ProcessingAnnotation.from_fields( + input_fields, + [output_field], + AnnotationSourceType.METADATA, + message=( + f"Internal error processing custom lineage for field {output_field}." + ), + ) + ], + ) + return ProcessingResult(datum=None, warnings=[], errors=[]) + @staticmethod def build_display_name( # noqa: C901 input_data: InputMetadata, diff --git a/preprocessing/nextclade/tests/test_assign_custom_lineage.py b/preprocessing/nextclade/tests/test_assign_custom_lineage.py new file mode 100644 index 0000000000..1ead8572d9 --- /dev/null +++ b/preprocessing/nextclade/tests/test_assign_custom_lineage.py @@ -0,0 +1,257 @@ +# ruff: noqa: S101 +"""Tests for ProcessingFunctions.assign_custom_lineage and is_variant.""" + +from loculus_preprocessing.processing_functions import ProcessingFunctions + +ARGS: dict[str, list[str] | str | bool | int | float | None] = { + "capture_group": "info", + "pattern": "^(?:.*_)?(?P[^_]+)$", + "uppercase": True, + "is_insdc_ingest_group": True, + "submittedAt": 1774128802, + "ACCESSION_VERSION": "LOC_00020W3.1", +} + + +def make_flu_input( # noqa: PLR0913, PLR0917 + ha_subtype: str | None = "H1", + na_subtype: str | None = "N1", + seg4_ref: str = "h1_h1n1pdm", + seg6_ref: str = "n1_h1n1pdm", + other_ref: str = "h1n1pdm", + variants: dict[str, bool] | None = None, +) -> dict: + """Build a flat input_data dict as assign_custom_lineage expects.""" + data: dict = {} + for i in range(1, 9): + ref = seg4_ref if i == 4 else seg6_ref if i == 6 else other_ref + data[f"reference_seg{i}"] = ref + data[f"variant_seg{i}"] = (variants or {}).get(f"seg{i}", False) + if ha_subtype is not None: + data["subtype_seg4"] = ha_subtype + if na_subtype is not None: + data["subtype_seg6"] = na_subtype + return data + + +def assign_custom_lineage(input_data: dict) -> str | int | float | bool | None: + return ProcessingFunctions.assign_custom_lineage( + input_data=input_data, + output_field="lineage", + input_fields=list(input_data.keys()), + args=ARGS, + ).datum + + +class TestH1N1pdm: + """H1N1pdm lineage: seg4 ref is h1_h1n1pdm, seg6 ref is n1_h1n1pdm.""" + + @staticmethod + def test_h1n1pdm_assigned_when_subtypes_match(): + """All 8 segments reference h1n1pdm lineage (with HA/NA prefixes on seg4/6).""" + input_data = make_flu_input( + ha_subtype="H1", + na_subtype="N1", + seg4_ref="h1_h1n1pdm", + seg6_ref="n1_h1n1pdm", + other_ref="h1n1pdm", + ) + assert assign_custom_lineage(input_data) == "H1N1pdm" + + @staticmethod + def test_h1n1pdm_with_variant_flag(): + input_data = make_flu_input( + ha_subtype="H1", + na_subtype="N1", + seg4_ref="h1_h1n1pdm", + seg6_ref="n1_h1n1pdm", + other_ref="h1n1pdm", + variants={"seg4": True}, + ) + assert assign_custom_lineage(input_data) == "H1N1pdm (variant)" + + @staticmethod + def test_h1n1pdm_reassortant_when_one_segment_differs(): + """If one internal segment has a different lineage, result is reassortant.""" + input_data = make_flu_input( + ha_subtype="H1", + na_subtype="N1", + seg4_ref="h1_h1n1pdm", + seg6_ref="n1_h1n1pdm", + other_ref="h1n1pdm", + ) + # Override seg2 to a different lineage + input_data["reference_seg2"] = "h3n2" + assert assign_custom_lineage(input_data) == "H1N1pdm reassortant" + + +class TestH1N1Seasonal: + """Seasonal H1N1: all references are plain h1n1 (no prefix).""" + + @staticmethod + def test_h1n1_seasonal_assigned(): + input_data = make_flu_input( + ha_subtype="H1", + na_subtype="N1", + seg4_ref="h1n1", + seg6_ref="h1n1", + other_ref="h1n1", + ) + assert assign_custom_lineage(input_data) == "H1N1" + + @staticmethod + def test_h1n1_seasonal_reassortant(): + input_data = make_flu_input( + ha_subtype="H1", + na_subtype="N1", + seg4_ref="h1n1", + seg6_ref="h1n1", + other_ref="h1n1", + ) + input_data["reference_seg3"] = "h3n2" + assert assign_custom_lineage(input_data) == "H1N1 reassortant" + + @staticmethod + def test_h1n1_seasonal_with_variant(): + input_data = make_flu_input( + ha_subtype="H1", + na_subtype="N1", + seg4_ref="h1n1", + seg6_ref="h1n1", + other_ref="h1n1", + variants={"seg1": True}, + ) + assert assign_custom_lineage(input_data) == "H1N1 (variant)" + + +class TestH3N2: + @staticmethod + def test_h3n2_assigned(): + input_data = make_flu_input( + ha_subtype="H3", + na_subtype="N2", + seg4_ref="h3_h3n2", + seg6_ref="n2_h3n2", + other_ref="h3n2", + ) + assert assign_custom_lineage(input_data) == "H3N2" + + @staticmethod + def test_h3n2_reassortant(): + input_data = make_flu_input( + ha_subtype="H3", + na_subtype="N2", + seg4_ref="h3_h3n2", + seg6_ref="n2_h3n2", + other_ref="h3n2", + ) + input_data["reference_seg1"] = "h1n1pdm" + assert assign_custom_lineage(input_data) == "H3N2 reassortant" + + +class TestNonHumanLineage: + """Non-human lineages (e.g. H5N1, H7N9) should return None.""" + + @staticmethod + def test_h5n1_returns_none(): + input_data = make_flu_input( + ha_subtype="H5", + na_subtype="N1", + seg4_ref="h5_h5n1", + seg6_ref="n1_h5n1", + other_ref="h5n1", + ) + assert assign_custom_lineage(input_data) is None + + @staticmethod + def test_h7n9_returns_none(): + input_data = make_flu_input( + ha_subtype="H7", + na_subtype="N9", + seg4_ref="h7_h7n9", + seg6_ref="n9_h7n9", + other_ref="h7n9", + ) + assert assign_custom_lineage(input_data) is None + + +class TestMissingData: + @staticmethod + def test_empty_input_returns_none(): + assert assign_custom_lineage({}) is None + + @staticmethod + def test_missing_ha_subtype_returns_none(): + input_data = make_flu_input(ha_subtype=None, na_subtype="N1") + assert assign_custom_lineage(input_data) is None + + @staticmethod + def test_missing_na_subtype_returns_none(): + input_data = make_flu_input(ha_subtype="H1", na_subtype=None) + assert assign_custom_lineage(input_data) is None + + @staticmethod + def test_both_subtypes_missing_returns_none(): + input_data = make_flu_input(ha_subtype=None, na_subtype=None) + assert assign_custom_lineage(input_data) is None + + +def assign_custom_lineage_is_variant(length, num_mutations, mu="0.01"): + return ProcessingFunctions.is_variant( + input_data={"length": length, "numMutations": num_mutations}, + output_field="variant", + input_fields=["length", "numMutations"], + args={"mu": mu}, + ) + + +class TestIsVariant: + @staticmethod + def test_above_threshold_is_true(): + # 150 mutations, length 1000, mu=0.1 → threshold=100, 150>100 → True + result = assign_custom_lineage_is_variant(length="1000", num_mutations="150", mu="0.1") + assert result.datum is True + assert result.errors == [] + + @staticmethod + def test_below_threshold_is_false(): + # 50 mutations, length 1000, mu=0.1 → threshold=100, 50<100 → False + result = assign_custom_lineage_is_variant(length="1000", num_mutations="50", mu="0.1") + assert result.datum is False + assert result.errors == [] + + @staticmethod + def test_exactly_at_threshold_is_false(): + # 100 mutations, length 1000, mu=0.1 → threshold=100, 100 is not > 100 → False + result = assign_custom_lineage_is_variant(length="1000", num_mutations="100", mu="0.1") + assert result.datum is False + + @staticmethod + def test_missing_length_returns_none(): + result = assign_custom_lineage_is_variant(length=None, num_mutations="50") + assert result.datum is None + assert result.errors == [] + + @staticmethod + def test_missing_num_mutations_returns_none(): + result = assign_custom_lineage_is_variant(length="1000", num_mutations=None) + assert result.datum is None + assert result.errors == [] + + @staticmethod + def test_missing_mu_arg_returns_error(): + result = ProcessingFunctions.is_variant( + input_data={"length": "1000", "numMutations": "50"}, + output_field="variant", + input_fields=["length", "numMutations"], + args={}, + ) + assert result.datum is None + assert len(result.errors) == 1 + assert "missing mu argument" in result.errors[0].message + + @staticmethod + def test_non_numeric_inputs_return_error(): + result = assign_custom_lineage_is_variant(length="not_a_number", num_mutations="50") + assert result.datum is None + assert len(result.errors) == 1 diff --git a/preprocessing/nextclade/tests/test_metadata_processing_functions.py b/preprocessing/nextclade/tests/test_metadata_processing_functions.py index 534b3db067..a84659c7da 100644 --- a/preprocessing/nextclade/tests/test_metadata_processing_functions.py +++ b/preprocessing/nextclade/tests/test_metadata_processing_functions.py @@ -1033,7 +1033,7 @@ def test_concatenate() -> None: assert res_fallback_explicit_null.datum == "0/unknown/version.1/unknown" -def test_display_name_construction() -> None: +def test_display_name_construction() -> None: # noqa: PLR0915 submission_id = "mySample" submission_id_formatted = "hDENV1/Germany/myExtractedSample/2025" submission_id_formatted_unexpected = "hDENV1/myExtractedSample/2025"