Skip to content
83 changes: 60 additions & 23 deletions kubernetes/loculus/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1379,21 +1379,6 @@ defaultOrganismConfig: &defaultOrganismConfig
label: Length
preprocessing:
inputs: {input: nextclade.coverage}
- name: variant
isSequenceFilter: true
perSegment: true
header: "Alignment and QC metrics"
displayName: "Variant"
type: boolean
noInput: true
autocomplete: true
initiallyVisible: false
includeInDownloadsByDefault: false
preprocessing:
function: is_above_threshold
args:
threshold: 50
inputs: {input: "nextclade.privateNucMutations.totalPrivateSubstitutions"}
website: &website
tableColumns:
- sampleCollectionDate
Expand Down Expand Up @@ -1548,6 +1533,21 @@ defaultOrganisms:
includeInDownloadsByDefault: false
preprocessing:
inputs: {input: "nextclade.cladeFounderInfo.aaMutations.*.privateSubstitutions"}
- name: variant
isSequenceFilter: true
perSegment: true
header: "Alignment and QC metrics"
displayName: "Variant"
type: boolean
noInput: true
autocomplete: true
initiallyVisible: false
includeInDownloadsByDefault: false
preprocessing:
function: is_variant
args:
mu: 0.002
inputs: {numMutations: "nextclade.privateNucMutations.totalPrivateSubstitutions", length: processed.length}
website:
<<: *website
tableColumns:
Expand Down Expand Up @@ -2023,26 +2023,63 @@ defaultOrganisms:
header: "Host"
ingest: ncbiHostName
initiallyVisible: true
- name: variant
- name: variant_L
isSequenceFilter: true
perSegment: true
header: "Clade & Lineage"
oneHeader: true
displayName: "Variant"
displayName: "Variant L"
type: boolean
noInput: true
autocomplete: true
initiallyVisible: false
includeInDownloadsByDefault: false
customDisplay:
type: variantReference
displayGroup: reference
label: Closest reference
displayGroup: reference_L
label: Closest reference L
preprocessing:
function: is_variant
args:
mu: 0.004
inputs: {numMutations: "nextclade.totalSubstitutions", length: processed.length_L}
- name: variant_M
isSequenceFilter: true
header: "Clade & Lineage"
oneHeader: true
displayName: "Variant M"
type: boolean
noInput: true
autocomplete: true
initiallyVisible: false
includeInDownloadsByDefault: false
customDisplay:
type: variantReference
displayGroup: reference_M
label: Closest reference M
preprocessing:
function: is_variant
args:
mu: 0.008
inputs: {numMutations: "nextclade.totalSubstitutions", length: processed.length_M}
- name: variant_S
isSequenceFilter: true
header: "Clade & Lineage"
oneHeader: true
displayName: "Variant S"
type: boolean
noInput: true
autocomplete: true
initiallyVisible: false
includeInDownloadsByDefault: false
customDisplay:
type: variantReference
displayGroup: reference_S
label: Closest reference S
preprocessing:
function: is_above_threshold
function: is_variant
args:
threshold: 1000
inputs: {input: "nextclade.totalSubstitutions"} #custom nextclade dataset does not have private mutations, so using total substitutions as a proxy for distance from reference
mu: 0.004
inputs: {numMutations: "nextclade.totalSubstitutions", length: processed.length_S}
- name: reference
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The old config had a comment explaining why totalSubstitutions is used instead of privateNucMutations for this organism (the custom nextclade dataset doesn't have private mutations). That context was lost when the config was rewritten. Consider adding it back:

Suggested change
- name: reference
inputs: {numMutations: "nextclade.totalSubstitutions", length: processed.length_S} # custom nextclade dataset does not have private mutations, so using total substitutions as a proxy for distance from reference

oneHeader: true
header: "Clade & Lineage"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1194,6 +1194,143 @@ def is_above_threshold(
)
return ProcessingResult(datum=(input > threshold), warnings=[], errors=[])

@staticmethod
def is_variant(
    input_data: InputMetadata, output_field: str, input_fields: list[str], args: FunctionArgs
) -> ProcessingResult:
    """Flag a record as a variant when its mutation count exceeds ``mu * length``.

    ``mu`` is read from ``args``; ``length`` and ``numMutations`` are read from
    ``input_data``. The comparison itself is delegated to
    ``ProcessingFunctions.is_above_threshold`` with ``mu * length`` as the
    threshold and ``numMutations`` as its ``input``.

    The returned ``ProcessingResult`` carries:
    - ``datum=None`` plus an error annotation when ``mu`` is absent from ``args``;
    - ``datum=None`` and no annotations when ``length`` or ``numMutations`` is
      missing or falsy;
    - ``datum=None`` plus an error annotation when a ``ValueError`` or
      ``TypeError`` is raised while converting the numbers or during the
      delegated call;
    - otherwise the datum, warnings and errors reported by ``is_above_threshold``.
    """

    def _failure(message: str) -> ProcessingResult:
        # Every error path shares the same annotation shape; only the text differs.
        annotation = ProcessingAnnotation.from_fields(
            input_fields,
            [output_field],
            AnnotationSourceType.METADATA,
            message=message,
        )
        return ProcessingResult(datum=None, warnings=[], errors=[annotation])

    # A missing rate is a configuration problem, so it is surfaced as an error.
    if "mu" not in args:
        return _failure(
            f"Field {output_field} is missing mu argument."
            " Please report this error to the administrator."
        )

    sequence_length = input_data.get("length")
    mutation_count = input_data.get("numMutations")
    # Without both inputs nothing can be decided; return an empty result silently.
    if not (sequence_length and mutation_count):
        return ProcessingResult(datum=None, warnings=[], errors=[])

    try:
        # mu is converted before length, matching the original evaluation order.
        cutoff = float(args["mu"]) * float(sequence_length)  # type: ignore
        delegated = ProcessingFunctions.is_above_threshold(
            input_data={"input": mutation_count},
            output_field=output_field,
            input_fields=input_fields,
            args={"threshold": cutoff},
        )
    except (ValueError, TypeError):
        return _failure(
            f"Field {output_field} has non-numeric length or numMutations value."
        )

    # Re-wrap the delegated outcome field by field, as the original did.
    return ProcessingResult(
        datum=delegated.datum,
        warnings=delegated.warnings,
        errors=delegated.errors,
    )

@staticmethod
def assign_custom_lineage( # noqa: C901
input_data: InputMetadata, output_field: str, input_fields: list[str], args: FunctionArgs
) -> ProcessingResult:
"""
Assign flu lineage based on seg4 and seg6.

The preliminary lineage is the concatenation of ``subtype_seg4`` and
``subtype_seg6`` from ``input_data``. It is replaced by ``"H1N1pdm"`` when the
subtypes extracted from the seg4 and seg6 references are both ``"H1N1PDM"``.
Subtypes are extracted by running ``extract_regex`` on each ``reference_seg<i>``
value (i = 1..8) with ``args["pattern"]``, ``args["uppercase"]`` and
``args["capture_group"]``.

For the lineages ``H1N1``, ``H3N2``, ``H2N2`` and ``H1N1pdm``:
- ``" reassortant"`` is appended if more than one distinct non-None subtype
  was extracted across the segments;
- ``" (variant)"`` is appended if any recorded ``variant_seg<i>`` flag is truthy.

Returns a ``ProcessingResult`` with ``datum=None`` and no annotations when
``input_data`` is empty or either seg4/seg6 subtype is missing, and with
``datum=None`` plus an error annotation when a ``ValueError`` or ``TypeError``
is raised during extraction or assembly.
"""
logger.debug(
f"Starting custom lineage assignment with input_data: {input_data} and args: {args}"
)
# Nothing to work with: return an empty result without annotations.
if not input_data:
return ProcessingResult(datum=None, warnings=[], errors=[])
ha_subtype = input_data.get("subtype_seg4")
na_subtype = input_data.get("subtype_seg6")
# Per-segment lookups keyed by "seg1".."seg8".
references: dict[str, str | None] = {}
extracted_subtypes: dict[str, str | None] = {}
variant: dict[str, bool | None] = {}
# Collect the reference name and the variant flag supplied for each segment.
for i in range(1, 9):
segment = f"seg{i}"
reference_field = f"reference_seg{i}"
variant_field = f"variant_seg{i}"
if reference_field in input_data:
references[segment] = input_data.get(reference_field)
# A variant flag that is not supplied is stored as None rather than False.
variant[segment] = (
bool(input_data.get(variant_field)) if variant_field in input_data else None
)
try:
for i in range(1, 9):
segment = f"seg{i}"
# Segments without a reference entry fall back to an empty string;
# a reference that is present but None is passed through as None.
# The output/input field names handed to extract_regex are literal
# placeholders, not this function's own output_field/input_fields.
extracted_subtypes[segment] = ProcessingFunctions.call_function( # type: ignore
"extract_regex",
{
"pattern": args["pattern"],
"uppercase": args["uppercase"],
"capture_group": args["capture_group"],
},
{"regex_field": references.get(segment, "")},
"output_field",
["segment_name"],
).datum
logger.debug(f"Extracted subtypes: {extracted_subtypes} from references: {references}")
# Both the seg4 and seg6 subtypes are required to name a lineage.
if not ha_subtype or not na_subtype:
return ProcessingResult(datum=None, warnings=[], errors=[])
lineage = f"{ha_subtype}{na_subtype}"
# Both seg4 and seg6 references resolving to H1N1PDM overrides the name.
if (
extracted_subtypes.get("seg4") == "H1N1PDM"
and extracted_subtypes.get("seg6") == "H1N1PDM"
):
lineage = "H1N1pdm"
logger.debug(
f"Determined preliminary lineage {lineage} based on segments seg4 and seg6"
)
if lineage in {"H1N1", "H3N2", "H2N2", "H1N1pdm"}:
logger.debug(
f"Lineage {lineage} is a human lineage, checking for reassortment and variants"
)
# only assign human lineages
# More than one distinct non-None extracted subtype marks a reassortant.
if len({v for v in extracted_subtypes.values() if v is not None}) > 1:
lineage += " reassortant"
# Any truthy per-segment variant flag marks the lineage as a variant.
if any(v for v in variant.values() if v):
lineage += " (variant)"
return ProcessingResult(datum=lineage, warnings=[], errors=[])
except (ValueError, TypeError):
# Only ValueError and TypeError are converted into an error annotation;
# any other exception type propagates to the caller.
return ProcessingResult(
datum=None,
warnings=[],
errors=[
ProcessingAnnotation.from_fields(
input_fields,
[output_field],
AnnotationSourceType.METADATA,
message=(
f"Internal error processing custom lineage for field {output_field}."
),
)
],
)
# NOTE(review): reached only on paths that did not return above; confirm
# which lineages (if any) are meant to fall through to this empty result.
return ProcessingResult(datum=None, warnings=[], errors=[])

@staticmethod
def build_display_name( # noqa: C901
input_data: InputMetadata,
Expand Down
Loading
Loading