From 2491033edfe8e19e00c0b5dbd63f12c71d51ede4 Mon Sep 17 00:00:00 2001 From: Aine Date: Thu, 6 Nov 2025 16:18:51 +0000 Subject: [PATCH 1/7] feat: add popEVE data handling --- EVE.pm | 130 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 90 insertions(+), 40 deletions(-) diff --git a/EVE.pm b/EVE.pm index b686fd10..df5e6cdc 100644 --- a/EVE.pm +++ b/EVE.pm @@ -91,7 +91,6 @@ use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin); sub new { my $class = shift; - my $self = $class->SUPER::new(@_); my $config = $self->{config}; @@ -99,20 +98,29 @@ sub new { $self->expand_right(0); my $assembly = $config->{assembly} || $config->{human_assembly}; + die "\nAssembly is not GRCh38, EVE only works with GRCh38.\n" if ($assembly ne "GRCh38"); - die "\nAssembly is not GRCh38, EVE only works with GRCh38. \n" if ($assembly ne "GRCh38"); + my $param = $self->params_to_hash(); - my $param_hash = $self->params_to_hash(); + my $eve_file = $param->{file} || $param->{eve_file}; + my $popeve_file = $param->{popeve_file}; - die "\nERROR: No EVE file specified\nTry using 'file=/eve_file.vcf.gz'\n" unless defined($param_hash->{file}); + die "\nERROR: No input specified\nUse 'file=' and/or 'popeve_file='\n" + unless ($eve_file || $popeve_file); - $self->add_file($param_hash->{file}); + if ($eve_file) { + $self->add_file($eve_file); + $self->{has_eve} = 1; + } + if ($popeve_file) { + $self->add_file($popeve_file); + $self->{has_pop} = 1; + } my @valid_class_numbers = (10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90); - - if (defined($param_hash->{class_number})) { - my $class_number = $param_hash->{class_number}; - die "\nERROR: This class_number: '$class_number' does not exists.\nTry any of these numbers: " . join(', ', @valid_class_numbers) + if (defined($param->{class_number})) { + my $class_number = $param->{class_number}; + die "\nERROR: This class_number: '$class_number' does not exist.\nTry any of: " . join(', ', @valid_class_numbers) unless grep(/^$class_number$/, @valid_class_numbers); $self->{class_number} = $class_number; } else { @@ -128,10 +136,24 @@ sub feature_types { sub get_header_info { my $self = shift; - return { + my $h = { EVE_SCORE => "Score from EVE model", - EVE_CLASS => "Classification (Benign, Uncertain, or Pathogenic) when setting $self->{class_number}% as uncertain" + EVE_CLASS => "Classification (Benign, Uncertain, or Pathogenic) when setting $self->{class_number}% as uncertain", + }; + + if ($self->{has_pop}) { + $h->{popEVE_SCORE} = "Score from popEVE"; + $h->{popEVE_EVE} = "Raw EVE score (popEVE file)"; + $h->{popEVE_ESM1v} = "Raw ESM1v score (popEVE file)"; + $h->{popEVE_pop_adjusted_EVE} = "Population-adjusted EVE"; + $h->{popEVE_pop_adjusted_ESM1v} = "Population-adjusted ESM1v"; + $h->{popEVE_gap_frequency} = "Gap frequency"; + $h->{popEVE_gene} = "Gene symbol"; + $h->{popEVE_protein} = "Protein accession"; + $h->{popEVE_mutant} = "Protein-level change"; } + + return $h; } sub run { @@ -140,9 +162,8 @@ sub run { return {} unless grep {$_->SO_term eq 'missense_variant'} @{$tva->get_all_OverlapConsequences}; - # get allele my $alt_alleles = $tva->base_variation_feature->alt_alleles; - my $ref_allele = $vf->ref_allele_string; + my $ref_allele = $vf->ref_allele_string; my @data = @{ $self->get_data( @@ -152,51 +173,78 @@ sub run { ) }; - return {} unless(@data); + return {} unless @data; - foreach my $variant (@data) { + my %out; + foreach my $variant (@data) { my $matches = get_matched_variant_alleles( - { - ref => $ref_allele, - alts => $alt_alleles, - pos => $vf->{start}, - strand => $vf->strand - }, - { - ref => $variant->{ref}, - alts => [$variant->{alt}], - pos => $variant->{start}, - } + { ref => $ref_allele, alts => $alt_alleles, pos => $vf->{start}, strand => $vf->strand }, + { ref => $variant->{ref}, alts => [ $variant->{alt} ], pos => $variant->{start} } ); + next unless @$matches; - return $variant->{result} if (@$matches); + # MERGE instead of returning immediately + # merge results from every matching record within the window, instead of returning on the first match + # This allows EVE (allele in codon format: XXX) and popEVE (allele in SNV format: X) to both annotate the same input variant + @out{ keys %{ $variant->{result} } } = values %{ $variant->{result} }; } - return {}; - + return %out ? \%out : {}; } sub parse_data { my ($self, $line) = @_; + chomp $line; # ensure INFO regexes see clean line endings + + my ($chrom, $pos, $id, $ref, $alt, $qual, $filter, $info) = split /\t/, $line, 8; + + # source detection + my $is_pop = ($info =~ /(;\s*)?(popEVE|protein|gene|mutant|gap_frequency|ESM1v|pop-adjusted_EVE|pop-adjusted_ESM1v)=/); + + if ($is_pop) { + # -------- popEVE branch: extract popEVE_* fields -------- + my ($score) = $info =~ /(?:^|;)popEVE=([^;]+)/; + my ($raw_eve) = $info =~ /(?:^|;)EVE=([^;]+)/; + my ($esm1v) = $info =~ /(?:^|;)ESM1v=([^;]+)/; + my ($padj_eve) = $info =~ /(?:^|;)pop[-_]adjusted_EVE=([^;]+)/; + my ($padj_esm) = $info =~ /(?:^|;)pop[-_]adjusted_ESM1v=([^;]+)/; + my ($gap) = $info =~ /(?:^|;)gap_frequency=([^;]+)/; + my ($gene) = $info =~ /(?:^|;)gene=([^;]+)/; + my ($protein) = $info =~ /(?:^|;)protein=([^;]+)/; + my ($mutant) = $info =~ /(?:^|;)mutant=([^;]+)/; + + my %res; + $res{popEVE_SCORE} = $score if defined $score; + $res{popEVE_EVE} = $raw_eve if defined $raw_eve; # avoids collision with the EVE col from the EVE file + $res{popEVE_ESM1v} = $esm1v if defined $esm1v; + $res{popEVE_pop_adjusted_EVE} = $padj_eve if defined $padj_eve; + $res{popEVE_pop_adjusted_ESM1v} = $padj_esm if defined $padj_esm; + $res{popEVE_gap_frequency} = $gap if defined $gap; + $res{popEVE_gene} = $gene if defined $gene; + $res{popEVE_protein} = $protein if defined $protein; + $res{popEVE_mutant} = $mutant if defined $mutant; + + my $end = $pos + (length($ref||'') ? length($ref)-1 : 0); + return { ref=>$ref, alt=>$alt, start=>$pos, end=>$end, result=>\%res }; + } - # Parsing VCF fields - my ($chrom, $pos, $id, $ref, $alt, $qual, $filter, $info) = split /\t/, $line; - - # Parsing INFO field - my ($EVE_SCORE) = $info =~ /EVE=(.*?);/; + # -------- EVE branch -------- + my ($EVE_SCORE) = $info =~ /EVE=([^;]+)/; my $class_number = $self->{class_number}; - my ($EVE_CLASS) = $info =~ /Class$class_number=(.*?)([;]|$)/; + my ($EVE_CLASS) = $info =~ /Class$class_number=([^;]+)/; + my $end = $pos + (length($ref||'') ? length($ref)-1 : 0); return { ref => $ref, alt => $alt, start => $pos, + end => $end, result => { - EVE_SCORE => $EVE_SCORE, - EVE_CLASS => $EVE_CLASS + EVE_SCORE => $EVE_SCORE, + EVE_CLASS => $EVE_CLASS } - } + }; } sub get_start { @@ -204,7 +252,9 @@ sub get_start { } sub get_end { - return $_[1]->{end}; + # safe fallback if 'end' wasn't set in parse_data + my $v = $_[1]; + return defined $v->{end} ? $v->{end} : ($v->{start} + (length($v->{ref} // '') ? length($v->{ref}) - 1 : 0)); } -1; +1; \ No newline at end of file From 16c3c38f463cc21951a52f4d1140235c3b3c322e Mon Sep 17 00:00:00 2001 From: Aine Date: Tue, 11 Nov 2025 15:09:58 +0000 Subject: [PATCH 2/7] feat: add VCF header parsing or fallback to default col descriptions --- EVE.pm | 68 +++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 12 deletions(-) diff --git a/EVE.pm b/EVE.pm index df5e6cdc..8e248fa0 100644 --- a/EVE.pm +++ b/EVE.pm @@ -88,6 +88,7 @@ use Bio::EnsEMBL::Variation::Utils::Sequence qw(get_matched_variant_alleles); use Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin; use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin); +use Bio::DB::HTS::Tabix; sub new { my $class = shift; @@ -111,10 +112,12 @@ sub new { if ($eve_file) { $self->add_file($eve_file); $self->{has_eve} = 1; + $self->{eve_file} = $eve_file; } if ($popeve_file) { $self->add_file($popeve_file); $self->{has_pop} = 1; + $self->{pop_file} = $popeve_file; } my @valid_class_numbers = (10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90); @@ -134,23 +137,64 @@ sub feature_types { return ['Transcript']; } +sub _vcf_info_descriptions { + my ($file) = @_; + my %desc; + + return \%desc unless $file; + + eval { + my $tbx = Bio::DB::HTS::Tabix->new( filename => $file ); + my $header = $tbx->header; + + foreach my $line (split /\n/, $header) { + next unless $line =~ /^##INFO<(.+)>/ || $line =~ /^##INFO=<(.+)>/; + my $body = $1; + my ($id) = $body =~ /\bID=([^,>]+)/; + my ($d) = $body =~ /Description="([^"]*)"/; + + $desc{$id} = $d if defined $id && defined $d; + } + }; + # if tabix/header parsing fails, return an empty hash and let get_header_info use defaults + return \%desc; +} + sub get_header_info { my $self = shift; + + # try to read INFO descriptions from the EVE and popEVE VCF headers + my %eve_desc; + my %pop_desc; + + if ($self->{has_eve} && $self->{eve_file}) { + my $h = _vcf_info_descriptions($self->{eve_file}); + %eve_desc = %{$h} if $h; + } + + if ($self->{has_pop} && $self->{pop_file}) { + my $h = _vcf_info_descriptions($self->{pop_file}); + %pop_desc = %{$h} if $h; + } + + my $class_key = "Class" . $self->{class_number}; + + # prefer INFO descriptions pulled from VCF but fall back to hard-coded descriptions my $h = { - EVE_SCORE => "Score from EVE model", - EVE_CLASS => "Classification (Benign, Uncertain, or Pathogenic) when setting $self->{class_number}% as uncertain", + EVE_SCORE => ($eve_desc{EVE} || "Score from EVE model"), + EVE_CLASS => ($eve_desc{$class_key} || "Classification (Benign, Uncertain, or Pathogenic) when setting $self->{class_number}% as uncertain"), }; if ($self->{has_pop}) { - $h->{popEVE_SCORE} = "Score from popEVE"; - $h->{popEVE_EVE} = "Raw EVE score (popEVE file)"; - $h->{popEVE_ESM1v} = "Raw ESM1v score (popEVE file)"; - $h->{popEVE_pop_adjusted_EVE} = "Population-adjusted EVE"; - $h->{popEVE_pop_adjusted_ESM1v} = "Population-adjusted ESM1v"; - $h->{popEVE_gap_frequency} = "Gap frequency"; - $h->{popEVE_gene} = "Gene symbol"; - $h->{popEVE_protein} = "Protein accession"; - $h->{popEVE_mutant} = "Protein-level change"; + $h->{popEVE_SCORE} = ($pop_desc{popEVE} || "Score from popEVE"); + $h->{popEVE_EVE} = ($pop_desc{EVE} || "Raw EVE score (popEVE file)"); + $h->{popEVE_ESM1v} = ($pop_desc{ESM1v} || "Raw ESM1v score (popEVE file)"); + $h->{popEVE_pop_adjusted_EVE} = ($pop_desc{'pop-adjusted_EVE'} || $pop_desc{'pop_adjusted_EVE'} || "Population-adjusted EVE"); + $h->{popEVE_pop_adjusted_ESM1v} = ($pop_desc{'pop-adjusted_ESM1v'}|| $pop_desc{'pop_adjusted_ESM1v'} || "Population-adjusted ESM1v"); + $h->{popEVE_gap_frequency} = ($pop_desc{gap_frequency} || "Gap frequency"); + $h->{popEVE_gene} = ($pop_desc{gene} || "Gene symbol"); + $h->{popEVE_protein} = ($pop_desc{protein} || "Protein accession"); + $h->{popEVE_mutant} = ($pop_desc{mutant} || "Protein-level change"); } return $h; @@ -186,7 +230,7 @@ sub run { # MERGE instead of returning immediately # merge results from every matching record within the window, instead of returning on the first match - # This allows EVE (allele in codon format: XXX) and popEVE (allele in SNV format: X) to both annotate the same input variant + # this allows EVE (allele in codon format: XXX) and popEVE (allele in SNV format: X) to both annotate the same input variant @out{ keys %{ $variant->{result} } } = values %{ $variant->{result} }; } From 67e6bcbaf66eea0c0b2e2e8237407672950003d9 Mon Sep 17 00:00:00 2001 From: Aine Date: Fri, 14 Nov 2025 10:31:10 +0000 Subject: [PATCH 3/7] docs: add download/prep instructions for popEVE to header --- EVE.pm | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/EVE.pm b/EVE.pm index 8e248fa0..50056b14 100644 --- a/EVE.pm +++ b/EVE.pm @@ -43,11 +43,9 @@ limitations under the License. Please cite EVE publication alongside Ensembl VEP if you use this resource: https://www.nature.com/articles/s41586-021-04043-8 -################################################### -# Bash script to merge all VCFs from EVE dataset. # -################################################### - -### BEGIN +######################################################################## +# Get and prepare EVE data: script to merge all VCFs from EVE dataset. # +######################################################################## # EVE input file can be downloaded from https://evemodel.org/api/proteins/bulk/download/ # Input: VCF files by protein (vcf_files_missense_mutations inside zip folder) @@ -76,7 +74,16 @@ bgzip ${OUTPUT_FOLDER}/${OUTPUT_NAME}; # If not installed, use: sudo apt install tabix tabix ${OUTPUT_FOLDER}/${OUTPUT_NAME}.gz; -### END +######################################################################## +# Get and prepare popEVE data # +######################################################################## + +# popEVE input file can be downloaded from https://data.evemodel.org/popeve/v1.1/downloads/grch38_popEVE_ukbb_20250715.vcf.gz +# Input: popEVE scores aligned to GRCh38, one file +# Output: Compressed VCF file (vcf.gz) + index file (.tbi) + +wget https://data.evemodel.org/popeve/v1.1/downloads/grch38_popEVE_ukbb_20250715.vcf.gz +tabix grch38_popEVE_ukbb_20250715.vcf.gz =cut package EVE; From 718b4b7c3fd2cd14e6d8b0100046c674147e8d8c Mon Sep 17 00:00:00 2001 From: Aine Date: Wed, 7 Jan 2026 13:38:53 +0000 Subject: [PATCH 4/7] fix: hard-code headers, ensure headers only included for dataset used --- EVE.pm | 70 ++++++++++++++-------------------------------------------- 1 file changed, 16 insertions(+), 54 deletions(-) diff --git a/EVE.pm b/EVE.pm index 50056b14..fea48a04 100644 --- a/EVE.pm +++ b/EVE.pm @@ -95,7 +95,6 @@ use Bio::EnsEMBL::Variation::Utils::Sequence qw(get_matched_variant_alleles); use Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin; use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin); -use Bio::DB::HTS::Tabix; sub new { my $class = shift; @@ -144,67 +143,30 @@ sub feature_types { return ['Transcript']; } -sub _vcf_info_descriptions { - my ($file) = @_; - my %desc; - - return \%desc unless $file; - - eval { - my $tbx = Bio::DB::HTS::Tabix->new( filename => $file ); - my $header = $tbx->header; - - foreach my $line (split /\n/, $header) { - next unless $line =~ /^##INFO<(.+)>/ || $line =~ /^##INFO=<(.+)>/; - my $body = $1; - my ($id) = $body =~ /\bID=([^,>]+)/; - my ($d) = $body =~ /Description="([^"]*)"/; - - $desc{$id} = $d if defined $id && defined $d; - } - }; - # if tabix/header parsing fails, return an empty hash and let get_header_info use defaults - return \%desc; -} - sub get_header_info { my $self = shift; - # try to read INFO descriptions from the EVE and popEVE VCF headers - my %eve_desc; - my %pop_desc; - - if ($self->{has_eve} && $self->{eve_file}) { - my $h = _vcf_info_descriptions($self->{eve_file}); - %eve_desc = %{$h} if $h; - } + my %h; - if ($self->{has_pop} && $self->{pop_file}) { - my $h = _vcf_info_descriptions($self->{pop_file}); - %pop_desc = %{$h} if $h; + if ($self->{has_eve}) { + my $class_number = $self->{class_number}; + $h{EVE_SCORE} = "Score from EVE model"; + $h{EVE_CLASS} = "Classification (Benign, Uncertain, or Pathogenic) when setting ${class_number}% as uncertain"; } - my $class_key = "Class" . $self->{class_number}; - - # prefer INFO descriptions pulled from VCF but fall back to hard-coded descriptions - my $h = { - EVE_SCORE => ($eve_desc{EVE} || "Score from EVE model"), - EVE_CLASS => ($eve_desc{$class_key} || "Classification (Benign, Uncertain, or Pathogenic) when setting $self->{class_number}% as uncertain"), - }; - if ($self->{has_pop}) { - $h->{popEVE_SCORE} = ($pop_desc{popEVE} || "Score from popEVE"); - $h->{popEVE_EVE} = ($pop_desc{EVE} || "Raw EVE score (popEVE file)"); - $h->{popEVE_ESM1v} = ($pop_desc{ESM1v} || "Raw ESM1v score (popEVE file)"); - $h->{popEVE_pop_adjusted_EVE} = ($pop_desc{'pop-adjusted_EVE'} || $pop_desc{'pop_adjusted_EVE'} || "Population-adjusted EVE"); - $h->{popEVE_pop_adjusted_ESM1v} = ($pop_desc{'pop-adjusted_ESM1v'}|| $pop_desc{'pop_adjusted_ESM1v'} || "Population-adjusted ESM1v"); - $h->{popEVE_gap_frequency} = ($pop_desc{gap_frequency} || "Gap frequency"); - $h->{popEVE_gene} = ($pop_desc{gene} || "Gene symbol"); - $h->{popEVE_protein} = ($pop_desc{protein} || "Protein accession"); - $h->{popEVE_mutant} = ($pop_desc{mutant} || "Protein-level change"); + $h{popEVE_SCORE} = "Score from popEVE model"; + $h{popEVE_EVE} = "Raw EVE model score (unsupervised variant effect prediction)"; + $h{popEVE_ESM1v} = "Raw ESM1v model score (log-likelihood ratio from protein language model)"; + $h{popEVE_pop_adjusted_EVE} = "EVE score adjusted for population variation using the popEVE framework"; + $h{popEVE_pop_adjusted_ESM1v} = "ESM1v log-likelihood ratio adjusted for population variation using the popEVE framework"; + $h{popEVE_gap_frequency} = "Fraction of sequences with a gap at this alignment position in the MSA used for model inference - filter anything above 0.5"; + $h{popEVE_gene} = "Gene symbol corresponding to the variant"; + $h{popEVE_protein} = "RefSeq identifier associated with the variant"; + $h{popEVE_mutant} = "Protein-level variant in [WILDTYPE_AA][AA_POSITION][VARIANT_AA] format (e.g. A123T)"; } - return $h; + return \%h; } sub run { @@ -308,4 +270,4 @@ sub get_end { return defined $v->{end} ? $v->{end} : ($v->{start} + (length($v->{ref} // '') ? length($v->{ref}) - 1 : 0)); } -1; \ No newline at end of file +1; From bebe32f0b70df8950b68f732ff5237120a821430 Mon Sep 17 00:00:00 2001 From: Aine Date: Wed, 7 Jan 2026 13:42:55 +0000 Subject: [PATCH 5/7] style: revert formatting of get_matched_variant_alleles --- EVE.pm | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/EVE.pm b/EVE.pm index fea48a04..9c5eb07a 100644 --- a/EVE.pm +++ b/EVE.pm @@ -192,8 +192,17 @@ sub run { foreach my $variant (@data) { my $matches = get_matched_variant_alleles( - { ref => $ref_allele, alts => $alt_alleles, pos => $vf->{start}, strand => $vf->strand }, - { ref => $variant->{ref}, alts => [ $variant->{alt} ], pos => $variant->{start} } + { + ref => $ref_allele, + alts => $alt_alleles, + pos => $vf->{start}, + strand => $vf->strand + }, + { + ref => $variant->{ref}, + alts => [$variant->{alt}], + pos => $variant->{start}, + } ); next unless @$matches; From 0ee1df3b97b8bf6c61799f869e09a8f13d736802 Mon Sep 17 00:00:00 2001 From: Aine Date: Wed, 7 Jan 2026 13:46:55 +0000 Subject: [PATCH 6/7] docs: add run example for popeve --- EVE.pm | 2 ++ 1 file changed, 2 insertions(+) diff --git a/EVE.pm b/EVE.pm index 9c5eb07a..7b7cc58d 100644 --- a/EVE.pm +++ b/EVE.pm @@ -30,6 +30,8 @@ limitations under the License. cp EVE.pm ${HOME}/.vep/Plugins ./vep -i variations.vcf --plugin EVE,file=/path/to/eve/data.vcf.gz # By default, Class75 is used. ./vep -i variations.vcf --plugin EVE,file=/path/to/eve/data.vcf.gz,class_number=60 + ./vep -i variations.vcf --plugin EVE,file=/path/to/eve/data.vcf.gz,popeve_file=/path/to/popeve/data.vcf.gz + ./vep -i variations.vcf --plugin EVE,popeve_file=/path/to/popeve/data.vcf.gz =head1 DESCRIPTION From 2981314a4a987bcda18540573713ab736eca2c07 Mon Sep 17 00:00:00 2001 From: Aine Date: Wed, 7 Jan 2026 13:49:43 +0000 Subject: [PATCH 7/7] docs: add run example for popeve --- EVE.pm | 1 + 1 file changed, 1 insertion(+) diff --git a/EVE.pm b/EVE.pm index 7b7cc58d..058c43e8 100644 --- a/EVE.pm +++ b/EVE.pm @@ -32,6 +32,7 @@ limitations under the License. ./vep -i variations.vcf --plugin EVE,file=/path/to/eve/data.vcf.gz,class_number=60 ./vep -i variations.vcf --plugin EVE,file=/path/to/eve/data.vcf.gz,popeve_file=/path/to/popeve/data.vcf.gz ./vep -i variations.vcf --plugin EVE,popeve_file=/path/to/popeve/data.vcf.gz + ./vep -i variations.vcf --plugin EVE,file=/path/to/eve/data.vcf.gz,class_number=60,popeve_file=/path/to/popeve/data.vcf.gz =head1 DESCRIPTION