diff --git a/dist/data/cache/gene-structures/homo-sapiens-gene-structures.tsv.gz b/dist/data/cache/gene-structures/homo-sapiens-gene-structures.tsv.gz index e5f6abcd..eb64749f 100644 Binary files a/dist/data/cache/gene-structures/homo-sapiens-gene-structures.tsv.gz and b/dist/data/cache/gene-structures/homo-sapiens-gene-structures.tsv.gz differ diff --git a/dist/data/cache/variants/homo-sapiens-variants.tsv.gz b/dist/data/cache/variants/homo-sapiens-variants.tsv.gz new file mode 100644 index 00000000..3fbf482f Binary files /dev/null and b/dist/data/cache/variants/homo-sapiens-variants.tsv.gz differ diff --git a/dist/data/cache/variants/homo-sapiens-variants.tsv.li.gz b/dist/data/cache/variants/homo-sapiens-variants.tsv.li.gz new file mode 100644 index 00000000..bad39a8a Binary files /dev/null and b/dist/data/cache/variants/homo-sapiens-variants.tsv.li.gz differ diff --git a/karma.conf.js b/karma.conf.js index 09d61a4c..3b4067c7 100644 --- a/karma.conf.js +++ b/karma.conf.js @@ -16,10 +16,10 @@ module.exports = function(config) { // list of files / patterns to load in the browser files: [ 'src/js/index.js', - 'test/offline/**.test.js', - 'test/online/**.test.js', + // 'test/offline/**.test.js', + // 'test/online/**.test.js', // 'test/online/related-genes.test.js', - // 'test/offline/gene-structure.test.js', + 'test/offline/gene-structure.test.js', // 'test/offline/tissue.test.js', {pattern: 'dist/data/**', watched: false, included: false, served: true, nocache: false} ], diff --git a/scripts/python/cache/clinvar_cache.py b/scripts/python/cache/clinvar_cache.py deleted file mode 100644 index db878124..00000000 --- a/scripts/python/cache/clinvar_cache.py +++ /dev/null @@ -1,140 +0,0 @@ -"""Cache data on variants related to human health, from NCBI ClinVar - -Example: -python clinvar_cache.py -""" - -import csv -import json -import gzip - -clinical_concerns = ['Likely_pathogenic', 'Pathogenic/Likely_pathogenic', 'Pathogenic'] -robust_review_statuses = [ - 'criteria_provided,_multiple_submitters,_no_conflicts', - 'reviewed_by_expert_panel', - 'practice_guideline' -] - -disease_ids_and_names = [] -variant_types = [] -molecular_consequences = [] - -def get_is_relevant(fields): - is_clinical_concern = False - is_robustly_reviewed = False - for field in fields: - [name, value] = field.split('=') - - if name == 'CLNSIG' and value in clinical_concerns: - is_clinical_concern = True - - if name == 'CLNREVSTAT' and value in robust_review_statuses: - is_robustly_reviewed = True - - return is_clinical_concern and is_robustly_reviewed - -def trim_info_fields(fields): - slim_fields = [] - names_to_keep = ['CLNREVSTAT', 'CLNSIG', 'CLNVC', 'MC', 'ORIGIN', 'RS'] - disease_indexes = [] - disease_names = [] - disease_ids = [] - for field in fields: - [name, value] = field.split('=') - - if name == 'CLNDISDB': # "Clinical disease database" - entries = value.split('|') - for entry in entries: - full_values = entry.split(',') - has_mondo = False - for fv in full_values: - split_fv = fv.split(':') - db_name = split_fv[0] - db_value = split_fv[-1] - if db_name == 'MONDO': - disease_ids.append(db_value) - has_mondo = True - if not has_mondo: - disease_ids.append('-1') - - elif name == 'CLNDN': # "Clinical disease name" - disease_names = value.split('|') - - elif name == 'CLNVC': - if value not in variant_types: - variant_types.append(value) - variant_type = variant_types.index(value) - slim_fields.append(str(variant_type)) - - elif name == 'MC': - entries = value.split(',') - slim_mc = [] - for entry in entries: - if entry not in molecular_consequences: - molecular_consequences.append(entry) - molecular_consequence = molecular_consequences.index(entry) - slim_mc.append(str(molecular_consequence)) - slim_fields.append(','.join(slim_mc)) - - elif name in names_to_keep: - if name == 'CLNSIG': - value = clinical_concerns.index(value) - elif name == 'CLNREVSTAT': - value = robust_review_statuses.index(value) - slim_fields.append(str(value)) - - for (i, disease_id) in enumerate(disease_ids): - if disease_id == '-1': - continue - disease_name = disease_names[i] - disease_id_and_name = disease_id + '|' + disease_name - if disease_id_and_name not in disease_ids_and_names: - disease_ids_and_names.append(disease_id_and_name) - disease_index = disease_ids_and_names.index(disease_id_and_name) - disease_indexes.append(str(disease_index)) - - disease_indexes_string = ','.join(disease_indexes) - slim_fields.insert(0, disease_indexes_string) - - slim_info = '\t'.join(slim_fields) - return slim_info - -output_rows = [] - -# https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar_20241215.vcf.gz -# Source: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/ -with open('clinvar_20241215.vcf') as file: - reader = csv.reader(file, delimiter="\t") - for row in reader: - if row[0][0] == '#': - continue - # print('row', row) - info = row[7] - fields = info.split(';') - is_relevant = get_is_relevant(fields) - if not is_relevant: - continue - slim_info = trim_info_fields(fields) - del row[5:7] - row[5] = slim_info - - output_rows.append('\t'.join(row)) - -content = '\n'.join(output_rows) - -disease_map = json.dumps(disease_ids_and_names) -column_names = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'DISEASE_IDS', 'CLNREVSTAT', 'CLNSIG', 'CLNVC', 'MC', 'ORIGIN', 'RS'] -headers = '\n'.join([ - '# disease_mondo_ids_and_names = ' + disease_map, - '# variant_types = ' + str(variant_types), - '# molecular_consequences = ' + str(molecular_consequences), - '\t'.join(column_names) -]) -content = headers + '\n' + content - - -output_path = 'clinvar_priority_20241215.tsv' -with open(output_path, "w") as f: - f.write(content) -with gzip.open(f"{output_path}.gz", "wt") as f: - f.write(content) diff --git a/scripts/python/cache/gene_structure_cache.py b/scripts/python/cache/gene_structure_cache.py index 841cf182..2b4e3b62 100644 --- a/scripts/python/cache/gene_structure_cache.py +++ b/scripts/python/cache/gene_structure_cache.py @@ -262,6 +262,8 @@ def parse_mrna(raw_mrna, biotypes_list): def build_structures(structures_by_id): biotypes_list = list(biotypes.keys()) + prev_gene = '' + structures = [] for id in structures_by_id: structure_lists = structures_by_id[id] @@ -273,8 +275,19 @@ def build_structures(structures_by_id): for structure_list in structure_lists[1:]: subpart = parse_transcript_subpart(structure_list, mrna_start) - structure += [";".join(subpart) ] - + structure += [";".join(subpart)] + + # Set transcript start coordinate relative to most-upstream transcript + # This enables projecting genomic features (e.g. variants) onto + # transcript coordinates. It also enables viewing multiple transcripts + # in genomic coordinates, like typical genome browsers (Ensembl, IGV). + gene_name = structure[1].split('-')[0] + if gene_name != prev_gene: + gene_start = int(mrna_start) # Start of 1st transcript is gene start + prev_gene = gene_name + mrna_start_offset = str(int(mrna_start) - gene_start) + + structure.insert(2, mrna_start_offset) structures.append(structure) return structures @@ -288,8 +301,8 @@ def parse_structures(canonical_ids, gff_path, gff_url): Parts of a transcript that comprise "gene structure" here: * Exons: regions of gene not removed by RNA splicing - * 3'-UTR: Three prime untranslated region; start region - * 5'-UTR: Fix prime untranslated region; end region + * 5'-UTR: Fix prime untranslated region; start region (for +, end for -) + * 3'-UTR: Three prime untranslated region; end region (for +, start for -) (Introns are the regions between 3'- and 5'-UTRs that are not exons. These are implied in the structure, and not modeled explicitly.) diff --git a/scripts/python/cache/variant_cache.py b/scripts/python/cache/variant_cache.py new file mode 100644 index 00000000..23675f59 --- /dev/null +++ b/scripts/python/cache/variant_cache.py @@ -0,0 +1,346 @@ +"""Cache data on variants related to human health, from NCBI ClinVar + +Example: +python variant_cache.py +""" + +import csv +import gzip +import json +import sys + +# Avoid "field larger than field limit (131072)" +csv.field_size_limit(sys.maxsize) + + +clinical_concerns = [ + 'Likely_pathogenic', + 'Pathogenic/Likely_pathogenic', + 'Pathogenic' +] +robust_review_statuses = [ + 'criteria_provided,_multiple_submitters,_no_conflicts', + 'reviewed_by_expert_panel', + 'practice_guideline' +] + +disease_ids_and_names = [] +variant_types = [] +molecular_consequences = [] + +def get_is_relevant(fields): + is_clinical_concern = False + is_robustly_reviewed = False + for field in fields: + [name, value] = field.split('=') + + if name == 'CLNSIG' and value in clinical_concerns: + is_clinical_concern = True + + if name == 'CLNREVSTAT' and value in robust_review_statuses: + is_robustly_reviewed = True + + return is_clinical_concern and is_robustly_reviewed + +def trim_info_fields(fields): + slim_fields = [] + names_to_keep = [ + 'AF_EXAC', # Allele frequency, per ExAC Consortium + 'CLNREVSTAT', # Clinical review status + 'CLNSIG', # Clinical significance + 'CLNVC', # Variant type ("clinical variant class?") + 'MC', # Molecular consequence + 'ORIGIN', # Variant origin, e.g. germline, somatic + 'RS' # dbSNP RS ID + ] + disease_indexes = [] + disease_names = [] + disease_ids = [] + has_af_exac = False + has_mc = False + + for field in fields: + [name, value] = field.split('=') + + if name == 'CLNDISDB': # "Clinical disease database" + entries = value.split('|') + for entry in entries: + full_values = entry.split(',') + has_mondo = False + for fv in full_values: + split_fv = fv.split(':') + db_name = split_fv[0] + db_value = split_fv[-1] + if db_name == 'MONDO': + disease_ids.append(db_value) + has_mondo = True + if not has_mondo: + disease_ids.append('-1') + + elif name == 'CLNDN': # "Clinical disease name" + disease_names = value.split('|') + + elif name == 'CLNVC': + if value not in variant_types: + variant_types.append(value) + variant_type = variant_types.index(value) + slim_fields.append(str(variant_type)) + + elif name == 'MC': + has_mc = True + entries = value.split(',') + slim_mc = [] + for entry in entries: + if entry not in molecular_consequences: + molecular_consequences.append(entry) + molecular_consequence = molecular_consequences.index(entry) + slim_mc.append(str(molecular_consequence)) + slim_fields.append(','.join(slim_mc)) + + elif name in names_to_keep: + if name == 'AF_EXAC': + has_af_exac = True + if name == 'CLNSIG': + value = clinical_concerns.index(value) + elif name == 'CLNREVSTAT': + value = robust_review_statuses.index(value) + slim_fields.append(str(value)) + + if not has_af_exac: + slim_fields.insert(0, '') + + # Index disease names and IDs, and represent them compactly + for (i, disease_id) in enumerate(disease_ids): + if disease_id == '-1': + continue + disease_name = disease_names[i] + disease_id_and_name = disease_id + '|' + disease_name + if disease_id_and_name not in disease_ids_and_names: + disease_ids_and_names.append(disease_id_and_name) + disease_index = disease_ids_and_names.index(disease_id_and_name) + disease_indexes.append(str(disease_index)) + disease_indexes_string = ','.join(disease_indexes) + slim_fields.insert(0, disease_indexes_string) + + if not has_mc: + # Seen in a variant in gene PTPN11, + # https://www.ncbi.nlm.nih.gov/clinvar/variation/44604/ + slim_fields.insert(5, '') + + slim_info = '\t'.join(slim_fields) + return slim_info + + +def write_gene_byte_index(filepath): + """Write byte-offset index file of variants for a gene + """ + variant_indexes_by_gene = {} + variants_by_chromosome = {} + genes_path = '../../../dist/data/cache/genes/homo-sapiens-genes.tsv' + + variant_line_offset = 0 + print('filepath', filepath) + with open(filepath) as file: + reader = csv.reader(file, delimiter="\t") + for (i, row) in enumerate(reader): + if row[0][0] == '#': + variant_line_offset += 1 + continue + variant = row + chromosome = variant[0] + bp_position = int(variant[1]) # base-pair position of start of variant + if chromosome not in variants_by_chromosome: + variants_by_chromosome[chromosome] = [] + variants_by_chromosome[chromosome].append([bp_position, i]) + + # print('variants_by_chromosome["1"][0]', variants_by_chromosome["1"][0]) + + with gzip.open(f"{genes_path}.gz", "rt") as file: + reader = csv.reader(file, delimiter="\t") + for row in reader: + if row[0][0] == '#': + continue + + # Gene data + chromosome = row[0] + start = int(row[1]) + length = int(row[2]) + gene = row[4] + # print('gene', gene) + + if chromosome == 'Y': + # ClinVar does not represent variants in chromosome Y by default; + # see "papu" resources in ClinVar FTP for variants in chrY + # pseudoautosomal region (PAR). + continue + + if chromosome not in variants_by_chromosome: + # ClinVar represents unlocalized sequences like GL000194.1 as + # chromosomes, but they're not of interest here. + continue + + # For each variant, check if it's in the gene. + # If so, add the index of the variant to a list + # of variant indexes for the gene + variants_in_chromosome = variants_by_chromosome[chromosome] + for (i, variant) in enumerate(variants_in_chromosome): + variant_bp_position = variant[0] + if variant_bp_position >= start and variant_bp_position <= start + length: + variant_line_index = variant[1] + if gene not in variant_indexes_by_gene: + variant_indexes_by_gene[gene] = [ + variant_line_index, # Used later for byte-range start + variant_line_index + 1 # Used later for byte-range end + ] + else: + # Set the 2nd element in the 2-element list to the line + # _after_ the current variant. This will let us define the + # end of the byte-range as the last byte of the last variant + # in the gene. + variant_indexes_by_gene[gene][1] = variant_line_index + 1 + + variant_gene_index_rows = [] + for gene in variant_indexes_by_gene: + row = variant_indexes_by_gene[gene] + row.insert(0, gene) + variant_gene_index_rows.append(row) + + with open('variant_gene_index_rows.tsv', 'w') as f: + f.write('\n'.join(['\t'.join([str(item) for item in row]) for row in variant_gene_index_rows])) + + header = "# gene\tbyte_offset\tbyte_length" + gene_variant_byte_index = [header] + variant_byte_index = [header] + genes = [] + + with open(filepath) as file: + lines = file.readlines() + for line in lines: + if line[0] == '#': + continue + gene = line.split('\t')[0] + genes.append(gene) + + # print('genes[0:3]', genes[0:3]) + # print('genes[-3:]', genes[-3:]) + + # Get the byte offset of each line (i.e., each variant) in variants.tsv + with open(filepath) as file: + char = file.read(1) + byte_offset = 0 + while char != "": # end of file + if byte_offset % 100_000 == 0: + print(f"Lines byte-indexed, so far: {len(variant_byte_index)}") + char = file.read(1) + if char == "\n": + variant_byte_index.append(byte_offset) + byte_offset += 1 + file.seek(byte_offset) + continue + variant_byte_index.append(byte_offset) + + print('variant_indexes_by_gene["TP53"]', variant_indexes_by_gene["TP53"]) + + # For each gene, get the byte offset and byte length (i.e. + # compact byte-range) of its variants in variants.tsv + for gene in variant_indexes_by_gene: + variant_line_indexes = variant_indexes_by_gene[gene] + byte_index_first_variant = variant_byte_index[variant_line_indexes[1]] + byte_index_last_variant = variant_byte_index[variant_line_indexes[2]] + bytes_length = byte_index_last_variant - byte_index_first_variant + byte_indexes = str(byte_index_first_variant) + '\t' + str(bytes_length) + gene_variant_byte_index.append(gene + '\t' + byte_indexes) + + output = "\n".join([str(o) for o in gene_variant_byte_index]) + + with open('variant-headers.tsv') as file: + keys = file.read() + + comments = "\n".join([ + '# Byte index of gene data in variants.tsv', + '#' , + '# This line index file reports the byte-offset and byte-range of ', + '# variants summarized in variants.tsv, for each human gene.', + '# The keys below map the compressed values of each data row in', + '# variants.tsv to more human-readable values.', + '#', + ]) + output = comments + '\n' + keys + '\n' + output + + with open(f"{filepath}.li", "w") as file: + file.write(output) + with gzip.open(f"{output_path}.li.gz", "wt") as f: + f.write(output) + print(f"Lines byte-indexed, total: {len(gene_variant_byte_index)}") + +def cache_variants(output_path): + + output_rows = [] + + # https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar_20241215.vcf.gz + # Source: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/ + with open('clinvar_20241215.vcf') as file: + reader = csv.reader(file, delimiter="\t") + for row in reader: + if row[0][0] == '#': + continue + # print('row', row) + info = row[7] + fields = info.split(';') + is_relevant = get_is_relevant(fields) + if not is_relevant: + continue + slim_info = trim_info_fields(fields) + del row[5:7] + row[5] = slim_info + + output_rows.append('\t'.join(row)) + + content = '\n'.join(output_rows) + + disease_map = json.dumps(disease_ids_and_names) + column_names = [ + '#CHROM', 'POS', 'ID', 'REF', 'ALT', + 'DISEASE_IDS', 'AF_EXAC', 'CLNREVSTAT', 'CLNSIG', 'CLNVC', 'MC', 'ORIGIN', 'RS' + ] + + keys = '\n'.join([ + '# Keys:', + '# disease_mondo_ids_and_names = ' + disease_map, + '# variant_types = ' + json.dumps(variant_types), + '# clinical_significances = ' + json.dumps(clinical_concerns), + '# clinical_review_statuses = ' + json.dumps(robust_review_statuses), + '# molecular_consequences = ' + json.dumps(molecular_consequences), + '# ', + ]) + + with open('variant-headers.tsv', "w") as f: + f.write(keys) + + headers = '\n'.join([ + '# Cache data on variants related to human health, from NCBI ClinVar', + '# Used by Gene Leads, a feature of Ideogram.js. This is a compact representation of data from:', + '# https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar_20241215.vcf.gz', + '# ', + '# It filters that data to include only the columns below, and only rows', + '# where the variant has a clinical significance of at least "Likely_pathogenic"', + '# and a review status of at least "criteria_provided,_multiple_submitters,_no_conflicts".', + '# ', + '# Values below are custom-compressed, with keys and indexes in:', + '# variants.tsv.li.gz', + '#', + '\t'.join(column_names) + ]) + content = headers + '\n' + content + + with open(output_path, "w") as f: + f.write(content) + with gzip.open(f"{output_path}.gz", "wt") as f: + f.write(content) + + print('Wrote output to: ' + output_path) + +output_path = 'variants.tsv' + +cache_variants(output_path) +write_gene_byte_index(output_path) diff --git a/scripts/python/lib.py b/scripts/python/lib.py index d08727e9..3de24fbe 100644 --- a/scripts/python/lib.py +++ b/scripts/python/lib.py @@ -62,8 +62,8 @@ def download(url, output_path, cache=0): response = urllib.request.urlopen(request, context=ctx) content = response.read().decode() - print('content') - print(content) + # print('content') + # print(content) with open(output_path, "w") as f: f.write(content) diff --git a/src/js/annotations/draw.js b/src/js/annotations/draw.js index fc9cb3c6..af668bfb 100644 --- a/src/js/annotations/draw.js +++ b/src/js/annotations/draw.js @@ -163,15 +163,15 @@ function writeTrackAnnots(chrAnnot, ideo) { return 'translate(' + d.px + ',' + y + ')'; } }) - .on('mouseover', function(event, d) {ideo.showAnnotTooltip(d, this);}) - .on('mouseout', function() {ideo.startHideAnnotTooltipTimeout();}) - .on('click', function(event, d) {ideo.onClickAnnot(d);}) .attr('fill', function(d) {return d.color;}); gElement .filter(function(d) { return d.shape === 'span' || d.shape == 'histo'; }) + .on('mouseover', function(event, d) {ideo.showAnnotTooltip(d, this);}) + .on('mouseout', function() {ideo.startHideAnnotTooltipTimeout();}) + .on('click', function(event, d) {ideo.onClickAnnot(d);}) .append('polygon') .attr('points', function(d) { var x1 = d.startPx; @@ -219,7 +219,10 @@ function writeTrackAnnots(chrAnnot, ideo) { return d.shape !== 'span' && d.shape !== 'histo'; }) .append('path') - .attr('d', function(d) {return determineShape(d, shapes);}); + .attr('d', function(d) {return determineShape(d, shapes);}) + .on('mouseover', function(event, d) {ideo.showAnnotTooltip(d, this);}) + .on('mouseout', function() {ideo.startHideAnnotTooltipTimeout();}) + .on('click', function(event, d) {ideo.onClickAnnot(d);}); } /** diff --git a/src/js/annotations/events.js b/src/js/annotations/events.js index 7fab2228..c482f2bd 100644 --- a/src/js/annotations/events.js +++ b/src/js/annotations/events.js @@ -109,8 +109,8 @@ function getContentAndYOffset(annot, includeLength=false) { /** * Optional callback, invoked before showing annotation tooltip */ -function onWillShowAnnotTooltip(event, context) { - call(this.onWillShowAnnotTooltipCallback, event, context); +async function onWillShowAnnotTooltip(event, context) { + await call(this.onWillShowAnnotTooltipCallback, event, context); } function onDidShowAnnotTooltip() { @@ -139,15 +139,16 @@ function onClickAnnot(annot) { * @param annot {Object} Processed annotation object * @param context {Object} "This" of the caller -- an SVG path DOM object */ -function showAnnotTooltip(annot, context) { - var content, yOffset, tooltip, ideo = this; +async function showAnnotTooltip(annot, context) { + var content, yOffset, tooltip, + ideo = this; if (ideo.config.showAnnotTooltip === false) return; clearTimeout(ideo.hideAnnotTooltipTimeout); if (ideo.onWillShowAnnotTooltipCallback) { - annot = ideo.onWillShowAnnotTooltipCallback(annot, context); + annot = await ideo.onWillShowAnnotTooltipCallback(annot, context); } // Enable onWillShowAnnotTooltipCallback to cancel showing tooltip diff --git a/src/js/init/caches/cache-lib.js b/src/js/init/caches/cache-lib.js index 491224e3..d95f1868 100644 --- a/src/js/init/caches/cache-lib.js +++ b/src/js/init/caches/cache-lib.js @@ -26,6 +26,21 @@ async function fetchByteRangesByName(url) { return byteRangesByName; } +async function fetchVariantByteRangesByName(text) { + const byteRangesByName = {}; + + const lines = text.split('\n'); + for (let i = 0; i < lines.length - 1; i++) { + const [gene, rawOffset, rawLength] = lines[i].split('\t'); + if (gene[0] === '#') continue; + const offset = parseInt(rawOffset); + const offsetEnd = offset + parseInt(rawLength); + byteRangesByName[gene] = [offset, offsetEnd]; + } + + return byteRangesByName; +} + /** Reports if current organism has a gene structure cache */ export function supportsCache(orgName, cacheName) { const metadata = parseOrgMetadata(orgName); @@ -33,7 +48,7 @@ export function supportsCache(orgName, cacheName) { return metadata[cacheProp] && metadata[cacheProp] === true; } -/** Get URL for gene structure cache file */ +/** Get URL for given type of cache file (e.g. gene, tissue, variant) */ export function getCacheUrl(orgName, cacheDir, cacheType, fileType='tsv') { const organism = slug(orgName); if (!cacheDir) { @@ -107,6 +122,7 @@ export async function cacheFetch(url) { } export async function cacheRangeFetch(url, byteRange) { + // console.log('url', url) url = url.replace('.gz', ''); // +/- 1 to trim newlines @@ -157,6 +173,11 @@ export async function fetchAndParse( if (cacheUrl.includes('tissue')) { const byteRangesByName = await fetchByteRangesByName(cacheUrl); parsedCache = parseFn(data, perfTimes, byteRangesByName); + } else if (cacheUrl.includes('variant')) { + const variantsTsvPath = cacheUrl.replace('.li', ''); + await cacheFetch(variantsTsvPath); + const byteRangesByName = await fetchVariantByteRangesByName(data); + parsedCache = parseFn(data, perfTimes, byteRangesByName); } else { parsedCache = parseFn(data, perfTimes, orgName); } diff --git a/src/js/init/caches/cache.js b/src/js/init/caches/cache.js index ed3b955f..2476837e 100644 --- a/src/js/init/caches/cache.js +++ b/src/js/init/caches/cache.js @@ -24,6 +24,7 @@ import {parseGeneStructureCache} from './gene-structure-cache-worker'; import {parseProteinCache} from './protein-cache-worker'; import {parseSynonymCache} from './synonym-cache-worker'; import {parseTissueCache} from './tissue-cache-worker'; +import {parseVariantCacheIndex} from './variant-cache-worker'; /** * Populates in-memory content caches from on-disk service worker (SW) caches. @@ -63,12 +64,13 @@ export async function initCaches(ideo) { cacheFactory('paralog', organism, ideo, cacheDir), cacheFactory('interaction', organism, ideo, cacheDir), cacheFactory('synonym', organism, ideo, cacheDir), - cacheFactory('tissue', organism, ideo, cacheDir) ]); if (config.showGeneStructureInTooltip) { cacheFactory('geneStructure', organism, ideo, cacheDir); cacheFactory('protein', organism, ideo, cacheDir); + cacheFactory('tissue', organism, ideo, cacheDir); + cacheFactory('variant', organism, ideo, cacheDir); } return cachePromise; @@ -82,6 +84,7 @@ export async function initCaches(ideo) { cacheFactory('protein', organism, ideo, cacheDir); cacheFactory('synonym', organism, ideo, cacheDir); cacheFactory('tissue', organism, ideo, cacheDir); + cacheFactory('variant', organism, ideo, cacheDir); } } } @@ -128,6 +131,12 @@ const allCacheProps = { fn: setTissueCache, // worker: tissueCacheWorker // Uncomment when workers work parseFn: parseTissueCache // Remove when workers work + }, + variant: { + metadata: 'Variant', dir: 'variants', + fn: setVariantCache, extension: 'tsv.li', + // worker: variantCacheWorker // Uncomment when workers work + parseFn: parseVariantCacheIndex // Remove when workers work } }; @@ -178,6 +187,10 @@ function setTissueCache(parsedCache, ideo) { ideo.tissueCache = parsedCache; } +function setVariantCache(parsedCache, ideo) { + ideo.variantCache = parsedCache; +} + async function cacheFactory(cacheName, orgName, ideo, cacheDir=null) { const cacheProps = allCacheProps[cacheName]; diff --git a/src/js/init/caches/gene-structure-cache-worker.js b/src/js/init/caches/gene-structure-cache-worker.js index 7ba4bd04..8392b856 100644 --- a/src/js/init/caches/gene-structure-cache-worker.js +++ b/src/js/init/caches/gene-structure-cache-worker.js @@ -59,12 +59,13 @@ export function parseGeneStructureCache(rawTsv, perfTimes) { const splitLine = line.trim().split(/\t/); const [ - name, biotypeCompressed, strand - ] = splitLine.slice(0, 3); + name, rawStartOffset, biotypeCompressed, strand + ] = splitLine.slice(0, 4); + const startOffset = parseInt(rawStartOffset); const gene = name.split('-').slice(0, -1).join('-'); - const rawSubparts = splitLine.slice(3); + const rawSubparts = splitLine.slice(4); const subparts = deserializeSubparts(rawSubparts, subpartKeys); const biotype = biotypeKeys[biotypeCompressed]; @@ -72,6 +73,7 @@ export function parseGeneStructureCache(rawTsv, perfTimes) { // E.g. ACE2-201, protein_coding, -, const feature = { name, + startOffset, biotype, strand, subparts diff --git a/src/js/init/caches/variant-cache-worker.js b/src/js/init/caches/variant-cache-worker.js new file mode 100644 index 00000000..dd8157bc --- /dev/null +++ b/src/js/init/caches/variant-cache-worker.js @@ -0,0 +1,278 @@ +import { + fetchAndParse, getFullId, inspectWorker, + cacheFetch, cacheRangeFetch, getCacheUrl +} from './cache-lib'; + +const variantOriginMap = { + '0': 'unknown', + '1': 'germline', + '2': 'somatic', + '4': 'inherited', + '8': 'paternal', + '16': 'maternal', + '32': 'de-novo', + '64': 'biparental', + '128': 'uniparental', + '256': 'not-tested', + '512': 'tested-inconclusive', + '1073741824': 'other' +}; + +function parseDiseaseElement(diseaseArray, i) { + const [rawId, rawName] = diseaseArray[i].split('|'); + const id = 'MONDO:' + rawId; + const name = rawName.replaceAll('_', ' '); + return [id, name]; +} + +/** Transform custom-compressed array into disease name-by-id map */ +function parseDiseaseKey(line) { + const diseaseNamesById = {}; + + // E.g. ["0012345|Disease_name_1", "0000042|Particular_condition", ...] + const diseaseArray = getArray(line); + + for (let i = 0; i < diseaseArray.length; i++) { + const [id, name] = parseDiseaseElement(diseaseArray, i); + diseaseNamesById[id] = name; + } + + return [diseaseArray, diseaseNamesById]; +} + +/** Transform custom-compressed molecular consequence array */ +function parseMolecularConsequenceKey(line) { + const molecularConsequenceById = {}; + + // E.g. ['SO:0001574|splice_acceptor_variant', 'SO:0001234|foo', ...] + const molecularConsequenceArray = getArray(line); + + for (let i = 0; i < molecularConsequenceArray.length; i++) { + const [id, rawName] = molecularConsequenceArray[i].split('|'); + const name = rawName.replaceAll('_', ' '); + molecularConsequenceById[id] = name; + } + + return [molecularConsequenceArray, molecularConsequenceById]; +} + +/** Get disease names and IDs for this variant, given raw compressed values */ +function parseDiseases(rawDiseases, diseaseArray) { + const diseases = []; + + if (rawDiseases === '') { + return [{id: '', disease: 'Not provided'}]; + } + + // The raw "diseases" value in variants.tsv is a list of integer index values, + // which map to the id-name values in disease_mondo_ids_and_names from the + // variants.tsv.li file. + const diseaseIndexValues = rawDiseases.split(','); + + for (let i = 0; i < diseaseIndexValues.length; i++) { + const diseaseIndexValue = parseInt(diseaseIndexValues[i]); + const [id, name] = parseDiseaseElement(diseaseArray, diseaseIndexValue); + const disease = {id, name}; + diseases.push(disease); + } + + return diseases; +} + +/** Like parseDiseases, but for molecular consequences */ +function parseMolecularConsequences(rawMolecularConsequences, mcArray) { + const molecularConsequences = []; + + const mcIndexValues = rawMolecularConsequences.split(','); + + for (let i = 0; i < mcIndexValues.length; i++) { + const mcIndexValue = parseInt(mcIndexValues[i]); + const [id, name] = mcArray[mcIndexValue].split('|'); + const mc = {id, name}; + molecularConsequences.push(mc); + } + + return molecularConsequences; +} + +function parseKey(index, key) { + return key[index].replaceAll('_', ' '); +} + +/** + * Parse a line in variants.tsv, return a useful variant object + * + * The line has the format: + * #CHROM POS ID REF ALT DISEASE_IDS CLNREVSTAT CLNSIG CLNVC MC ORIGIN RS + */ +function parseVariant(line, variantCache) { + + const [ + chromosome, + rawPosition, + rawClinvarId, + refAllele, // Allele in the reference genome + altAllele, // Allele that makes this a "variant" + rawDiseases, + rawAfExac, + rawReviewStatus, + rawClinicalSignificance, + rawVariantType, + rawMolecularConsequences, + rawOrigin, + rsNumber + ] = line.split('\t') + + const position = parseInt(rawPosition); + const afExac = rawAfExac === '' ? null : parseFloat(rawAfExac); + + const keys = variantCache.keys; + + // const zeros = '0'.repeat(9 - rawClinvarId.length); + // const clinvarVariantId = 'VCV' + zeros + rawClinvarId; + const clinvarVariantId = rawClinvarId; + const diseases = parseDiseases(rawDiseases, keys.diseaseArray); + const reviewStatus = parseKey(rawReviewStatus, keys.reviewStatuses); + const clinicalSignificance = parseKey( + rawClinicalSignificance, keys.clinicalSignificances + ); + const variantType = parseKey(rawVariantType, keys.variantTypes); + let molecularConsequences = null; + if (rawMolecularConsequences !== '') { + molecularConsequences = parseMolecularConsequences( + rawMolecularConsequences, keys.molecularConsequenceArray + ); + } + const dbSnpId = rsNumber ? 'rs' + rsNumber : null; + + const origin = variantOriginMap[rawOrigin]; + + const variant = { + chromosome, position, + afExac, + clinvarVariantId, + refAllele, + altAllele, + diseases, + reviewStatus, + rawReviewStatus: parseInt(rawReviewStatus), + clinicalSignificance, + rawClinicalSignificance: parseInt(rawClinicalSignificance), + variantType, + molecularConsequences, + dbSnpId, + origin, + rawOrigin: parseInt(rawOrigin) + }; + + return variant; +} + +async function getVariants(gene, ideo) { + const variants = []; + + const cache = ideo.variantCache; + const byteRange = cache.byteRangesByName[gene]; + + // Easier debuggability + if (!ideo.cacheRangeFetch) ideo.cacheRangeFetch = cacheRangeFetch; + + if (!byteRange) return []; + + const config = ideo.config; + let cacheDir = null; + if (config.cacheDir) cacheDir = config.cacheDir; + const cacheType = 'variants'; + const extension = 'tsv'; + + const orgName = 'homo-sapiens'; + const cacheUrl = getCacheUrl(orgName, cacheDir, cacheType, extension); + + const geneLocus = ideo.geneCache.lociByName[gene]; + + // Get variant data only for the requested gene + const data = await cacheRangeFetch(cacheUrl, byteRange); + const lines = data.split('\n'); + + for (let i = 0; i < lines.length; i++) { + const line = lines[i]; + const variant = parseVariant(line, cache); + variant.positionRelative = variant.position - geneLocus[1]; + variants.push(variant); + } + + return variants; +} + +function getArray(line) { + // console.log('line', line) + const value = line.split('= ')[1]; + return JSON.parse(value); +} + +/** Parse a tissue cache TSV file */ +export function parseVariantCacheIndex(rawTsv, perfTimes, byteRangesByName) { + let diseaseArray; + let diseaseNamesById; // Per MONDO ontology + let variantTypes; // e.g. "single_nucleotide_variant", "Indel" + let clinicalSignificances; + let reviewStatuses; + let molecularConsequenceArray; + let molecularConsequenceById; + + let t0 = performance.now(); + const lines = rawTsv.split(/\r\n|\n/); + perfTimes.rawTsvSplit = Math.round(performance.now() - t0); + + t0 = performance.now(); + for (let i = 0; i < lines.length; i++) { + const line = lines[i]; + if (line === '') continue; // Skip empty lines + if (line[0] === '#') { + // Parse keys + const firstChars = line.slice(0, 50); + if (firstChars.includes('disease_mondo_ids_and_names')) { + [diseaseArray, diseaseNamesById] = parseDiseaseKey(line); + } else if (firstChars.includes('variant_types')) { + variantTypes = getArray(line); + } else if (firstChars.includes('clinical_significances')) { + clinicalSignificances = getArray(line); + } else if (firstChars.includes('clinical_review_statuses')) { + reviewStatuses = getArray(line); + } else if (firstChars.includes('molecular_consequences')) { + [molecularConsequenceArray, molecularConsequenceById] = + parseMolecularConsequenceKey(line); + } + continue; + } + + // Only process comments, because + // other variant index lines (byteRangesByName) are processed upstream + break; + }; + const t1 = performance.now(); + perfTimes.parseCacheLoop = Math.round(t1 - t0); + + return { + getVariants, + byteRangesByName, + keys: { + diseaseArray, + diseaseNamesById, + variantTypes, + clinicalSignificances, + reviewStatuses, + molecularConsequenceArray, + molecularConsequenceById + } + }; +} + +// Uncomment when workers work outside localhost +// addEventListener('message', async event => { +// console.time('tissueCacheWorker'); +// const [cacheUrl, perfTimes, debug] = event.data; +// const result = await fetchAndParse(cacheUrl, perfTimes, parseCache); +// postMessage(result); +// if (debug) inspectWorker('paralog', result[0]); +// }); diff --git a/src/js/init/organism-metadata.js b/src/js/init/organism-metadata.js index b0dcc69c..78cf3a26 100644 --- a/src/js/init/organism-metadata.js +++ b/src/js/init/organism-metadata.js @@ -14,7 +14,8 @@ var organismMetadata = { hasGeneStructureCache: true, hasProteinCache: true, hasSynonymCache: true, - hasTissueCache: true + hasTissueCache: true, + hasVariantCache: true }, 10090: { commonName: 'Mouse', diff --git a/src/js/init/write-container.js b/src/js/init/write-container.js index a461b46d..6b420fcf 100644 --- a/src/js/init/write-container.js +++ b/src/js/init/write-container.js @@ -50,7 +50,7 @@ function getContainerSvgClass(ideo) { /** Hide tooltip upon pressing "esc" on keyboard */ function handleEscape(event) { if (event.keyCode === 27) { // "Escape" key pressed - const tooltip = document.querySelector('#_ideogramTooltip'); + const tooltip = document.querySelector('._ideogramTooltip'); if (tooltip) { tooltip.style.opacity = 0; } diff --git a/src/js/kit/gene-structure.js b/src/js/kit/gene-structure.js index c6781412..a7157ad7 100644 --- a/src/js/kit/gene-structure.js +++ b/src/js/kit/gene-structure.js @@ -5,6 +5,7 @@ import {tippyCss, tippyLightCss} from './tippy-styles'; import {d3} from '../lib'; import {getIcon} from '../annotations/legend'; import {getProtein, getHasTopology} from './protein'; +import {getVariantsSvg} from './variant'; const y = 5; @@ -107,12 +108,13 @@ function writeFooter(container) { } /** Write newly-selected gene structure diagram, header, and footer */ -function updateGeneStructure(ideo, offset=0) { +async function updateGeneStructure(ideo, offset=0) { const [structure, selectedIndex] = getSelectedStructure(ideo, offset); const isCanonical = (selectedIndex === 0); const menu = document.querySelector('#_ideoGeneStructureMenu'); menu.options[selectedIndex].selected = true; - const svg = getSvg(structure, ideo, ideo.spliceExons)[0]; + const svgResults = await getSvg(structure, ideo, ideo.spliceExons); + const svg = svgResults[0]; const container = document.querySelector('._ideoGeneStructureSvgContainer'); container.innerHTML = svg; updateHeader(ideo.spliceExons, isCanonical); @@ -167,7 +169,7 @@ function addMenuListeners(ideo) { const container = document.querySelector('._ideoGeneStructureContainer'); // Don't search this gene if clicking to expand the menu - container.addEventListener('click', (event) => { + container.addEventListener('click', async (event) => { if (event.target.id === menuId) { event.stopPropagation(); } @@ -182,7 +184,7 @@ function addMenuListeners(ideo) { const menuArrow = svgMaybe; const direction = menuArrow.getAttribute('data-dir'); const offset = direction === 'down' ? 1 : -1; - updateGeneStructure(ideo, offset); + await updateGeneStructure(ideo, offset); event.stopPropagation(); } }); @@ -217,7 +219,7 @@ function addSpliceToggleListeners(ideo) { if (!container) return; - toggler.addEventListener('change', (event) => { + toggler.addEventListener('change', async (event) => { toggleSplice(ideo); addHoverListeners(ideo); event.stopPropagation(); @@ -516,8 +518,8 @@ function addHoverListeners(ideo) { // Listen for change of selected option in transcript menu const tooltip = document.querySelector('._ideogramTooltip'); - tooltip.addEventListener('change', () => { - updateGeneStructure(ideo); + tooltip.addEventListener('change', async () => { + await updateGeneStructure(ideo); // Without this, selecting a new transcript will close the tooltip if // the selected