diff --git a/data-exploration/other/mappings-cleanup-investigation.ipynb b/data-exploration/other/mappings-cleanup-investigation.ipynb new file mode 100644 index 00000000..a67308f7 --- /dev/null +++ b/data-exploration/other/mappings-cleanup-investigation.ipynb @@ -0,0 +1,1245 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 161, + "id": "9f3f2e9a-803a-410e-a5d9-1506aebe4307", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import csv\n", + "\n", + "import matplotlib.pyplot as plt\n", + "from sankeyflow import Sankey" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "id": "9e9c8c8d-f51f-4116-aec6-64702577157d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from cmat.clinvar_xml_io import ClinVarDataset\n", + "from cmat.clinvar_xml_io.filtering import filter_by_submission\n", + "from cmat.output_generation.clinvar_to_evidence_strings import load_ontology_mapping\n", + "from cmat.trait_mapping.ols import is_current_and_in_ontology" + ] + }, + { + "cell_type": "markdown", + "id": "7b41f368-d7e1-488c-a2e9-2e3d7614b9fd", + "metadata": {}, + "source": [ + "## Investigations for cleaning up trait mappings\n", + "\n", + "1. [Mappings used in evidence](#Mappings-used-in-evidence)\n", + "2. [Origin of mappings used in evidence](#Origin-of-mappings-used-in-evidence)\n", + "3. [Preferred vs. all trait names](#Preferred-vs-all-trait-names)\n", + "4. 
def print_set(s, n=10):
    """Print at most ``n`` elements of ``s``, one per line.

    Helper function to peek at large sets without dumping them in full.

    Args:
        s: Any iterable. Elements are printed in iteration order (which is
            arbitrary for a set).
        n: Maximum number of elements to print. Values <= 0 print nothing.

    Returns:
        None. The elements are written to stdout.
    """
    for i, x in enumerate(s):
        # Check the limit *before* printing so exactly n elements are shown
        # (checking afterwards with `i > n` printed n + 1).
        if i >= n:
            break
        print(x)
For example:\n", + "\n", + "```\n", + "$ grep -i 'focal facial dermal dysplasia type iii' trait_names_to_ontology_mappings.tsv\n", + "focal facial dermal dysplasia type iii http://purl.obolibrary.org/obo/MONDO_0009203 focal facial dermal dysplasia type III\n", + "congenital ectodermal dysplasia of face http://www.orpha.net/ORDO/Orphanet_1807 Focal facial dermal dysplasia type III\n", + "focal facial dermal dysplasia 3, setleis type http://www.orpha.net/ORDO/Orphanet_1807 Focal facial dermal dysplasia type III\n", + "\n", + "$ grep -i 'focal facial dermal dysplasia type iii' batch-2025-12/evidence_strings/evidence_strings.json\n", + "\n", + "{...\"cohortPhenotypes\": [\"BITEMPORAL FORCEPS MARKS SYNDROME\", \"FFDD type 2\", \"FOCAL FACIAL DERMAL DYSPLASIA, TYPE II\", \"Focal facial dermal dysplasia 3\", \"Focal facial dermal dysplasia 3, Setleis type\", \"Focal facial dermal dysplasia type III\", \"SETLEIS SYNDROME\"],\n", + " \"diseaseFromSource\": \"Focal facial dermal dysplasia type III\",\n", + " \"diseaseFromSourceId\": \"C1744559\", \n", + " \"diseaseFromSourceMappedId\": \"Orphanet_1807\", ...}\n", + "{...\"cohortPhenotypes\": [\"BITEMPORAL FORCEPS MARKS SYNDROME\", \"FFDD type 2\", \"FOCAL FACIAL DERMAL DYSPLASIA, TYPE II\", \"Focal facial dermal dysplasia 3\", \"Focal facial dermal dysplasia 3, Setleis type\", \"Focal facial dermal dysplasia type III\", \"SETLEIS SYNDROME\"],\n", + " \"diseaseFromSource\": \"Focal facial dermal dysplasia type III\", \n", + " \"diseaseFromSourceId\": \"C1744559\",\n", + " \"diseaseFromSourceMappedId\": \"Orphanet_398166\", ...}\n", + "{...\"cohortPhenotypes\": [\"BITEMPORAL FORCEPS MARKS SYNDROME\", \"FFDD type 2\", \"FOCAL FACIAL DERMAL DYSPLASIA, TYPE II\", \"Focal facial dermal dysplasia 3\", \"Focal facial dermal dysplasia 3, Setleis type\", \"Focal facial dermal dysplasia type III\", \"SETLEIS SYNDROME\"],\n", + " \"diseaseFromSource\": \"Focal facial dermal dysplasia type III\",\n", + " \"diseaseFromSourceId\": 
\"C1744559\",\n", + " \"diseaseFromSourceMappedId\": \"MONDO_0009203\", ...}\n", + "\n", + "\n", + "# The other two mappings appear above, check the third\n", + "$ grep -i 'Orphanet_398166' trait_names_to_ontology_mappings.tsv\n", + "congenital ectodermal dysplasia of face http://www.orpha.net/ORDO/Orphanet_398166 Focal facial dermal dysplasia\n", + "focal facial dermal dysplasia 3, setleis type http://www.orpha.net/ORDO/Orphanet_398166 Focal facial dermal dysplasia\n", + "```\n", + "Note that both [Orphanet_1807](https://www.ebi.ac.uk/ols4/ontologies/efo/classes/http%253A%252F%252Fwww.orpha.net%252FORDO%252FOrphanet_1807) and [Orphanet_398166](https://www.ebi.ac.uk/ols4/ontologies/efo/classes/http%253A%252F%252Fwww.orpha.net%252FORDO%252FOrphanet_398166) are deprecated in EFO, which is what Issue 384 is about.\n", + "\n", + "This issue is a separate one from what we're investigating (though not using previous mappings would mitigate this as well). For the purposes of this investigation, the point is that these `(diseaseFromSource, diseaseFromSourceMappedId)` tuples are not actually the mappings we should be using for counts!" 
# Instead run an abridged version of the evidence string pipeline with only trait mappings
# Both inputs are resolved relative to the directory named by the BATCH_ROOT environment variable.
# NOTE(review): os.getenv returns None when BATCH_ROOT is unset, in which case os.path.join
# raises a TypeError - make sure the variable is exported before starting the kernel.
# ClinVar XML dump stored under the 2025-12 batch directory
clinvar_xml = os.path.join(os.getenv('BATCH_ROOT'), 'batch-2025-12/clinvar/clinvar.xml.gz')
# Trait name -> ontology mappings file stored under the 2025-10-02 manual curation directory
latest_mappings_file = os.path.join(os.getenv('BATCH_ROOT'), 'manual_curation/2025-10-02/trait_names_to_ontology_mappings.tsv')
# Collect the trait mappings that the evidence string pipeline would actually apply.
# Keys are (lowercased trait name, ontology ID) pairs; values are (full URI, is_preferred),
# where is_preferred is True if the trait name was the preferred name of a trait in at
# least one processed record.
mappings_in_evidence = {}

dataset = ClinVarDataset(clinvar_xml)
clinvar_total = 0
for clinvar_set in dataset.iter_cvs():
    clinvar_total += 1
    if clinvar_total % 100000 == 0:
        print(f'{clinvar_total} records processed')

    try:
        # Records skipped by evidence string pipeline
        if not filter_by_submission(clinvar_set):
            continue
        clinvar_record = clinvar_set.rcv

        if len(clinvar_record.clinical_classifications) > 1:
            continue
        if not clinvar_record.traits_with_valid_names:
            continue
        if not clinvar_record.valid_clinical_significances:
            continue
        if clinvar_record.measure is None:
            continue

        # Get EFO mappings
        for trait in clinvar_record.traits_with_valid_names:
            for trait_name in trait.all_names:
                for uri, label in latest_mappings.get(trait_name.lower(), []):
                    key = (trait_name.lower(), uri.split('/')[-1])
                    is_preferred = trait_name.lower() == trait.preferred_or_other_valid_name.lower()
                    # The same trait name can be the preferred name in one record and only an
                    # alternative name in another. Keep the flag sticky so the result does not
                    # depend on the order in which records are read (previously the last record
                    # seen silently overwrote an earlier True with False).
                    was_preferred = key in mappings_in_evidence and mappings_in_evidence[key][1]
                    # Store the full URI and whether the trait name is preferred or not
                    mappings_in_evidence[key] = (uri, is_preferred or was_preferred)

    except Exception as e:
        # Best effort: report the problematic record and carry on with the next one
        print(f'Problem getting mappings for {clinvar_set.rcv.accession}: {repr(e)}')
        continue

print(f'{clinvar_total} records processed')
# Check where these mappings come from: most recent automated mappings, most recent manual curation, or neither

# Index the automated mappings by (trait_name, ontology_id) so membership tests are cheap,
# and keep the full URI as the value so obsoleteness can be checked later
automated_mappings = {}
with open(os.path.join(os.getenv('BATCH_ROOT'), 'manual_curation/2025-10-02/automated_trait_mappings.tsv')) as tsv_file:
    for raw_line in tsv_file:
        # Each row must have exactly three tab-separated columns: name, URI, label
        name, uri, _label = raw_line.strip().split('\t')
        ontology_id = uri.rsplit('/', 1)[-1]
        automated_mappings[(name.lower(), ontology_id)] = uri
mappings_used_in_automated = set()
mappings_used_in_curated = set()
mappings_used_in_neither = set()  # these are "previous mappings" only
mappings_used_in_both = set()  # sanity check - this should never happen!

# The four buckets are mutually exclusive and exhaustive, so pick the bucket from the
# pair (is in automated, is in curated) instead of testing every combination in turn
bucket_for = {
    (True, False): mappings_used_in_automated,
    (False, True): mappings_used_in_curated,
    (True, True): mappings_used_in_both,
    (False, False): mappings_used_in_neither,
}
for mapping in mappings_in_evidence:
    in_automated = mapping in automated_mappings
    in_curated = mapping in curated_mappings
    bucket_for[(in_automated, in_curated)].add(mapping)
+ { + "data": { + "text/plain": [ + "11059" + ] + }, + "execution_count": 135, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(mappings_used_in_automated)" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "id": "37fb3cbc-1101-483c-aa98-d67f723f8b20", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "366" + ] + }, + "execution_count": 136, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(mappings_used_in_curated)" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "id": "57e769b0-fb11-4ec5-b02a-cfc9722a76e5", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "2264" + ] + }, + "execution_count": 140, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(mappings_used_in_neither)" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "id": "5b731a9c-20e6-4738-b159-e71cb28259c4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 141, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(mappings_used_in_both)" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "id": "fb3fe02f-608b-4633-9e7e-1456aa5e2268", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "13689" + ] + }, + "execution_count": 142, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Confirm these add up to the number of mappings used in the evidence\n", + "11059 + 366 + 2264" + ] + }, + { + "cell_type": "markdown", + "id": "03e897b2-b234-4ae5-b959-9b8af954c049", + "metadata": { + "tags": [] + }, + "source": [ + "* How many mappings in each of these categories (automated, curated, or previous) are obsolete?" 
def get_num_obsolete(mapping_set):
    """Count the mappings in ``mapping_set`` whose ontology term is not current.

    Args:
        mapping_set: Iterable of (trait_name, ontology_id) keys, each of which
            must be present in the module-level ``mappings_in_evidence`` dict.

    Returns:
        Number of mappings for which ``is_current_and_in_ontology`` returns a
        falsy value for the mapping's URI.

    Raises:
        KeyError: If a mapping is not a key of ``mappings_in_evidence``.
    """
    # mappings_in_evidence values are (uri, is_preferred) tuples, so pass only the
    # URI (element 0) to the ontology check rather than the whole tuple.
    # NOTE(review): is_current_and_in_ontology comes from cmat.trait_mapping.ols and
    # presumably queries OLS once per call - expect this to be slow for large sets.
    return sum(
        1
        for mapping in mapping_set
        if not is_current_and_in_ontology(mappings_in_evidence[mapping][0])
    )
# Visualise what we have so far
plt.figure(figsize=(10, 5))

# Counts copied from the cells above
n_latest, n_used = 15975, 13689
# origin of a used mapping -> (number used in evidence, number of those that are obsolete)
origin_counts = {
    'From automated': (11059, 13),
    'From curated': (366, 3),
    'From previous': (2264, 717),
}

flows = [
    ('Latest mappings', 'Used in evidence', n_used),
    ('Latest mappings', 'Not used', n_latest - n_used),
]
# Same flow order as before: split by origin first, then current, then obsolete
flows += [('Used in evidence', origin, used) for origin, (used, obsolete) in origin_counts.items()]
flows += [(origin, 'Current', used - obsolete) for origin, (used, obsolete) in origin_counts.items()]
flows += [(origin, 'Obsolete', obsolete) for origin, (used, obsolete) in origin_counts.items()]

sankey = Sankey(flows=flows)
sankey.draw()
plt.show()
These are mappings that:
1. Do not appear in the most recent automated and curated mappings
2. Are used in the most recent evidence strings
3. Are not obsolete
4. Are not for traits that appear in the automated/curated mappings
(It's also a high confidence mapping from Zooma, but that might be affected by the Zooma feedback we've imported since the curation happened.)\n", + "\n", + "So the only reason this wouldn't show up in automated mappings is if it appeared among all trait names (used for evidence generation) but NOT among preferred trait names (used for generating automated and curated mappings) - or (less likely), if it appeared in the version of ClinVar used for evidence generation but NOT the version of ClinVar used for generating mappings and curation.\n", + "\n", + "My main conclusion from this is that barring some unusual behaviour from Zooma, there is no need to keep previous mappings as long as we either attempt to map all trait names, or only annotate preferred trait names.\n", + "\n", + "Below we check both Zooma behaviour as well as the preferred vs. all trait name hypothesis." + ] + }, + { + "cell_type": "markdown", + "id": "447e8d5c-7ed6-475d-ad22-c4d70e5900c9", + "metadata": {}, + "source": [ + "### Preferred vs. all trait names\n", + "\n", + "[Top of page](#Investigations-for-cleaning-up-trait-mappings)\n", + "\n", + "* Are previous mappings being used in evidence because annotations are being done with all trait names rather than just preferred?" 
# Filter for previous mappings applied to preferred names only
# (element 1 of each mappings_in_evidence value is the "trait name is preferred" flag)
previous_preferred_mappings = {
    candidate
    for candidate in previous_mappings_for_distinct_traits
    if mappings_in_evidence[candidate][1]
}
"outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
In fact, there are still preferred trait names that get their mappings only from previous mappings, not from the automated or curated ones. So while aligning the sets of trait names we map vs. annotate would help with a lot of issues, it doesn't remove the need to keep our memory of previous mappings.
# Check past batches for terms that move from automated mappings to mappings for curation
# Each list holds one set of trait names per curation round; element i corresponds to dates[i]
automated_terms_through_time = []
curated_terms_through_time = []

for date in dates:
    automated_terms = set()
    with open(os.path.join(os.getenv('BATCH_ROOT'), f'manual_curation/{date}/automated_trait_mappings.tsv')) as f:
        for line in f:
            trait_name, ontology_uri, ontology_label = line.strip().split('\t')
            # Lowercase so these names compare equal to the curated names below, which are
            # also lowercased; otherwise case differences would hide terms that moved
            # between the automated and curated sets
            automated_terms.add(trait_name.lower())
    curated_terms = set()
    with open(os.path.join(os.getenv('BATCH_ROOT'), f'manual_curation/{date}/finished_curation_spreadsheet.csv')) as f:
        reader = csv.reader(f, dialect='excel')
        # skip header (the first two rows of the export)
        next(reader)
        next(reader)
        for row in reader:
            # Keep only rows marked DONE that have both column 0 and column 7 filled in
            # NOTE(review): column 0 looks like the ontology URI and column 7 the trait
            # name - confirm against the spreadsheet export
            if row[5] == 'DONE' and row[0] and row[7]:
                curated_terms.add(row[7].lower())
    automated_terms_through_time.append(automated_terms)
    curated_terms_through_time.append(curated_terms)
autosomal recessive spastic paraplegia', 'pik3ca related overgrowth syndrome', 'dilated cardiomyopathy 1bb', 'apolipoprotein a-i (munster3c)', 'sodium channelopathy-related small fiber neuropathy', 'familial isolated restrictive cardiomyopathy', 'apolipoprotein a-i (norway)', 'primary bone dysplasia with multiple joint dislocations', 'cerebellar malformation', 'dilated cardiomyopathy 1i', 'familial hypoalphalipoproteinemia', 'dilated cardiomyopathy 1v', 'apolipoprotein a-i (milano)', 'syndromic retinitis pigmentosa', 'anophthalmia-microphthalmia syndrome', 'resistance to thyroid hormone due to a mutation in thyroid hormone receptor beta', 'apolipoprotein a-i (marburg)', 'idiopathic and/or familial pulmonary arterial hypertension', 'duchenne and becker muscular dystrophy', 'semilobar holoprosencephaly', 'male infertility with azoospermia or oligozoospermia due to single gene mutation', 'dilated cardiomyopathy 2b', 'autosomal dominant hereditary axonal motor and sensory neuropathy', 'htra1-related cerebral small vessel disease', 'primary bone dysplasia with increased bone density', 'brain malformation', 'brachycephaly', 'severe brain malformation', 'dilated cardiomyopathy 1l', 'familial isolated dilated cardiomyopathy', 'dilated cardiomyopathy 1x', 'short stature, microcephaly, and endocrine dysfunction', 'pura-related severe neonatal hypotonia-seizures-encephalopathy syndrome', 'spermatogenic failure 14', 'sagittal craniosynostosis', 'dilated cardiomyopathy 1hh', 'dilated cardiomyopathy 1g', 'dilated cardiomyopathy 1ii', 'bleeding diathesis due to thromboxane synthesis deficiency', 'sickle cell disease and related diseases', 'dilated cardiomyopathy 2a', 'syndromic anorectal malformation', 'qualitative or quantitative defects of myotubularin', 'apolipoprotein a-i (giessen)', 'dilated cardiomyopathy 1dd', 'non-syndromic male infertility due to sperm motility disorder', 'dilated cardiomyopathy 1cc', 'spermatogenic failure 13', 'dilated cardiomyopathy 1jj', 'inherited 
prion disease', 'imperforated anus', 'dilated cardiomyopathy 1z', 'arx-related epileptic encephalopathy', 'joint hyperflexibility', 'col4a1 or col4a2-related cerebral small vessel disease', 'dilated cardiomyopathy 1t'}\n", + "\n", + "Following terms were automated in 2024-07-02 but curated in 2024-10-11:\n", + "{'difficulty walking', 'low-set, posteriorly rotated ears'}\n", + "\n", + "Following terms were automated in 2024-10-11 but curated in 2025-01-16:\n", + "{'global developmental delay-neuro-ophthalmological abnormalities-seizures-intellectual disability syndrome', 'intellectual disability, autosomal dominant 55, with seizures'}\n", + "\n", + "Following terms were automated in 2025-04-15 but curated in 2025-06-20:\n", + "{'11p partial monosomy syndrome'}\n", + "\n", + "Following terms were automated in 2025-06-20 but curated in 2025-10-02:\n", + "{'hydrocephalus due to aqueductal stenosis', 'langereis blood group', 'plasma triglyceride level quantitative trait locus', 'methylmalonic acidemia with homocystinuria cblc', 'hypogonadotropic hypogonadism 15 with anosmia', 'polycystic kidney disease, adult type', 'increased responsiveness to growth hormone', 'renier-gabreels-jasper syndrome', 'pulmonary arterial hypertension related to hereditary hemorrhagic telangiectasia', 'short-rib thoracic dysplasia 10 with polydactyly', 'skin/hair/eye pigmentation, variation in, 10', 'sensorineural deafness and migraine', 'lissencephaly, recessive', 'hurthle cell carcinoma of thyroid', 'pten hamartoma tumor syndromes', 'warfarin sensitivity, x-linked', 'marfan syndrome, severe classic', 'long qt syndrome 2/3, digenic', 'niemann-pick disease, type d', 'skin/hair/eye pigmentation, variation in, 2', 'migraine, sporadic hemiplegic, with progressive cerebellar ataxia', 'setbp1-related disorder', 'pseudoachondroplasia, severe', 'lipoprotein lipase (olbia)', 'microtia with or without hearing impairment', 'pituitary dependent hypercortisolism', 'inherited immunodeficiency diseases', 
'pelizaeus-merzbacher disease, mild', 'hypophosphatemic rickets, recessive', 'joint laxity', 'pex7-related disorder', 'papulosquamous eruptions', 'structural brain abnormalities', 'neuronal ceroid-lipofuscinosis, recessive', 'isolated coronal synostosis', 'neurodevelopmental disability', 'hutchinson-gilford progeria syndrome, atypical', 'late-onset citrullinemia', 'malignant rhabdoid tumor, somatic', 'psoriasis 2, pustular', 'voriconazole response', 'osteogenesis imperfecta type 2, thin-bone', 'smarca4-related bafopathy', 'nephrolithiasis/nephrocalcinosis', 'mandibuloacral dysplasia with type a lipodystrophy, atypical', 'progressive cone dystrophy (without rod involvement)', 'malformation of the heart and great vessels', 'nonsyndromic hearing loss, mixed', 'hypohidrotic ectodermal dysplasia, recessive', 'scid due to ada deficiency, delayed onset', 'piebaldism, progressive', 'long qt syndrome 2/9, digenic', 'properdin deficiency, type ii', 'macular dystrophy, retinal, 5', 'non-syndromic oligodontia', 'myocardial infarction, susceptibility to, 1', 'smarca2-related bafopathy', 'thrombocytopenia, x-linked, intermittent', 'rh-null, amorph type', 'retinitis pigmentosa, juvenile', 'usp7-related neurodevelopmental disorder', 'osteogenesis imperfecta type 1, mild', 'hypotrichosis with juvenile macular dystrophy', 'limb-girdle muscular dystrophy, dominant', 'von hippel-lindau syndrome, modifier of', 'mitochondrial cytopathy', 'intellectual disability, recessive', 'transient neonatal diabetes, recessive', 'primary microcephaly, recessive', 'rett syndrome, congenital variant', 'infantile epilepsy', 'ryr1-related disorder', 'phenytoin response', 'pacs1-related syndrome', 'multiple epiphyseal dysplasia, dominant', 'iodotyrosine deiodination defect', 'inherited erythromelalgia', 'muscle amp deaminase deficiency', 'pelizaeus-merzbacher disease, atypical', 'mannosidosis, alpha-, types i and ii', 'hypogonadotropic hypogonadism 26 with anosmia', 'limb-girdle muscular dystrophy, 
recessive', 'wfs1-related spectrum disorders', 'parkes weber syndrome', 'pazopanib response', 'spherocytosis, recessive', 'men2 phenotype: unclassified', 'transient neonatal diabetes, dominant', 'vater/vacterl association with cns malformations', 'van buchem disease type 2', 'skin/hair/eye pigmentation 2, blond hair/fair skin', 'marfan syndrome, mild', 'hypogonadotropic hypogonadism 20 without anosmia', 'isolated thoracic aortic aneurysm', 'thyroglobulin synthesis defect', 'mucopolysaccharidosis, type vi, intermediate', 'hypogonadotropic hypogonadism 2 with anosmia', 'long qt syndrome 3/6, digenic', 'mycotic aneurysm, intracranial', 'syndrome with a dandy-walker malformation as major feature', 'neurodevelopmental disorders', 'seizures, benign familial infantile, 6', 'hypogonadotropic hypogonadism 18 with anosmia', 'irinotecan response', 'ventricular tachycardia, somatic', 'spinocerebellar ataxia, x-linked', 'parathyroid adenoma, somatic', 'intellectual disability, dominant', 'leydig hypoplasia, type i', 'persistent mullerian duct syndrome, type ii', 'hypodysfibrinogenemia', 'peroxisomal biogenesis disorder 3b', 'idiopathic and/or familial pulmonary arterial hypertension', 'language retardation', 'stargardt disease, recessive', 'serum calcium level', 'polg2-related spectrum disorders', 'methylmalonic aciduria due to complete methylmalonyl-coa mutase deficiency', 'tyrosine kinase inhibitor response', 'lynch-like syndrome', 'methemoglobinemia, type i', 'kat6b-related disorder', 'ubtf-related disorder', 'migalastat response', 'methylmalonic aciduria, mut(-) type', 'intractable seizure', 'myotonia congenita, atypical, acetazolamide-responsive', 'pancreatic cancer, susceptibility to', 'osteogenesis imperfecta, type iii/iv', 'lipoatrophy with diabetes, hepatic steatosis, hypertrophic cardiomyopathy, and leukomelanodermic papules', 'hypertension, early-onset, autosomal dominant, with exacerbation in pregnancy', 'isolated gnrh deficiency', 'joubert syndrome 12/15, digenic', 
'stevens-johnson syndrome, susceptibility to', 'mucopolysaccharidosis, type ii, mild form', 'malaria, severe, susceptibility to', 'poor coordination', 'thalassemia intermedia', 'methylmalonic aciduria and homocystinuria, cblc type, digenic', 'prkag2 syndrome', 'properdin deficiency, type iii', 'russell-silver syndrome', 'persistent mullerian duct syndrome, type i', 'isovaleric acidemia, type iii', 'tobacco use disorder', 'ugdh-related disorder', 'invasive medullary breast carcinoma', 'pelizaeus-merzbacher disease, connatal', 'hypogonadotropic hypogonadism 10 without anosmia', 'oculoectodermal syndrome, somatic', 'rlbp1-related disorder', 'magel2-related disorder', 'intermediate muscular dystrophy', 'severe brain malformation', 'myokymia 1', 'parkinson disease, dominant/recessive', 'leukemia, acute myeloid, reduced survival in, somatic', 'monogenic non-syndromic obesity', 'nonsyndromic otitis media', 'single ventricle defect', 'hypogonadotropic hypogonadism 8 without anosmia', 'mucopolysaccharidosis, type vi, severe', 'small vessel cerebrovascular disease', 'primary degenerative dementia of the alzheimer type, presenile onset', 'severe combined immunodeficiency, b cell-negative', 'sertraline response', 'retinitis punctata albescens, autosomal dominant', 'ovarian dysgenesis', 'ventral septal defect', 'leukemia, acute myeloid, m0 subtype', 'spastic paraplegia, autosomal dominant', 'intellectual disability, cask-related, x-linked', 'isolated hereditary giant platelet disorder', 'nemaline myopathy, recessive', 'lesch-nyhan syndrome, neurologic variant', 'left ventricular noncompaction 3', 'hypereosinophilic syndrome, idiopathic, resistant to imatinib', 'multiple congenital anomalies', 'short-rib thoracic dysplasia 10 without polydactyly', 'robinow syndrome, autosomal recessive, with brachy-syn-polydactyly', 'spina bifida, susceptibility to', 'myh9-related disorder', 'porphyria, acute intermittent, nonerythroid variant', 'isolated nonsyndromic congenital heart disease', 
'hypogonadotropic hypogonadism 5 without anosmia', 'moyamoya angiopathy', 'waardenburg syndrome type 2e, without neurologic involvement', 'normokalemic periodic paralysis, potassium-sensitive', 'mitochondrial complex i deficiency, mitochondrial type 2', 'optic nerve hypoplasia and abnormalities of the central nervous system', 'pfeiffer syndrome variant', 'mycobacterium tuberculosis, susceptibility to infection by', 'satb2-associated syndrome', 'impaired social interactions', 'prkag2 cardiac syndrome', 'marshall/stickler syndrome', 'isovaleric acidemia, type ii', 'metachromatic leukodystrophy, severe', 'long qt syndrome 1/2, digenic', 'sleep-wake schedule disorder, delayed phase type', 'short qt syndrome 4', 'renal agenesis and hypodysplasia', 'zonular pulverulent cataract', 'vexas', 'hypogonadotropic hypogonadism 3 without anosmia', 'wars2-related disorder', 'type ii collagenopathies', 'ovarian cancers', 'retinitis pigmentosa, dominant', 'wdr26-related disorder', 'hypogonadotropic hypogonadism 22 with anosmia', 'nonsyndromic hearing loss, dominant', 'skin/hair/eye pigmentation, variation in, 4', 'severe intellectual deficiency', 'methylmalonic aciduria, mut(0) type', 'retinitis pigmentosa, recessive', 'tmem67-related disorder', 'refsum disease, adult, 1', 'iminoglycinuria, digenic', 'juvenile polyposis', 'isovaleric acidemia, type i', 'slc26a2-related disorder', 'myo7a-related disorder', 'joubert syndrome 12', 'wiskott-aldrich syndrome, attenuated', 'hypogonadotropic hypogonadism 14 with anosmia', 'sphingolipid activator protein 1 deficiency', 'myoclonic encephalopathy', 'short qt syndrome 5', 'xeroderma pigmentosum group g/cockayne syndrome', 'warfarin response', 'microphthalmia, cataracts, and iris abnormalities', 'subcortical laminar heterotopia, x-linked', 'nonsyndromic cleft lip palate', 'mycobacterium tuberculosis, susceptibility to', 'spherocytosis, dominant', 'rheumatoid arthritis, progression of', 'severe cystic degeneration of the brain', 'progressive 
neurodegenerative disease', 'mild obesity', 'marfan syndrome, atypical', 'invasive pneumococcal disease, recurrent isolated', 'tlk2-related neurodevelopmental disorder', 'hypohidrotic ectodermal dysplasia, dominant', 'macular corneal dystrophy, type ii', 'multiple joint dislocations, short stature, craniofacial dysmorphism, with or without congenital heart defects', 'osteopoikilosis with or without melorheostosis', 'inclusion body myopathy, recessive', 'marfan syndrome, mild variable', 'normal pregnancy', 'megacystis-microcolon hypoperistalsis syndrome 1', 'mucopolysaccharidosis, type vi, mild', 'low-set, posteriorly rotated ears', 'mastocytosis, systemic, somatic', 'intellectual disability with language impairment and autistic features', 'smith-magenis syndrome-like', 'sparse and thin eyebrow', 'metachromatic leukodystrophy, mild', 'plasma factor xi deficiency', 'usher syndrome, type id/f, digenic', 'niemann-pick disease, type c1, juvenile form', 'optic atrophy, dominant', 'leukemia, acute lymphoblastic, susceptibility to', 'infantile hypercalcemia', 'male infertility due to obstructive azoospermia', 'menkes disease, mild', 'polg-related spectrum disorders', 'nonsyndromic oculocutaneous albinism', 'sandhoff disease, chronic', 'joubert syndrome 9/15, digenic', 'nephronophthisis-like nephropathy', 'rhabdomyosarcoma, somatic', 'warfarin response - dosage', 'ocular albinism with congenital sensorineural hearing loss', 'prph2-related disorder', 'rrm2b-related mitochondrial disease', 'retinal dystrophy, early-onset severe, lrat-related', 'hyperinsulinism, dominant', 'joint hyperflexibility', 'pyruvate carboxylase deficiency', 'infantile axial hypotonia', 'skin/hair/eye pigmentation, variation in, 8', 'intellectual deficiency', 'nonsyndromic hearing loss and deafness, autosomal recessive', 'macular degeneration, age-related, neovascular type', 'severe microlissencephaly', 'spastic paraplegia, recessive', 'zap70-related severe combined immunodeficiency', 'nr2e3-related 
disorder', 'sepn1-related disorder', 'nephronophthisis 8', 'mucolipidosis iii alpha/beta, atypical', 'tramadol response', 'pallister-killian syndrome', 'thyroid dyshormonogenesis', 'mitochondrial complex i deficiency, mitochondrial type 3', 'ivacaftor response - efficacy', 'leigh syndrome due to mitochondrial complex i deficiency', 'poor motor coordination', 'muscular dystrophy, congenital, merosin deficient or partially deficient', 'prednisolone response'}\n", + "\n" + ] + } + ], + "source": [ + "for i in range(1, len(dates)):\n", + " moved_terms = automated_terms_through_time[i-1].intersection(curated_terms_through_time[i])\n", + " if moved_terms:\n", + " print(f'Following terms were automated in {dates[i-1]} but curated in {dates[i]}:')\n", + " print(moved_terms)\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "id": "a7b2f393-15b0-4939-a572-369c60912d8b", + "metadata": { + "tags": [] + }, + "source": [ + "Manually checked some of these by searching for them in the relevant curation spreadsheet and seeing how they were annotated there. Some findings:\n", + "\n", + "* Besides June and October 2025, most of the above traits that I've checked manually were moved from automated to needing curation due to mappings being deprecated.\n", + " * This includes the large batch in January 2024, which I believe is when EFO did some large-scale deprecations (I haven't checked all these terms though).\n", + " * A notable exception is `qualitative or quantitative defects of beta-myosin heavy chain (myh7)` => `MONDO_0016195`, which was automated in 2023-06-06 but [curated in 2023-07-24](https://docs.google.com/spreadsheets/d/1IYfPY_K0aPsVwH9usVb2U2eUft1_yls5RWXxy-lq1SU/edit?gid=88027652#gid=88027652&range=6100:6100), at which point it was given the same mapping as previous. 
This is when the spreadsheet still reported Zooma confidence, so we can see explicitly that the Zooma confidence for the mapping was `GOOD` rather than `HIGH`.\n", + " * The July 2023 curation spreadsheet provides a good example of what I remember seeing: lots of previous mappings which are still going through curation with Zooma confidence `GOOD`. I don't understand why this would be the case, maybe a Zooma issue?\n", + " * One example is `nephronophthisis` which disappears from automated mappings throughout 2023 despite always being present in our final latest_mappings file:\n", + "\n", + "```\n", + "$ grep -Pie '^nephronophthisis\\t' */automated_trait_mappings.tsv \n", + "2022-02-03_test/automated_trait_mappings.tsv:nephronophthisis http://www.orpha.net/ORDO/Orphanet_655 Nephronophthisis\n", + "2022-03-03/automated_trait_mappings.tsv:nephronophthisis http://www.orpha.net/ORDO/Orphanet_655 Nephronophthisis\n", + "2022-05-11/automated_trait_mappings.tsv:nephronophthisis http://www.orpha.net/ORDO/Orphanet_655 Nephronophthisis\n", + "2022-08-18/automated_trait_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2022-10-17_test/automated_trait_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2024-01-19/automated_trait_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2024-04-11/automated_trait_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2024-07-02/automated_trait_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2024-09-05_test/automated_trait_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2024-10-11/automated_trait_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2025-01-16/automated_trait_mappings.tsv:nephronophthisis 
http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2025-02-25_test/automated_trait_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2025-04-15/automated_trait_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2025-06-20/automated_trait_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2025-06-20/automated_trait_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/HP_0000090 Nephronophthisis\n", + "2025-10-02/automated_trait_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/HP_0000090 Nephronophthisis\n", + "2025-10-02/automated_trait_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "\n", + "$ grep -Pie '^nephronophthisis\\t' */trait_names_to_ontology_mappings.tsv\n", + "2022-02-03_test/trait_names_to_ontology_mappings.tsv:nephronophthisis http://www.orpha.net/ORDO/Orphanet_655 Nephronophthisis\n", + "2022-03-03/trait_names_to_ontology_mappings.tsv:nephronophthisis http://www.orpha.net/ORDO/Orphanet_655 Nephronophthisis\n", + "2022-05-11/trait_names_to_ontology_mappings.tsv:nephronophthisis http://www.orpha.net/ORDO/Orphanet_655 Nephronophthisis\n", + "2022-08-18/trait_names_to_ontology_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2022-10-17_test/trait_names_to_ontology_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2023-01-09/trait_names_to_ontology_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2023-03-01/trait_names_to_ontology_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2023-05-26_test/trait_names_to_ontology_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + 
"2023-06-06/trait_names_to_ontology_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2023-07-24/trait_names_to_ontology_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2023-10-03/trait_names_to_ontology_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2024-01-19/trait_names_to_ontology_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2024-04-11/trait_names_to_ontology_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2024-07-02/trait_names_to_ontology_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2024-10-11/trait_names_to_ontology_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2025-01-16/trait_names_to_ontology_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2025-04-15/trait_names_to_ontology_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2025-06-20/trait_names_to_ontology_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/HP_0000090 Nephronophthisis\n", + "2025-06-20/trait_names_to_ontology_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "2025-10-02/trait_names_to_ontology_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/HP_0000090 Nephronophthisis\n", + "2025-10-02/trait_names_to_ontology_mappings.tsv:nephronophthisis http://purl.obolibrary.org/obo/MONDO_0019005 nephronophthisis\n", + "```\n", + "\n", + "* For June and October 2025, most mappings I've checked were NOT deprecated, but returned to needing curation for an unknown reason. 
Because the spreadsheet no longer reports Zooma confidence, we can't confirm what it was at the time of spreadsheet generation: we've since provided Zooma feedback for the October batch, and that feedback may already be incorporated.\n", + " * For example, `severe cystic degeneration of the brain` => `HP_0007313`, which was an automated mapping through 2025-06-20 but had to go through [curation in 2025-10-02](https://docs.google.com/spreadsheets/d/1dYIHJ4QfC8cjCwJFzXnYzLglmg2V7h4wIicRqIVqxdg/edit?gid=88027652#gid=88027652&range=6279:6279), at which point it was given the same mapping as before.\n", + " * The major revamp of the manual curation pipeline happened around June 2025 ([see PR](https://github.com/EBIvariation/CMAT/pull/465)), so the influx of \"previously automated, now curated\" terms may indicate an issue in the new pipeline. We should definitely look into this.\n", + " \n", + "Basically, my conclusion here is that, given a potential problem in the new pipeline and the uncertainty around how Zooma behaves and how reliable it is, we should for now maintain our memory of previous mappings, including automated ones, and not rely solely on Zooma for this." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38f44bcd-7778-4b5a-a432-22e88ee1ceee", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "my-pyenv", + "language": "python", + "name": "my-pyenv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/trait_mapping/test_main.py b/tests/trait_mapping/test_main.py index 1821595e..5afa8578 100644 --- a/tests/trait_mapping/test_main.py +++ b/tests/trait_mapping/test_main.py @@ -91,7 +91,7 @@ def test_ols_exact_match(self): # Only goes through OLS as it finds an exact match in EFO trait = Trait('chédiak-higashi syndrome', None, None) processed_trait = self.run_process_trait(trait) - assert len(processed_trait.ols_result_list) == 3 + assert len(processed_trait.ols_result_list) == 8 assert processed_trait.is_finished def test_zooma_high_confidence(self):