diff --git a/immunology_kg/notebooks/snorkel_re_example/Snorkel RE example.ipynb b/immunology_kg/notebooks/snorkel_re_example/Snorkel RE example.ipynb new file mode 100644 index 0000000..86b90af --- /dev/null +++ b/immunology_kg/notebooks/snorkel_re_example/Snorkel RE example.ipynb @@ -0,0 +1,1952 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "from snorkel.preprocess.nlp import SpacyPreprocessor\n", + "from snorkel.labeling import PandasLFApplier,filter_unlabeled_dataframe,LFAnalysis ,labeling_function\n", + "from snorkel.labeling.model import MajorityClassVoter,MajorityLabelVoter,RandomVoter ,LabelModel\n", + "\n", + "\n", + "from snorkel.analysis import metric_score , get_label_buckets\n", + "\n", + "from snorkel.utils import probs_to_preds\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "import pandas as pd\n", + "import re\n", + "import os\n", + "from collections import OrderedDict\n", + "\n", + "#importing self-defined helped modules\n", + "from snorkel_preprocessing_example import make_source_target_preprocessor,make_text_between_preprocessor" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Load the data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | Unnamed: 0 | \n", + "text | \n", + "source | \n", + "relation | \n", + "target | \n", + "link | \n", + "pmc_id | \n", + "doi_id | \n", + "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "0 | \n", + "While blocking TPC2 activity by tetrandrine, a... | \n", + "{'(+)-Tetrandrine': {'namespace': 'chebi', 'na... | \n", + "negativeCorrelation | \n", + "{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'... | \n", + "{'annotations': {}, 'citation': {'authors': ['... | \n", + "32221306.0 | \n", + "NaN | \n", + "
| 1 | \n", + "1 | \n", + "Chemoinformatics searches yielded 15 approved ... | \n", + "{'(S)-verapamil': {'namespace': 'chebi', 'name... | \n", + "negativeCorrelation | \n", + "{'hypertension': {'namespace': 'doid', 'name':... | \n", + "{'annotations': {}, 'citation': {'db': 'DOI', ... | \n", + "NaN | \n", + "https://doi.org/10.1101/2020.03.22.002386 | \n", + "
| 2 | \n", + "2 | \n", + "Thyroid stimulating hormone and free triiodoth... | \n", + "{\"3,3',5'-triiodothyronine\": {'namespace': 'ch... | \n", + "negativeCorrelation | \n", + "{'COVID-19': {'namespace': 'doid', 'name': 'CO... | \n", + "{'annotations': {'mesh': {'D044967': True}}, '... | \n", + "32217556.0 | \n", + "NaN | \n", + "
| 3 | \n", + "3 | \n", + "Based on these results, we performed virtual d... | \n", + "{\"4'-epidoxorubicin\": {'namespace': 'chebi', '... | \n", + "decreases | \n", + "{'3.4.22.69': {'namespace': 'eccode', 'name': ... | \n", + "{'annotations': {}, 'citation': {'authors': ['... | \n", + "32173287.0 | \n", + "NaN | \n", + "
| 4 | \n", + "4 | \n", + "Doctors can also use a clinically approved bil... | \n", + "{'4-methylumbelliferone': {'namespace': 'chebi... | \n", + "decreases | \n", + "{'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',... | \n", + "{'annotations': {'mesh': {'D008168': True}}, '... | \n", + "32205856.0 | \n", + "NaN | \n", + "
| 5 | \n", + "5 | \n", + "Since Vitamin B3 is highly lung protective, it... | \n", + "{'4-methylumbelliferone': {'namespace': 'chebi... | \n", + "decreases | \n", + "{'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',... | \n", + "{'annotations': {}, 'citation': {'authors': ['... | \n", + "32205856.0 | \n", + "NaN | \n", + "
| 6 | \n", + "6 | \n", + "Doctors can also use a clinically approved bil... | \n", + "{'4-methylumbelliferone': {'namespace': 'chebi... | \n", + "decreases | \n", + "{'inflammatory response': {'namespace': 'go', ... | \n", + "{'annotations': {'mesh': {'D008168': True}}, '... | \n", + "32205856.0 | \n", + "NaN | \n", + "
| \n", + " | text | \n", + "source | \n", + "relation | \n", + "target | \n", + "link | \n", + "pmc_id | \n", + "doi_id | \n", + "
|---|---|---|---|---|---|---|---|
| 0 | \n", + "While blocking TPC2 activity by tetrandrine, a... | \n", + "{'(+)-Tetrandrine': {'namespace': 'chebi', 'na... | \n", + "True | \n", + "{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'... | \n", + "{'annotations': {}, 'citation': {'authors': ['... | \n", + "32221306.0 | \n", + "NaN | \n", + "
| 1 | \n", + "Chemoinformatics searches yielded 15 approved ... | \n", + "{'(S)-verapamil': {'namespace': 'chebi', 'name... | \n", + "True | \n", + "{'hypertension': {'namespace': 'doid', 'name':... | \n", + "{'annotations': {}, 'citation': {'db': 'DOI', ... | \n", + "NaN | \n", + "https://doi.org/10.1101/2020.03.22.002386 | \n", + "
| 2 | \n", + "Thyroid stimulating hormone and free triiodoth... | \n", + "{\"3,3',5'-triiodothyronine\": {'namespace': 'ch... | \n", + "True | \n", + "{'COVID-19': {'namespace': 'doid', 'name': 'CO... | \n", + "{'annotations': {'mesh': {'D044967': True}}, '... | \n", + "32217556.0 | \n", + "NaN | \n", + "
| 3 | \n", + "The administration of methylprednisolone appea... | \n", + "{'6-methylprednisolone': {'namespace': 'chebi'... | \n", + "True | \n", + "{'Death': {'namespace': 'mesh', 'name': 'Death... | \n", + "{'annotations': {'doid': {'11394': True}}, 'ci... | \n", + "32167524.0 | \n", + "NaN | \n", + "
| 4 | \n", + "Adverse reactions of IFN-α mainly include low-... | \n", + "{'Interferon alfa-2a': {'namespace': 'chebi', ... | \n", + "False | \n", + "{'Low-grade fever': {'namespace': 'hp', 'name'... | \n", + "{'annotations': {}, 'citation': {'authors': ['... | \n", + "32166483.0 | \n", + "NaN | \n", + "
| \n", + " | text | \n", + "source | \n", + "relation | \n", + "target | \n", + "link | \n", + "pmc_id | \n", + "doi_id | \n", + "
|---|---|---|---|---|---|---|---|
| 0 | \n", + "While blocking TPC2 activity by tetrandrine, a... | \n", + "{'(+)-Tetrandrine': {'namespace': 'chebi', 'na... | \n", + "True | \n", + "{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'... | \n", + "{'annotations': {}, 'citation': {'authors': ['... | \n", + "32221306.0 | \n", + "NaN | \n", + "
| 1 | \n", + "Chemoinformatics searches yielded 15 approved ... | \n", + "{'(S)-verapamil': {'namespace': 'chebi', 'name... | \n", + "True | \n", + "{'hypertension': {'namespace': 'doid', 'name':... | \n", + "{'annotations': {}, 'citation': {'db': 'DOI', ... | \n", + "NaN | \n", + "https://doi.org/10.1101/2020.03.22.002386 | \n", + "
| 2 | \n", + "Thyroid stimulating hormone and free triiodoth... | \n", + "{\"3,3',5'-triiodothyronine\": {'namespace': 'ch... | \n", + "True | \n", + "{'COVID-19': {'namespace': 'doid', 'name': 'CO... | \n", + "{'annotations': {'mesh': {'D044967': True}}, '... | \n", + "32217556.0 | \n", + "NaN | \n", + "
| 3 | \n", + "The administration of methylprednisolone appea... | \n", + "{'6-methylprednisolone': {'namespace': 'chebi'... | \n", + "True | \n", + "{'Death': {'namespace': 'mesh', 'name': 'Death... | \n", + "{'annotations': {'doid': {'11394': True}}, 'ci... | \n", + "32167524.0 | \n", + "NaN | \n", + "
| 4 | \n", + "In our opinion, during the COVID-19 pandemic, ... | \n", + "{'adrenergic antagonist': {'namespace': 'chebi... | \n", + "True | \n", + "{'COVID-19': {'namespace': 'doid', 'name': 'CO... | \n", + "{'annotations': {}, 'citation': {'authors': ['... | \n", + "32220710.0 | \n", + "NaN | \n", + "
| 5 | \n", + "Consistent with previous reports, 20mM NH4Cl a... | \n", + "{'ammonium chloride': {'namespace': 'chebi', '... | \n", + "True | \n", + "{'G protein, vesicular stomatitis virus': {'na... | \n", + "{'annotations': {}, 'citation': {'authors': ['... | \n", + "32221306.0 | \n", + "NaN | \n", + "
| 6 | \n", + "If the latter percentage would be found to be ... | \n", + "{'angiotensin receptor antagonist': {'namespac... | \n", + "True | \n", + "{'COVID-19': {'namespace': 'doid', 'name': 'CO... | \n", + "{'annotations': {}, 'citation': {'authors': ['... | \n", + "32129518.0 | \n", + "NaN | \n", + "
| 7 | \n", + "Consistent with previous reports, 20mM NH4Cl a... | \n", + "{'bafilomycin A1': {'namespace': 'chebi', 'nam... | \n", + "True | \n", + "{'G protein, vesicular stomatitis virus': {'na... | \n", + "{'annotations': {}, 'citation': {'authors': ['... | \n", + "32221306.0 | \n", + "NaN | \n", + "
| \n", + " | text | \n", + "source | \n", + "relation | \n", + "target | \n", + "link | \n", + "pmc_id | \n", + "doi_id | \n", + "
|---|---|---|---|---|---|---|---|
| 0 | \n", + "Adverse reactions of IFN-α mainly include low-... | \n", + "{'Interferon alfa-2a': {'namespace': 'chebi', ... | \n", + "False | \n", + "{'Low-grade fever': {'namespace': 'hp', 'name'... | \n", + "{'annotations': {}, 'citation': {'authors': ['... | \n", + "32166483.0 | \n", + "NaN | \n", + "
| 1 | \n", + "Adverse reactions of IFN-α mainly include low-... | \n", + "{'Interferon alfa-2a': {'namespace': 'chebi', ... | \n", + "False | \n", + "{'influenza': {'namespace': 'doid', 'name': 'i... | \n", + "{'annotations': {}, 'citation': {'authors': ['... | \n", + "32166483.0 | \n", + "NaN | \n", + "
| 2 | \n", + "This may be accounted for by two complementary... | \n", + "{'angiotensin II': {'namespace': 'chebi', 'nam... | \n", + "False | \n", + "{'COVID-19': {'namespace': 'doid', 'name': 'CO... | \n", + "{'annotations': {}, 'citation': {'authors': ['... | \n", + "32129518.0 | \n", + "NaN | \n", + "
| 3 | \n", + "ACE2 can also antagonize cardiac fibrosis and ... | \n", + "{'angiotensin II': {'namespace': 'chebi', 'nam... | \n", + "False | \n", + "{'Ventricular Remodeling': {'namespace': 'mesh... | \n", + "{'annotations': {}, 'citation': {'authors': ['... | \n", + "32221983.0 | \n", + "NaN | \n", + "
| 4 | \n", + "ACE2 can also antagonize cardiac fibrosis and ... | \n", + "{'angiotensin II': {'namespace': 'chebi', 'nam... | \n", + "False | \n", + "{'Myocardial fibrosis': {'namespace': 'hp', 'n... | \n", + "{'annotations': {}, 'citation': {'authors': ['... | \n", + "32221983.0 | \n", + "NaN | \n", + "
| 5 | \n", + "The existence of significantly increased fibri... | \n", + "{'Fibrin': {'namespace': 'chebi', 'name': 'Fib... | \n", + "False | \n", + "{'Hyperfibrinolysis': {'namespace': 'hp', 'nam... | \n", + "{'annotations': {}, 'citation': {'authors': ['... | \n", + "32216698.0 | \n", + "NaN | \n", + "
| 6 | \n", + "This opinion is supported by the presence of h... | \n", + "{'Fibrin': {'namespace': 'chebi', 'name': 'Fib... | \n", + "False | \n", + "{'Hemorrhage': {'namespace': 'mesh', 'name': '... | \n", + "{'annotations': {}, 'citation': {'authors': ['... | \n", + "32216698.0 | \n", + "NaN | \n", + "
| 7 | \n", + "In the influenza virus model, it was reported ... | \n", + "{'chloroquine': {'namespace': 'chebi', 'name':... | \n", + "False | \n", + "{'dendritic cell antigen processing and presen... | \n", + "{'annotations': {'mesh': {'D007251': True}}, '... | \n", + "32171740.0 | \n", + "NaN | \n", + "
| \n", + " | j | \n", + "Polarity | \n", + "Coverage | \n", + "Overlaps | \n", + "Conflicts | \n", + "
|---|---|---|---|---|---|
| contains_reduction_tokens | \n", + "0 | \n", + "[1] | \n", + "0.057027 | \n", + "0.019308 | \n", + "0.016165 | \n", + "
| contains_reduction_tokens_text_between | \n", + "1 | \n", + "[1] | \n", + "0.004041 | \n", + "0.004041 | \n", + "0.000898 | \n", + "
| contains_negative_corrrelation_regex | \n", + "2 | \n", + "[1] | \n", + "0.008532 | \n", + "0.000000 | \n", + "0.000000 | \n", + "
| contains_increase_decrease_pattern | \n", + "3 | \n", + "[] | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "
| contains_increase_tokens | \n", + "4 | \n", + "[0] | \n", + "0.130220 | \n", + "0.016165 | \n", + "0.016165 | \n", + "
| contains_increase_tokens_text_between | \n", + "5 | \n", + "[] | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "
| contains_positive_corrrelation_regex | \n", + "6 | \n", + "[1] | \n", + "0.002245 | \n", + "0.000000 | \n", + "0.000000 | \n", + "
| contains_increase_increase_pattern | \n", + "7 | \n", + "[] | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "
| \n", + " | j | \n", + "Polarity | \n", + "Coverage | \n", + "Overlaps | \n", + "Conflicts | \n", + "Correct | \n", + "Incorrect | \n", + "Emp. Acc. | \n", + "
|---|---|---|---|---|---|---|---|---|
| contains_reduction_tokens | \n", + "0 | \n", + "[1] | \n", + "0.057027 | \n", + "0.019308 | \n", + "0.016165 | \n", + "69 | \n", + "58 | \n", + "0.543307 | \n", + "
| contains_reduction_tokens_text_between | \n", + "1 | \n", + "[1] | \n", + "0.004041 | \n", + "0.004041 | \n", + "0.000898 | \n", + "8 | \n", + "1 | \n", + "0.888889 | \n", + "
| contains_negative_corrrelation_regex | \n", + "2 | \n", + "[1] | \n", + "0.008532 | \n", + "0.000000 | \n", + "0.000000 | \n", + "18 | \n", + "1 | \n", + "0.947368 | \n", + "
| contains_increase_decrease_pattern | \n", + "3 | \n", + "[] | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0 | \n", + "0 | \n", + "0.000000 | \n", + "
| contains_increase_tokens | \n", + "4 | \n", + "[0] | \n", + "0.130220 | \n", + "0.016165 | \n", + "0.016165 | \n", + "263 | \n", + "27 | \n", + "0.906897 | \n", + "
| contains_increase_tokens_text_between | \n", + "5 | \n", + "[] | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0 | \n", + "0 | \n", + "0.000000 | \n", + "
| contains_positive_corrrelation_regex | \n", + "6 | \n", + "[1] | \n", + "0.002245 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0 | \n", + "5 | \n", + "0.000000 | \n", + "
| contains_increase_increase_pattern | \n", + "7 | \n", + "[] | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0 | \n", + "0 | \n", + "0.000000 | \n", + "
| \n", + " | text | \n", + "
|---|---|
| 0 | \n", + "While blocking TPC2 activity by tetrandrine, a... | \n", + "
| 2 | \n", + "Thyroid stimulating hormone and free triiodoth... | \n", + "
| 3 | \n", + "The administration of methylprednisolone appea... | \n", + "
| 7 | \n", + "Consistent with previous reports, 20mM NH4Cl a... | \n", + "
| 12 | \n", + "Consistent with previous reports, 20mM NH4Cl a... | \n", + "
| ... | \n", + "... | \n", + "
| 1644 | \n", + "Actual bicarbonate and total carbon dioxide co... | \n", + "
| 1655 | \n", + "Albumin concentrations were significantly lowe... | \n", + "
| 1657 | \n", + "Moreover, the frequencies of regulatory T cell... | \n", + "
| 1658 | \n", + "The reduced expressions of interferon-γ (IFN-γ... | \n", + "
| 1668 | \n", + "Spleen atrophy was observed in all reported ca... | \n", + "
127 rows × 1 columns
\n", + "