diff --git a/doc/tutorial-draft.bib b/doc/tutorial-draft.bib new file mode 100644 index 0000000..42cee26 --- /dev/null +++ b/doc/tutorial-draft.bib @@ -0,0 +1,248 @@ + +@incollection{vandin_discovery_2011, + title = {Discovery of mutated subnetworks associated with clinical data in cancer}, + isbn = {978-981-4596-37-4}, + url = {http://www.worldscientific.com/doi/abs/10.1142/9789814366496_0006}, + urldate = {2017-02-07}, + booktitle = {Biocomputing 2012}, + publisher = {World Scientific}, + author = {Vandin, Fabio and Clay, Patrick and Upfal, Eli and Raphael, Benjamin J.}, + month = nov, + year = {2011}, + doi = {10.1142/9789814366496_0006}, + pages = {55--66}, + file = {Snapshot:/home/joel/Zotero/storage/UBRUPBPJ/9789814366496_0006.html:text/html;vandin_et_al_2011_discovery_of_mutated_subnetworks_associated_with_clinical_data_in_cancer.pdf:/home/joel/edu/articles/vandin_et_al_2011_discovery_of_mutated_subnetworks_associated_with_clinical_data_in_cancer.pdf:application/pdf} +} + +@article{leiserson_pan-cancer_2015, + title = {Pan-cancer network analysis identifies combinations of rare somatic mutations across pathways and protein complexes}, + volume = {47}, + copyright = {© 2014 Nature Publishing Group, a division of Macmillan Publishers Limited. All Rights Reserved.}, + issn = {1061-4036}, + url = {http://www.nature.com/ng/journal/v47/n2/full/ng.3168.html}, + doi = {10.1038/ng.3168}, + abstract = {Cancers exhibit extensive mutational heterogeneity, and the resulting long-tail phenomenon complicates the discovery of genes and pathways that are significantly mutated in cancer.
We perform a pan-cancer analysis of mutated networks in 3,281 samples from 12 cancer types from The Cancer Genome Atlas (TCGA) using HotNet2, a new algorithm to find mutated subnetworks that overcomes the limitations of existing single-gene, pathway and network approaches. We identify 16 significantly mutated subnetworks that comprise well-known cancer signaling pathways as well as subnetworks with less characterized roles in cancer, including cohesin, condensin and others. Many of these subnetworks exhibit co-occurring mutations across samples. These subnetworks contain dozens of genes with rare somatic mutations across multiple cancers; many of these genes have additional evidence supporting a role in cancer. By illuminating these rare combinations of mutations, pan-cancer network analyses provide a roadmap to investigate new diagnostic and therapeutic opportunities across cancer types.}, + language = {en}, + number = {2}, + urldate = {2017-02-08}, + journal = {Nature Genetics}, + author = {Leiserson, Mark D. M. and Vandin, Fabio and Wu, Hsin-Ta and Dobson, Jason R. and Eldridge, Jonathan V. and Thomas, Jacob L. and Papoutsaki, Alexandra and Kim, Younhun and Niu, Beifang and McLellan, Michael and Lawrence, Michael S. and Gonzalez-Perez, Abel and Tamborero, David and Cheng, Yuwei and Ryslik, Gregory A. 
and Lopez-Bigas, Nuria and Getz, Gad and Ding, Li and Raphael, Benjamin J.}, + month = feb, + year = {2015}, + keywords = {Computational biology and bioinformatics, Systems biology, cancer}, + pages = {106--114}, + file = {leiserson_et_al_2015_pan-cancer_network_analysis_identifies_combinations_of_rare_somatic_mutations.pdf:/home/joel/edu/articles/leiserson_et_al_2015_pan-cancer_network_analysis_identifies_combinations_of_rare_somatic_mutations.pdf:application/pdf;leiserson_et_al_2015_pan-cancer_network_analysis_identifies_combinations_of_rare_somatic_mutations.pdf:/home/joel/edu/articles/leiserson_et_al_2015_pan-cancer_network_analysis_identifies_combinations_of_rare_somatic_mutations_2.pdf:application/pdf;Snapshot:/home/joel/Zotero/storage/CRE3S74K/ng.3168.html:text/html;Supplementary Table 24. Mutually exclusive and co-occurring test for pairwise genes within the pair of HotNet2 identified subnetworks across all pan-cancer samples.:/home/joel/Zotero/storage/MI2RV4JR/Leiserson et al. - 2015 - Pan-cancer network analysis identifies combination.xla:application/vnd.ms-excel;Supplementary Tables 1–23 and 25–39. Supplementary Tables 1–23 and 25–39.:/home/joel/Zotero/storage/SGNS4E43/Leiserson et al. - 2015 - Pan-cancer network analysis identifies combination.xla:application/vnd.ms-excel} +} + +@article{lawrence_mutational_2013, + title = {Mutational heterogeneity in cancer and the search for new cancer-associated genes}, + volume = {499}, + issn = {1476-4687}, + doi = {10.1038/nature12213}, + abstract = {Major international projects are underway that are aimed at creating a comprehensive catalogue of all the genes responsible for the initiation and progression of cancer. These studies involve the sequencing of matched tumour-normal samples followed by mathematical analysis to identify those genes in which mutations occur more frequently than expected by random chance. 
Here we describe a fundamental problem with cancer genome studies: as the sample size increases, the list of putatively significant genes produced by current analytical methods burgeons into the hundreds. The list includes many implausible genes (such as those encoding olfactory receptors and the muscle protein titin), suggesting extensive false-positive findings that overshadow true driver events. We show that this problem stems largely from mutational heterogeneity and provide a novel analytical methodology, MutSigCV, for resolving the problem. We apply MutSigCV to exome sequences from 3,083 tumour-normal pairs and discover extraordinary variation in mutation frequency and spectrum within cancer types, which sheds light on mutational processes and disease aetiology, and in mutation frequency across the genome, which is strongly correlated with DNA replication timing and also with transcriptional activity. By incorporating mutational heterogeneity into the analyses, MutSigCV is able to eliminate most of the apparent artefactual findings and enable the identification of genes truly associated with cancer.}, + language = {eng}, + number = {7457}, + journal = {Nature}, + author = {Lawrence, Michael S. and Stojanov, Petar and Polak, Paz and Kryukov, Gregory V. and Cibulskis, Kristian and Sivachenko, Andrey and Carter, Scott L. and Stewart, Chip and Mermel, Craig H. and Roberts, Steven A. and Kiezun, Adam and Hammerman, Peter S. and McKenna, Aaron and Drier, Yotam and Zou, Lihua and Ramos, Alex H. and Pugh, Trevor J. and Stransky, Nicolas and Helman, Elena and Kim, Jaegil and Sougnez, Carrie and Ambrogio, Lauren and Nickerson, Elizabeth and Shefler, Erica and Cortés, Maria L. and Auclair, Daniel and Saksena, Gordon and Voet, Douglas and Noble, Michael and DiCara, Daniel and Lin, Pei and Lichtenstein, Lee and Heiman, David I. and Fennell, Timothy and Imielinski, Marcin and Hernandez, Bryan and Hodis, Eran and Baca, Sylvan and Dulak, Austin M. 
and Lohr, Jens and Landau, Dan-Avi and Wu, Catherine J. and Melendez-Zajgla, Jorge and Hidalgo-Miranda, Alfredo and Koren, Amnon and McCarroll, Steven A. and Mora, Jaume and Lee, Ryan S. and Crompton, Brian and Onofrio, Robert and Parkin, Melissa and Winckler, Wendy and Ardlie, Kristin and Gabriel, Stacey B. and Roberts, Charles W. M. and Biegel, Jaclyn A. and Stegmaier, Kimberly and Bass, Adam J. and Garraway, Levi A. and Meyerson, Matthew and Golub, Todd R. and Gordenin, Dmitry A. and Sunyaev, Shamil and Lander, Eric S. and Getz, Gad}, + month = jul, + year = {2013}, + pmid = {23770567}, + pmcid = {PMC3919509}, + keywords = {Humans, Neoplasms, Artifacts, DNA Replication Timing, Exome, False Positive Reactions, Gene Expression, Genetic Heterogeneity, Genome, Human, Lung Neoplasms, Mutation, Mutation Rate, Neoplasms, Squamous Cell, Oncogenes, Reproducibility of Results, Sample Size}, + pages = {214--218} +} + +@article{szklarczyk_string_2015, + title = {{STRING} v10: protein-protein interaction networks, integrated over the tree of life}, + volume = {43}, + issn = {1362-4962}, + shorttitle = {{STRING} v10}, + doi = {10.1093/nar/gku1003}, + abstract = {The many functional partnerships and interactions that occur between proteins are at the core of cellular processing and their systematic characterization helps to provide context in molecular systems biology. However, known and predicted interactions are scattered over multiple resources, and the available data exhibit notable differences in terms of quality and completeness. The STRING database (http://string-db.org) aims to provide a critical assessment and integration of protein-protein interactions, including direct (physical) as well as indirect (functional) associations. The new version 10.0 of STRING covers more than 2000 organisms, which has necessitated novel, scalable algorithms for transferring interaction information between organisms. 
For this purpose, we have introduced hierarchical and self-consistent orthology annotations for all interacting proteins, grouping the proteins into families at various levels of phylogenetic resolution. Further improvements in version 10.0 include a completely redesigned prediction pipeline for inferring protein-protein associations from co-expression data, an API interface for the R computing environment and improved statistical analysis for enrichment tests in user-provided networks.}, + language = {eng}, + number = {Database issue}, + journal = {Nucleic Acids Research}, + author = {Szklarczyk, Damian and Franceschini, Andrea and Wyder, Stefan and Forslund, Kristoffer and Heller, Davide and Huerta-Cepas, Jaime and Simonovic, Milan and Roth, Alexander and Santos, Alberto and Tsafou, Kalliopi P. and Kuhn, Michael and Bork, Peer and Jensen, Lars J. and von Mering, Christian}, + month = jan, + year = {2015}, + pmid = {25352553}, + pmcid = {PMC4383874}, + keywords = {Proteins, Databases, Protein, Gene Expression Profiling, Internet, Protein Interaction Mapping, Software}, + pages = {D447--452} +} + +@article{orchard_mintact_2014, + title = {The {MIntAct} project--{IntAct} as a common curation platform for 11 molecular interaction databases}, + volume = {42}, + issn = {1362-4962}, + doi = {10.1093/nar/gkt1115}, + abstract = {IntAct (freely available at http://www.ebi.ac.uk/intact) is an open-source, open data molecular interaction database populated by data either curated from the literature or from direct data depositions. IntAct has developed a sophisticated web-based curation tool, capable of supporting both IMEx- and MIMIx-level curation. This tool is now utilized by multiple additional curation teams, all of whom annotate data directly into the IntAct database. Members of the IntAct team supply appropriate levels of training, perform quality control on entries and take responsibility for long-term data maintenance. 
Recently, the MINT and IntAct databases decided to merge their separate efforts to make optimal use of limited developer resources and maximize the curation output. All data manually curated by the MINT curators have been moved into the IntAct database at EMBL-EBI and are merged with the existing IntAct dataset. Both IntAct and MINT are active contributors to the IMEx consortium (http://www.imexconsortium.org).}, + language = {eng}, + number = {Database issue}, + journal = {Nucleic Acids Research}, + author = {Orchard, Sandra and Ammari, Mais and Aranda, Bruno and Breuza, Lionel and Briganti, Leonardo and Broackes-Carter, Fiona and Campbell, Nancy H. and Chavali, Gayatri and Chen, Carol and del-Toro, Noemi and Duesbury, Margaret and Dumousseau, Marine and Galeota, Eugenia and Hinz, Ursula and Iannuccelli, Marta and Jagannathan, Sruthi and Jimenez, Rafael and Khadake, Jyoti and Lagreid, Astrid and Licata, Luana and Lovering, Ruth C. and Meldal, Birgit and Melidoni, Anna N. and Milagros, Mila and Peluso, Daniele and Perfetto, Livia and Porras, Pablo and Raghunath, Arathi and Ricard-Blum, Sylvie and Roechert, Bernd and Stutz, Andre and Tognolli, Michael and van Roey, Kim and Cesareni, Gianni and Hermjakob, Henning}, + month = jan, + year = {2014}, + pmid = {24234451}, + pmcid = {PMC3965093}, + keywords = {Databases, Protein, Internet, Protein Interaction Mapping, Software}, + pages = {D358--363} +} + +@article{mermel_gistic2.0_2011, + title = {{GISTIC}2.0 facilitates sensitive and confident localization of the targets of focal somatic copy-number alteration in human cancers}, + volume = {12}, + issn = {1474-760X}, + doi = {10.1186/gb-2011-12-4-r41}, + abstract = {We describe methods with enhanced power and specificity to identify genes targeted by somatic copy-number alterations (SCNAs) that drive cancer growth. By separating SCNA profiles into underlying arm-level and focal alterations, we improve the estimation of background rates for each category. 
We additionally describe a probabilistic method for defining the boundaries of selected-for SCNA regions with user-defined confidence. Here we detail this revised computational approach, GISTIC2.0, and validate its performance in real and simulated datasets.}, + language = {eng}, + number = {4}, + journal = {Genome Biology}, + author = {Mermel, Craig H. and Schumacher, Steven E. and Hill, Barbara and Meyerson, Matthew L. and Beroukhim, Rameen and Getz, Gad}, + year = {2011}, + pmid = {21527027}, + pmcid = {PMC3218867}, + keywords = {Humans, Neoplasms, Software, Algorithms, Computational Biology, Computer Simulation, Gene Dosage, Models, Theoretical, Tumor Suppressor Proteins}, + pages = {R41} +} + +@article{dees_music:_2012, + title = {{MuSiC}: identifying mutational significance in cancer genomes}, + volume = {22}, + issn = {1549-5469}, + shorttitle = {{MuSiC}}, + doi = {10.1101/gr.134635.111}, + abstract = {Massively parallel sequencing technology and the associated rapidly decreasing sequencing costs have enabled systemic analyses of somatic mutations in large cohorts of cancer cases. Here we introduce a comprehensive mutational analysis pipeline that uses standardized sequence-based inputs along with multiple types of clinical data to establish correlations among mutation sites, affected genes and pathways, and to ultimately separate the commonly abundant passenger mutations from the truly significant events. In other words, we aim to determine the Mutational Significance in Cancer (MuSiC) for these large data sets. The integration of analytical operations in the MuSiC framework is widely applicable to a broad set of tumor types and offers the benefits of automation as well as standardization. Herein, we describe the computational structure and statistical underpinnings of the MuSiC pipeline and demonstrate its performance using 316 ovarian cancer samples from the TCGA ovarian cancer project. 
MuSiC correctly confirms many expected results, and identifies several potentially novel avenues for discovery.}, + language = {eng}, + number = {8}, + journal = {Genome Research}, + author = {Dees, Nathan D. and Zhang, Qunyuan and Kandoth, Cyriac and Wendl, Michael C. and Schierding, William and Koboldt, Daniel C. and Mooney, Thomas B. and Callaway, Matthew B. and Dooling, David and Mardis, Elaine R. and Wilson, Richard K. and Ding, Li}, + month = aug, + year = {2012}, + pmid = {22759861}, + pmcid = {PMC3409272}, + keywords = {Female, Humans, Mutation, Reproducibility of Results, Software, Algorithms, Computational Biology, BRCA1 Protein, DNA Mutational Analysis, Genes, Neoplasm, Molecular Sequence Annotation, Ovarian Neoplasms}, + pages = {1589--1598} +} + +@article{vogelstein_cancer_2013, + title = {Cancer genome landscapes}, + volume = {339}, + issn = {1095-9203}, + doi = {10.1126/science.1235122}, + abstract = {Over the past decade, comprehensive sequencing efforts have revealed the genomic landscapes of common forms of human cancer. For most cancer types, this landscape consists of a small number of "mountains" (genes altered in a high percentage of tumors) and a much larger number of "hills" (genes altered infrequently). To date, these studies have revealed {\textasciitilde}140 genes that, when altered by intragenic mutations, can promote or "drive" tumorigenesis. A typical tumor contains two to eight of these "driver gene" mutations; the remaining mutations are passengers that confer no selective growth advantage. Driver genes can be classified into 12 signaling pathways that regulate three core cellular processes: cell fate, cell survival, and genome maintenance. A better understanding of these pathways is one of the most pressing needs in basic cancer research. 
Even now, however, our knowledge of cancer genomes is sufficient to guide the development of more effective approaches for reducing cancer morbidity and mortality.}, + language = {eng}, + number = {6127}, + journal = {Science}, + author = {Vogelstein, Bert and Papadopoulos, Nickolas and Velculescu, Victor E. and Zhou, Shibin and Diaz, Luis A. and Kinzler, Kenneth W.}, + month = mar, + year = {2013}, + pmid = {23539594}, + pmcid = {PMC3749880}, + keywords = {Humans, Neoplasms, Genetic Heterogeneity, Genome, Human, Mutation, Genes, Neoplasm, Cell Transformation, Neoplastic, Mutagenesis, Signal Transduction}, + pages = {1546--1558} +} + +@article{hanahan_hallmarks_2011, + title = {Hallmarks of cancer: the next generation}, + volume = {144}, + issn = {1097-4172}, + shorttitle = {Hallmarks of cancer}, + doi = {10.1016/j.cell.2011.02.013}, + abstract = {The hallmarks of cancer comprise six biological capabilities acquired during the multistep development of human tumors. The hallmarks constitute an organizing principle for rationalizing the complexities of neoplastic disease. They include sustaining proliferative signaling, evading growth suppressors, resisting cell death, enabling replicative immortality, inducing angiogenesis, and activating invasion and metastasis. Underlying these hallmarks are genome instability, which generates the genetic diversity that expedites their acquisition, and inflammation, which fosters multiple hallmark functions. Conceptual progress in the last decade has added two emerging hallmarks of potential generality to this list-reprogramming of energy metabolism and evading immune destruction. In addition to cancer cells, tumors exhibit another dimension of complexity: they contain a repertoire of recruited, ostensibly normal cells that contribute to the acquisition of hallmark traits by creating the "tumor microenvironment."
Recognition of the widespread applicability of these concepts will increasingly affect the development of new means to treat human cancer.}, + language = {eng}, + number = {5}, + journal = {Cell}, + author = {Hanahan, Douglas and Weinberg, Robert A.}, + month = mar, + year = {2011}, + pmid = {21376230}, + keywords = {Animals, Humans, Neoplasms, Signal Transduction, Genomic Instability, Neoplasm Invasiveness, Stromal Cells}, + pages = {646--674} +} + +@article{network_corrigendum:_2013, + title = {Corrigendum: {Comprehensive} genomic characterization defines human glioblastoma genes and core pathways}, + volume = {494}, + issn = {1476-4687}, + shorttitle = {Corrigendum}, + doi = {10.1038/nature11903}, + language = {eng}, + number = {7438}, + journal = {Nature}, + author = {{The Cancer Genome Atlas Research Network}}, + month = feb, + year = {2013}, + pmid = {23389443}, + pages = {506} +} + +@article{printz_aacr_2017, + title = {{AACR} releases large cancer genomic data set from project {GENIE}}, + volume = {123}, + issn = {1097-0142}, + doi = {10.1002/cncr.30755}, + language = {eng}, + number = {10}, + journal = {Cancer}, + author = {Printz, Carrie}, + month = may, + year = {2017}, + pmid = {28475245}, + pages = {1685} +} + +@article{khurana_interpretation_2013, + title = {Interpretation of Genomic Variants Using a Unified Biological Network Approach}, + volume = {9}, + issn = {1553-7358}, + url = {http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1002886}, + doi = {10.1371/journal.pcbi.1002886}, + abstract = {Author Summary The number of personal genomes sequenced has grown rapidly over the last few years and is likely to grow further. In order to use the DNA sequence variants amongst individuals for personalized medicine, we need to understand the functional impact of these variants.
Deleterious variants in genes can have a wide spectrum of global effects, ranging from fatal for essential genes to no obvious damaging effect for loss-of-function tolerant genes. The global effect of a gene mutation is largely governed by the diverse biological networks in which the gene participates. Since genes participate in many networks, no singular network captures the global picture of gene interactions. Here we integrate the diverse modes of gene interactions (regulatory, genetic, phosphorylation, signaling, metabolic and physical protein-protein interactions) to create a unified biological network. We then exploit the unique properties of loss-of-function tolerant and essential genes in this unified network to build a computational model that can predict global perturbation caused by deleterious mutations in all genes. Our model can distinguish between these two gene sets with high accuracy and we further show that it can be used for interpretation of variants involved in Mendelian diseases and in complex disorders probed by genome-wide association studies.}, + number = {3}, + journal = {PLOS Computational Biology}, + author = {Khurana, Ekta and Fu, Yao and Chen, Jieming and Gerstein, Mark}, + month = mar, + year = {2013}, + keywords = {Genetic networks, Protein interaction networks, Protein interactions, Protein-protein interactions, Centrality, Enzyme metabolism, Metabolic networks, Protein metabolism}, + pages = {e1002886}, + file = {khurana_gerstein_2013_interpretation_of_genomic_variants_using_a_unified_biological_network_approach.pdf:/home/joel/edu/articles/khurana_gerstein_2013_interpretation_of_genomic_variants_using_a_unified_biological_network_approach.pdf:application/pdf;Snapshot:/home/joel/Zotero/storage/MBUKZXSW/article.html:text/html} +} + +@article{cancer_genome_atlas_research_network_integrated_2011, + title = {Integrated genomic analyses of ovarian carcinoma}, + volume = {474}, + issn = {1476-4687}, + doi = {10.1038/nature10166}, + 
abstract = {A catalogue of molecular aberrations that cause ovarian cancer is critical for developing and deploying therapies that will improve patients' lives. The Cancer Genome Atlas project has analysed messenger RNA expression, microRNA expression, promoter methylation and DNA copy number in 489 high-grade serous ovarian adenocarcinomas and the DNA sequences of exons from coding genes in 316 of these tumours. Here we report that high-grade serous ovarian cancer is characterized by TP53 mutations in almost all tumours (96\%); low prevalence but statistically recurrent somatic mutations in nine further genes including NF1, BRCA1, BRCA2, RB1 and CDK12; 113 significant focal DNA copy number aberrations; and promoter methylation events involving 168 genes. Analyses delineated four ovarian cancer transcriptional subtypes, three microRNA subtypes, four promoter methylation subtypes and a transcriptional signature associated with survival duration, and shed new light on the impact that tumours with BRCA1/2 (BRCA1 or BRCA2) and CCNE1 aberrations have on survival. 
Pathway analyses suggested that homologous recombination is defective in about half of the tumours analysed, and that NOTCH and FOXM1 signalling are involved in serous ovarian cancer pathophysiology.}, + language = {eng}, + number = {7353}, + journal = {Nature}, + author = {{Cancer Genome Atlas Research Network}}, + month = jun, + year = {2011}, + pmid = {21720365}, + pmcid = {PMC3163504}, + keywords = {Female, Genomics, Humans, Middle Aged, Mutation, Gene Expression Profiling, Gene Dosage, Ovarian Neoplasms, Aged, Carcinoma, DNA Methylation, Gene Expression Regulation, Neoplastic, MicroRNAs, RNA, Messenger}, + pages = {609--615} +} + +@article{crijns_survival-related_2009, + title = {Survival-related profile, pathways, and transcription factors in ovarian cancer}, + volume = {6}, + issn = {1549-1676}, + doi = {10.1371/journal.pmed.1000024}, + abstract = {BACKGROUND: Ovarian cancer has a poor prognosis due to advanced stage at presentation and either intrinsic or acquired resistance to classic cytotoxic drugs such as platinum and taxoids. Recent large clinical trials with different combinations and sequences of classic cytotoxic drugs indicate that further significant improvement in prognosis by this type of drugs is not to be expected. Currently a large number of drugs, targeting dysregulated molecular pathways in cancer cells have been developed and are introduced in the clinic. A major challenge is to identify those patients who will benefit from drugs targeting these specific dysregulated pathways.The aims of our study were (1) to develop a gene expression profile associated with overall survival in advanced stage serous ovarian cancer, (2) to assess the association of pathways and transcription factors with overall survival, and (3) to validate our identified profile and pathways/transcription factors in an independent set of ovarian cancers. 
+METHODS AND FINDINGS: According to a randomized design, profiling of 157 advanced stage serous ovarian cancers was performed in duplicate using approximately 35,000 70-mer oligonucleotide microarrays. A continuous predictor of overall survival was built taking into account well-known issues in microarray analysis, such as multiple testing and overfitting. A functional class scoring analysis was utilized to assess pathways/transcription factors for their association with overall survival. The prognostic value of genes that constitute our overall survival profile was validated on a fully independent, publicly available dataset of 118 well-defined primary serous ovarian cancers. Furthermore, functional class scoring analysis was also performed on this independent dataset to assess the similarities with results from our own dataset. An 86-gene overall survival profile discriminated between patients with unfavorable and favorable prognosis (median survival, 19 versus 41 mo, respectively; permutation p-value of log-rank statistic = 0.015) and maintained its independent prognostic value in multivariate analysis. Genes that composed the overall survival profile were also able to discriminate between the two risk groups in the independent dataset. In our dataset 17/167 pathways and 13/111 transcription factors were associated with overall survival, of which 16 and 12, respectively, were confirmed in the independent dataset. +CONCLUSIONS: Our study provides new clues to genes, pathways, and transcription factors that contribute to the clinical outcome of serous ovarian cancer and might be exploited in designing new treatment strategies.}, + language = {eng}, + number = {2}, + journal = {PLoS medicine}, + author = {Crijns, Anne P. G. and Fehrmann, Rudolf S. N. and de Jong, Steven and Gerbens, Frans and Meersma, Gert Jan and Klip, Harry G. and Hollema, Harry and Hofstra, Robert M. W. and te Meerman, Gerard J. and de Vries, Elisabeth G. E. and van der Zee, Ate G. 
J.}, + month = feb, + year = {2009}, + pmid = {19192944}, + pmcid = {PMC2634794}, + keywords = {Adult, Female, Humans, Middle Aged, Gene Expression Profiling, Ovarian Neoplasms, Aged, Gene Expression Regulation, Neoplastic, Aged, 80 and over, Kaplan-Meier Estimate, Metabolic Networks and Pathways, Survival Analysis, Transcription Factors}, + pages = {e24} +} \ No newline at end of file diff --git a/doc/tutorial-draft.md b/doc/tutorial-draft.md new file mode 100644 index 0000000..34780c3 --- /dev/null +++ b/doc/tutorial-draft.md @@ -0,0 +1,105 @@ +--- +title: Rete Tutorial +bibliography: ./tutorial-draft.bib +... + +# Scientific problem +- Increased understanding of which gene mutations contribute to cancer development is crucial both for genetic screening for preventive purposes and drug development to slow down disease progression. +- There are many approaches to identify single genes that are significantly mutated across different cancers [@dees_music:_2012; @lawrence_mutational_2013; @mermel_gistic2.0_2011]. However, searching for significant mutations at the level of individual genes fails to identify functionally equivalent mutations that might occur in different genes. + - For example, mutations to different genes could inhibit the same pathway, resulting in the pathway being altered in a high number of cancers although each individual gene is mutated at a low frequency. + - This is especially troublesome in cancer development, where mutational heterogeneity is observed to be high across cancer samples and even between tumors of the same cancer [@vogelstein_cancer_2013; @hanahan_hallmarks_2011]. + - On the individual gene level, these rare somatic mutations, which are important for cancer development, can occur with the same frequency as, and be hard to distinguish from, non-essential passenger mutations.
+- To combat this, many current approaches [@printz_aacr_2017; @network_corrigendum:_2013] have tried to identify pathways of interest and look for enrichment of mutations within these pathways. + - However, cross-talk between pathways makes it difficult to assign genes strictly only to one pathway, or even one biological function. + - Aside from incorrect pathway assignment of genes, limiting the search to specific pathways might introduce bias which compromises the results and misses key regulatory genes. This is especially concerning as current pathway annotation of genes is incomplete. + +## Alternate approach +- To overcome these difficulties, our approach ([Rete](https://github.com/hyginn/rete)) is based on the methods developed by Leiserson _et al._ [@leiserson_pan-cancer_2015], which divide genes into subnetworks based on their known physical interactions with each other, rather than assigning them to any particular biological pathway. Such subnetworks can span multiple signalling pathways and render a more complete picture of what biological functions are targeted by cancerous mutations, and how such mutations can affect multiple pathways simultaneously. +- To find these commonly altered subnetworks, significantly mutated genes and their connectivity to other genes need to be identified. +- At a high level, this process proceeds as follows: + 1. Retrieve information on the mutational frequency of genes in cancer samples, and the known interactions of the protein products of these genes. + 2. Distribute the mutational frequency from a single gene across the network of genes. + 3. Identify significant subnetworks in this distributed gene network graph. + +# Input data +For validation, we use the same data set used by Leiserson _et al._, consisting of ovarian adenocarcinoma samples from the TCGA [@cancer_genome_atlas_research_network_integrated_2011]. This data set includes 489 high-grade serous ovarian adenocarcinomas.
Specifically, we select one of the four subnetworks discovered by Vandin _et al._ [@vandin_discovery_2011], subnetwork S1. The genes in this network have independently been found to overlap with genes in the focal adhesion pathway and to be significantly correlated with overall survival in ovarian adenocarcinoma, beyond what is expected by chance [@crijns_survival-related_2009]. This data set also contains outliers, or cold nodes, as identified by Vandin _et al._ + +| Reported Symbol | HGNC gene symbol | UniProt ID | Approved Name | Note | +|-----------------|------------------|------------|----------------------------------------|------| +| ADAM9 | ADAM9 | Q13443 | ADAM metallopeptidase domain 9 | - | +| ITGAV | ITGAV | P06756 | integrin subunit alpha V | - | +| ITGA6 | ITGA6 | P23229 | integrin subunit alpha 6 | - | +| ITGA3 | ITGA3 | P26006 | integrin subunit alpha 3 | - | +| ITGB5 | ITGB5 | P18084 | integrin subunit beta 5 | - | +| LIMK1 | LIMK1 | P53667 | LIM domain kinase 1 | - | +| FGFR2 | FGFR2 | P21802 | fibroblast growth factor receptor 2 | - | +| DLST | DLST | P36957 | dihydrolipoamide S-succinyltransferase | - | +| UMPS | UMPS | P11172 | uridine monophosphate synthetase | - | +| PAK4 | PAK4 | O96013 | p21 (RAC1) activated kinase 4 | - | +| GATAD2A | GATAD2A | Q86YP4 | GATA zinc finger domain containing 2A | - | +| C2orf65 | M1AP | Q8TC57 | meiosis 1 associated protein | Outlier | +| DOK1 | DOK1 | Q99704 | docking protein 1 | Outlier | +| DQX1 | DQX1 | Q8TE96 | DEAQ-box RNA dependent ATPase 1 | Outlier | +| LOXL3 | LOXL3 | P58215 | lysyl oxidase like 3 | Outlier | +| SEMA4F | SEMA4F | O95754 | semaphorin 4F | Outlier | + +**Table 1** All the members of subnetwork S1 from Vandin _et al._ + +## Mutations +- Mutations are often categorized as Single Nucleotide Variations (SNVs) or Copy Number Aberrations (CNAs). SNVs are changes in just a single nucleotide, while CNAs can be the multiplication of anything from two base pairs to entire genes.
These types of mutations are common and only a few of them are associated with cancerous outcomes. +- Data of CNAs known to be associated with different cancers can be downloaded from the [Project Genie](http://www.aacr.org/Research/Research/Pages/aacr-project-genie.aspx) and [Firehose](http://archive.broadinstitute.org/cancer/cga/Firehose) databases. +- Data of SNVs important for cancer development can be downloaded from the Project Genie, [COSMIC](http://cancer.sanger.ac.uk/cosmic) and [The Cancer Genome Atlas](https://cancergenome.nih.gov/). +- Examples of the input data format of the SNV and CNA data sets can be found in the [Rete GitHub repository](https://github.com/hyginn/rete/tree/master/inst/extdata). + +## Protein interactions +- In addition to knowing which genes are commonly mutated in cancers, it is crucial to know how these genes can affect other genes and cellular functions. +- Most gene communication is mediated by interactions of their protein products, so this information can be approximated by our current knowledge of which proteins can bind each other and could thus possibly interact in biologically meaningful ways. +- Protein-protein interactions (PPI) can be downloaded in the form of PPI network files, for example from STRING [@szklarczyk_string_2015] and MultiNet [@khurana_interpretation_2013]. +- Examples of the input data format for the STRING PPI data can be found in the [Rete GitHub repository](https://github.com/hyginn/rete/tree/master/inst/extdata). + +# Analysis +- IMPORT-M Read in mutation data of both SNVs and CNAs from their respective online repositories. +- FILTER Filter out mutations that are not associated with the cancerous phenotype, such as hypermutator genes which are known to be altered under conditions not associated with disease progression. +- COMBINE Combine the different types of mutation data into a common table containing the given mutation rate for all genes of interest. 
+- SCORE Assign genes a score, referred to as heat, based on their mutation frequency by scoring the combined dataset. Existing algorithms, such as MutSigCV [@lawrence_mutational_2013], can be used for this task. +- IMPORT-M Read in the PPI data from the online databases. +- ANNOTATE Annotate each vertex (protein) in the PPI network with the heat score from the corresponding gene. +- DIFFUSE Distribute the heat from one protein to its neighboring proteins according to the PPI connections. The heat distribution can be in the form of a random walk to all neighbours and a probability for restarting the walk from the source. The heat flowing through an edge at each iteration is added to the total influence score of this edge. This process will highlight which paths in the PPI network can be influenced by mutations of a single gene. +- THRESH Calculate the thresholds for FINDSUB to consider an edge for removal due to not being part of a significantly mutated path in the network. +- FINDSUB Extract the significantly mutated subnetworks from each graph by removing edges that are below the threshold calculated from THRESH. After removal of edges, subnetworks are extracted by finding the strongly connected components in the remaining network graph. Each of these networks can be assigned an aggregated heat score by summing the influence of the individual edges in the network. +- CONSENSE Find the consensus networks from all the subnetworks returned by FINDSUB, by weighing each edge by the number of subnetworks where it is present. Also identify linker nodes, which are genes that exist in multiple networks and might enable crosstalk between biological pathways. +- Downstream analyses of statistical significance and comparison to known signalling pathways by pathway enrichment scores. + +# Results and interpretation +- The final results will be a list of genes constituting subnetworks and an indication of which genes facilitate crosstalk between these networks. 
+ - These genes have been derived from a combination of the number of cancer samples that have mutations in the genes, and the interactions between genes in the subnetwork according to the PPI network. + - As our method of edge removal uses the same heat diffusion approach as in Leiserson _et al._, we expect to arrive at similar results for the ovarian adenocarcinoma validation data set. + - However, our implementation is flexible, in that it allows for alternate methods of edge removal to be implemented, including machine learning approaches, which could be an extension to better identify what constitutes a relevant edge. + - Machine learning approaches could be more successful in detecting edges that are relevant in driving cancer development, by integrating additional information about the connected nodes. Such node features could include the betweenness centrality, degree, semantic similarity of GO terms, and independent validation in other types of experiments, such as co-expression data sets. + - A key challenge with machine learning approaches is to identify correctly labelled negative and positive data to use for training. For negative data, there are initiatives like the negatome which lists proteins known _not_ to interact, and positive data can be extracted from individual studies confirming the importance for disease progress of communications over a specific edge. + - The linker genes that enable crosstalk between subnetworks might be of specific interest as key facilitators of the disease progression. +- The consensus subnetworks will contain genes that can be compared to biologically known pathways to see if they correspond closely to already charted biological functions, and if they indicate significant communication between pathways. +- The strength of this approach is primarily to pick up networks and pathways that are commonly affected by heterogeneous cancer mutations, which could not be detected at the single gene level. 
Here, the overall network mutations that were previously distributed across the many genes in the network, will be aggregated and the effect of mutating any important gene in this network can be assessed. + - Conversely, one could miss single genes that are highly important by themselves, owing to problems with the incomplete PPI networks (this gene is not well connected, and consequently cannot create significantly mutated subnetworks of large enough size to be considered in the analyses). To remedy this, one could use this approach in combination with existing single mutation scoring approaches such as MutSig and combine the results. +- There might be nodes that are not physically connected and thus not annotated in the PPI network, but which might share a functional path of edges via other nodes. Such nodes include subunits of the ribosome which not all bind to each other, but function together as a complex. If these complexes are large, the heat on each individual edge could be minor and potentially not picked up through the random walk heat diffusion. Likewise, if there are many outgoing edges from the same complex, but from different subunits, these might receive a low score individually. In these cases, it could be beneficial to consider a strategy where known complexes are grouped together as one network component instead of many smaller ones. Complex data could be obtained from the IntAct database [@orchard_mintact_2014]. +- Where there is a hub cancer gene, like p53, it will become extremely hot and diffuse heat to all neighbours, generating a star shaped network that is likely not of biological significance. This is largely mitigated by the restart probability of the random walk heat diffusion, but there might still be false positives with this distinct network shape, which is important to keep in mind when exploring the results. 
+- As always, the quality of input data is crucial for the accuracy of the suggested genes and one should be careful drawing conclusions about genes where the PPI information is known to be especially incomplete. +- Note that there are other ways of transferring information between genes than via protein to protein interactions, for example protein to gene interactions, as is the case for transcription factor proteins binding to the DNA and regulating gene expression. These interactions could be added as a future extension of the network, through the analyses of ChIP-seq data. + - Possible interaction as indicated by a PPI network also does not necessarily mean biologically meaningful interactions. Gene co-expression data, tissue/cell co-localization studies, and gene location on the genome could be used to increase the confidence that the possible interaction between two proteins would produce biologically useful information. + +# References +Dees, N.D., Zhang, Q., Kandoth, C., Wendl, M.C., Schierding, W., Koboldt, D.C., Mooney, T.B., Callaway, M.B., Dooling, D., Mardis, E.R., et al. (2012). MuSiC: identifying mutational significance in cancer genomes. Genome Res. 22, 1589–1598. +Cancer Genome Atlas Research Network (2011). Integrated genomic analyses of ovarian carcinoma. Nature 474, 609–615. +Cancer Genome Atlas Research Network (2011). Integrated genomic analyses of ovarian carcinoma. Nature 474, 609–615. +Crijns, A.P.G., Fehrmann, R.S.N., de Jong, S., Gerbens, F., Meersma, G.J., Klip, H.G., Hollema, H., Hofstra, R.M.W., te Meerman, G.J., de Vries, E.G.E., et al. (2009). Survival-related profile, pathways, and transcription factors in ovarian cancer. PLoS Med. 6, e24. +Hanahan, D., and Weinberg, R.A. (2011). Hallmarks of cancer: the next generation. Cell 144, 646–674. +Khurana, E., Fu, Y., Chen, J., and Gerstein, M. (2013). Interpretation of Genomic Variants Using a Unified Biological Network Approach. PLOS Computational Biology 9, e1002886. 
+Lawrence, M.S., Stojanov, P., Polak, P., Kryukov, G.V., Cibulskis, K., Sivachenko, A., Carter, S.L., Stewart, C., Mermel, C.H., Roberts, S.A., et al. (2013). Mutational heterogeneity in cancer and the search for new cancer-associated genes. Nature 499, 214–218. +Leiserson, M.D.M., Vandin, F., Wu, H.-T., Dobson, J.R., Eldridge, J.V., Thomas, J.L., Papoutsaki, A., Kim, Y., Niu, B., McLellan, M., et al. (2015). Pan-cancer network analysis identifies combinations of rare somatic mutations across pathways and protein complexes. Nat Genet 47, 106–114. +Mermel, C.H., Schumacher, S.E., Hill, B., Meyerson, M.L., Beroukhim, R., and Getz, G. (2011). GISTIC2.0 facilitates sensitive and confident localization of the targets of focal somatic copy-number alteration in human cancers. Genome Biol. 12, R41. +Cancer Genome Atlas Research Network (2013). Corrigendum: Comprehensive genomic characterization defines human glioblastoma genes and core pathways. Nature 494, 506. +Orchard, S., Ammari, M., Aranda, B., Breuza, L., Briganti, L., Broackes-Carter, F., Campbell, N.H., Chavali, G., Chen, C., del-Toro, N., et al. (2014). The MIntAct project—IntAct as a common curation platform for 11 molecular interaction databases. Nucleic Acids Res. 42, D358–363. +Printz, C. (2017). AACR releases large cancer genomic data set from project GENIE. Cancer 123, 1685. +Szklarczyk, D., Franceschini, A., Wyder, S., Forslund, K., Heller, D., Huerta-Cepas, J., Simonovic, M., Roth, A., Santos, A., Tsafou, K.P., et al. (2015). STRING v10: protein-protein interaction networks, integrated over the tree of life. Nucleic Acids Res. 43, D447–452. +Vandin, F., Clay, P., Upfal, E., and Raphael, B.J. (2011). Discovery of mutated subnetworks associated with clinical data in cancer. In Biocomputing 2012, (WORLD SCIENTIFIC), pp. 55–66. +Vogelstein, B., Papadopoulos, N., Velculescu, V.E., Zhou, S., Diaz, L.A., and Kinzler, K.W. (2013). Cancer genome landscapes. Science 339, 1546–1558.