diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..107a846 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +.gitignore +/importer/nbproject/private/ +/importer/dist/ +/importer/build/ +/importer/target/ +/importer/src/main/resources/application.properties +/importer/nbproject/ +/importer/nb-configuration.xml diff --git a/docs/File-Formats.md b/docs/File-Formats.md new file mode 100644 index 0000000..b1239b5 --- /dev/null +++ b/docs/File-Formats.md @@ -0,0 +1,1045 @@ +* [Introduction](#introduction) +* [Formats](#formats) + * [Cancer Study](#cancer-study) + * [Cancer Type](#cancer-type) + * [Clinical Data](#clinical-data) + * [Discrete Copy Number Data](#discrete-copy-number-data) + * [Continuous Copy Number Data](#continuous-copy-number-data) + * [Segmented Data](#segmented-data) + * [Expression Data](#expression-data) + * [Mutation Data](#mutation-data) + * [Fusion Data](#fusion-data) + * [Methylation Data](#methylation-data) + * [RPPA Data](#rppa-data) + * [Case Lists](#case-lists) + * [Timeline Data](#timeline-data) + * [Gistic Data](#gistic-data) + * [Mutsig Data](#mutsig-data) + * [Gene Panel Data](#gene-panel-data) + +# Introduction + +This page describes the file formats that cancer study data should assume in order to be successfully imported into the database. Unless otherwise noted, all data files are in tabular-TSV (tab separated value) format and have an associated metadata file which is in a multiline record format. The metadata and data files should follow a [few rules documented at the Data Loading page](Data-Loading.md#preparing-study-data). + +# Formats + +## Cancer Study + +As described in the [Data Loading tool](Data-Loading.md) page, the following file is needed to describe the cancer study: + +#### Meta file +This file contains metadata about the cancer study. The file contains the following fields: + +1. **type_of_cancer**: The cancer type abbreviation, e.g., "brca". 
This should be the same cancer type as specified in the meta_cancer_type.txt file, if available. +2. **cancer_study_identifier**: A string used to uniquely identify this cancer study within the database, e.g., "brca_joneslab_2013". +3. **name**: The name of the cancer study, e.g., "Breast Cancer (Jones Lab 2013)". +4. **description**: A description of the cancer study, e.g., "Comprehensive profiling of 103 breast cancer samples. Generated by the Jones Lab 2013". This description may contain one or more URLs to relevant information. +5. **citation (optional)**: A relevant citation, e.g., "TCGA, Nature 2012". +6. **pmid (optional)**: A relevant pubmed id. If used, the field citation has to be filled, too. +7. **short_name**: A short name used for display on various web pages within the cBioPortal, e.g., "BRCA (Jones)". +8. **groups (optional)**: When using an authenticating cBioPortal, lists the user-groups that are allowed access to this study. Multiple groups are separated with a semicolon ";". The study will be invisible to users not in _at least one_ of the listed groups, as if it wasn't loaded at all. Example: "PUBLIC;GDAC;SU2C-PI3K". See [User-Authorization](User-Authorization.md) for more information on groups. +9. **add_global_case_list (optional)**: set to 'true' if you would like the "All samples" case list to be generated automatically for you. See also [Case lists](#case-lists). + +##### Example +An example meta_study.txt file would be: +``` +type_of_cancer: brca +cancer_study_identifier: brca_joneslab_2013 +name: Breast Cancer (Jones Lab 2013) +short_name: BRCA (Jones) +description: Comprehensive profiling of 103 breast cancer samples. Generated by the Jones Lab 2013. +add_global_case_list: true +``` + +## Cancer Type +If the type_of_cancer specified in the meta_study.txt does not yet exist in the type_of_cancer database table, a meta_cancer_type.txt file is also mandatory. + +#### Meta file +The file is comprised of the following fields: + +1. 
**genetic_alteration_type**: CANCER_TYPE +2. **datatype**: CANCER_TYPE +3. **data_filename**: <your datafile> + +##### Example +An example meta_cancer_type.txt file would be: +``` +genetic_alteration_type: CANCER_TYPE +datatype: CANCER_TYPE +data_filename: cancer_type.txt +``` + +#### Data file +The file is comprised of the following columns in the order specified: + +1. **type_of_cancer**: The cancer type abbreviation, e.g., "brca". +2. **name**: The name of the cancer type, e.g., "Breast Invasive Carcinoma". +3. **clinical_trial_keywords**: A comma-separated list of keywords used to identify this study, e.g., "breast,breast invasive". +4. **dedicated_color**: CSS color name of the color associated with this cancer study, e.g., "HotPink". See [this list](https://www.w3.org/TR/css3-color/#svg-color) for supported names, and follow the [awareness ribbons](http://en.wikipedia.org/wiki/List_of_awareness_ribbons) color schema. This color is associated with the cancer study on various web pages within the cBioPortal. +5. **parent_type_of_cancer**: The `type_of_cancer` field of the cancer type of which this is a subtype, e.g., "Breast". :information_source: : you can set parent to `tissue`, which is the reserved word to place the given cancer type at "root" level in the "studies oncotree" that will be generated in the homepage (aka query page) of the portal. + +##### Example +An example record would be: +``` +brca<TAB>Breast Invasive Carcinoma<TAB>breast,breast invasive<TAB>HotPink<TAB>Breast +``` + +## Clinical Data + +The clinical data is used to capture both clinical attributes and the mapping between patient and sample ids. The software supports multiple samples per patient. + +As of March 2016, the clinical file is split into a patient file and a sample file. The *sample* file is required, whereas the *patient* file is optional. 
+ +#### Meta files +The two clinical metadata files (or just one metadata file if you choose to leave the *patient* file out) have to contain the following fields: + +1. **cancer_study_identifier**: same value specified in meta_study.txt +2. **genetic_alteration_type**: CLINICAL +3. **datatype**: PATIENT_ATTRIBUTES or SAMPLE_ATTRIBUTES +4. **data_filename**: <your datafile> + +##### Examples +An example metadata file, e.g. named meta_clinical_sample.txt, would be: +``` +cancer_study_identifier: brca_tcga_pub +genetic_alteration_type: CLINICAL +datatype: SAMPLE_ATTRIBUTES +data_filename: data_clinical_samples.txt +``` + +An example metadata file, e.g. named meta_clinical_patient.txt, would be: +``` +cancer_study_identifier: brca_tcga_pub +genetic_alteration_type: CLINICAL +datatype: PATIENT_ATTRIBUTES +data_filename: data_clinical_patients.txt +``` + +#### Data files +For both patients and samples, the clinical data file is a two-dimensional matrix with multiple clinical attributes. When the attributes are defined in the *patient* file they are considered to be patient attributes; when they are defined in the *sample* file they are considered to be sample attributes. + +The first four rows of the clinical data file contain tab-delimited metadata about the clinical attributes. These rows **have to start with a '#' symbol**. Each of these four rows contains a different type of information regarding each of the attributes that are defined in the fifth row: + +- Row 1: **The attribute Display Names**: The display name for each clinical attribute +- Row 2: **The attribute Descriptions**: Long(er) description of each clinical attribute +- Row 3: **The attribute Datatype**: The datatype of each clinical attribute (must be one of: STRING, NUMBER, BOOLEAN) +- Row 4: **The attribute Priority**: A number which indicates the importance of each attribute. 
In the future, higher priority attributes will appear in more prominent places than lower priority ones on relevant pages (such as the [Study View](http://www.cbioportal.org/study?id=brca_tcga)). A lower number indicates a higher priority. + +#### Example metadata rows +Below is an example of the first 4 rows with the respective metadata for the attributes defined in the 5th row. +``` +#Patient Identifier<TAB>Overall Survival Status<TAB>Overall Survival (Months)<TAB>Disease Free Status<TAB>Disease Free (Months)<TAB>... +#Patient identifier<TAB>Overall survival status<TAB>Overall survival in months since diagnosis<TAB>Disease free status<TAB>Disease free in months since treatment<TAB>... +#STRING<TAB>STRING<TAB>NUMBER<TAB>STRING<TAB>NUMBER<TAB>... +#1<TAB>1<TAB>1<TAB>1<TAB>1 +PATIENT_ID<TAB>OS_STATUS<TAB>OS_MONTHS<TAB>DFS_STATUS<TAB>DFS_MONTHS<TAB>... +.... +data - see examples below +.... +``` + +Following the metadata rows comes a tab-delimited list of clinical attributes (column headers). The sixth row is the first row to contain actual data. + +##### The patient file ##### + +The file containing the patient attributes has one **required** column: +- **PATIENT_ID (required)**: a unique patient ID. + +The following columns are used by the study view as well as the patient view. In the [study view](http://www.cbioportal.org/study?id=brca_tcga) they are used to create the survival plots. In the patient view they are used to add information to the [header](http://www.cbioportal.org/case.do?cancer_study_id=lgg_ucsf_2014&case_id=P05). +- **OS_STATUS**: Overall patient survival status + - Possible values: DECEASED, LIVING + - In the patient view, LIVING creates a green label, DECEASED a red label. 
+ - In visualisation of [Timeline data](#timeline-data), DECEASED will result in a new event of type STATUS +- **OS_MONTHS (required if OS_STATUS is DECEASED)**: Overall survival in months since initial diagnosis +- **DFS_STATUS**: Disease free status since initial treatment + - Possible values: DiseaseFree, Recurred/Progressed + - In the patient view, DiseaseFree creates a green label, Recurred/Progressed a red label. +- **DFS_MONTHS**: Disease free (months) since initial treatment + +These columns, when provided, add additional information to the patient description in the header: +- **PATIENT_DISPLAY_NAME**: Patient display name (string) +- **GENDER** or **SEX**: Gender or sex of the patient (string) +- **AGE**: Age at which the condition or disease was first diagnosed, in years (number) + +Optional attributes: +- **Other Clinical Attribute Headers**: Clinical attribute headers are free-form. You can add any additional clinical attribute and cBioPortal will add them to the database. Be sure to provide the correct `'Datatype'`, as described above, for optimal search, sorting, filtering (in [clinical data tab](http://www.cbioportal.org/study?id=brca_tcga#clinical)) and display. + +###### Example *patient* data file +``` +#Patient IdentifierOverall Survival StatusOverall Survival (Months)Disease Free StatusDisease Free (Months)... +#Patient identifierOverall survival statusOverall survival in months since diagnosisDisease free statusDisease free in months since treatment... +#STRINGSTRINGNUMBERSTRINGNUMBER... +#11111 +PATIENT_IDOS_STATUSOS_MONTHSDFS_STATUSDFS_MONTHS... +PATIENT_ID_1DECEASED17.97Recurred/Progressed30.98... +PATIENT_ID_2LIVING63.01DiseaseFree63.01... +... +``` + +#####The samples file##### +The file containing the sample attributes has two **required** columns: +- **PATIENT_ID (required)**: A patient ID. +- **SAMPLE_ID (required)**: A sample ID. + +By adding `PATIENT_ID` here, cBioPortal will map the given sample to this patient. 
This enables one to associate multiple samples to one patient. For example, a single patient may have had multiple biopsies, each of which has been genomically profiled. See [this example for a patient with multiple samples](http://www.cbioportal.org/case.do?cancer_study_id=lgg_ucsf_2014&case_id=P04). + +The following columns are required if you want the [pan-cancer summary statistics tab in a pan-cancer study](http://www.cbioportal.org/index.do?cancer_study_list=cellline_ccle_broad&cancer_study_id=cellline_ccle_broad&genetic_profile_ids_PROFILE_MUTATION_EXTENDED=cellline_ccle_broad_mutations&genetic_profile_ids_PROFILE_COPY_NUMBER_ALTERATION=cellline_ccle_broad_CNA&Z_SCORE_THRESHOLD=2.0&data_priority=0&case_set_id=cellline_ccle_broad_cnaseq&case_ids=&patient_case_select=sample&gene_set_choice=prostate-cancer%3A-ar-signaling-%2810-genes%29&gene_list=SOX9+RAN+TNK2+EP300+PXN+NCOA2+AR+NRIP1+NCOR1+NCOR2&clinical_param_selection=null&tab_index=tab_visualize&Action=Submit#pancancer_study_summary): +- **CANCER_TYPE**: Cancer Type +- **CANCER_TYPE_DETAILED**: Cancer Type Detailed, a sub-type of the specified CANCER_TYPE + +The following columns affect the header of the patient view by adding text to the samples in the header: +- **SAMPLE_DISPLAY_NAME**: displayed in addition to the ID +- **TYPE_OF_CANCER**: Overrides CANCER_TYPE in the header +- **DETAILED_CANCER_TYPE**: Overrides CANCER_TYPE_DETAILED in the header +- **KNOWN_MOLECULAR_CLASSIFIER** +- **TUMOR_SITE** +- **METASTATIC_SITE** or **PRIMARY_SITE**: Override TUMOR_SITE depending on sample type +- **SAMPLE_CLASS** +- **GLEASON_SCORE**: Radical prostatectomy Gleason score for prostate cancer +- **HISTOLOGY** +- **TUMOR_STAGE_2009** +- **TUMOR_GRADE** +- **ETS_RAF_SPINK1_STATUS** +- **TMPRSS2_ERG_FUSION_STATUS** +- **ERG_FUSION_ACGH** +- **SERUM_PSA** +- **DRIVER_MUTATIONS** + +The following columns additionally affect the [Timeline data](#timeline-data) visualization: +- **OTHER_SAMPLE_ID**: sometimes the timeline 
data (see the [timeline data section](#timeline-data)) will not have the SAMPLE_ID but instead an alias to the sample (in the field `SPECIMEN_REFERENCE_NUMBER`). To ensure that the timeline data field `SPECIMEN_REFERENCE_NUMBER` is correctly linked to this sample, be sure to add this column `OTHER_SAMPLE_ID` as an attribute to your sample attributes file. +- **SAMPLE_TYPE**, **TUMOR_TISSUE_SITE** or **TUMOR_TYPE**: gives the sample icon in the timeline a color. + - If set to `recurrence`, `recurred`, `progression` or `progressed`: orange + - If set to `metastatic` or `metastasis`: red + - If set to `primary` or otherwise: black + +Optional attributes: +- **Other Clinical Attribute Headers**: Clinical attribute headers are free-form. You can add any additional clinical attribute you have tracked and cBioPortal will add them to the database. Be sure to provide the correct `'Datatype'`, as described above (for the header lines), for optimal search, sorting, filtering (in [clinical data tab](http://www.cbioportal.org/study?id=brca_tcga#clinical)) and display. + + +###### Example sample data file +``` +#Patient Identifier<TAB>Sample Identifier<TAB>Subtype<TAB>... +#Patient identifier<TAB>Sample Identifier<TAB>Subtype description<TAB>... +#STRING<TAB>STRING<TAB>STRING<TAB>... +#1<TAB>1<TAB>1<TAB>... +PATIENT_ID<TAB>SAMPLE_ID<TAB>SUBTYPE<TAB>... +PATIENT_ID_1<TAB>SAMPLE_ID_1<TAB>basal-like<TAB>... +PATIENT_ID_2<TAB>SAMPLE_ID_2<TAB>Her2 enriched<TAB>... +... +``` + +## Discrete Copy Number Data +The discrete copy number data file contains values that would be derived from copy-number analysis algorithms like [GISTIC](http://www.ncbi.nlm.nih.gov/sites/entrez?term=18077431) or [RAE](http://www.ncbi.nlm.nih.gov/sites/entrez?term=18784837). GISTIC can be [installed](http://www.broadinstitute.org/cgi-bin/cancer/publications/pub_paper.cgi?mode=view&paper_id=216&p=t) or run online using the GISTIC 2.0 module on [GenePattern](http://genepattern.broadinstitute.org/gp/pages/login.jsf). 
For some help on using GISTIC, check the [Data Loading: Tips and Best Practices](Data-Loading-Tips-and-Best-Practices.md) page. + +##### Meta file +The meta file is comprised of the following fields: + +1. **cancer_study_identifier**: same value as specified in [study meta file](#cancer-study) +2. **genetic_alteration_type**: COPY_NUMBER_ALTERATION +3. **datatype**: DISCRETE +4. **stable_id**: gistic, cna, cna_rae or cna_consensus +5. **show_profile_in_analysis_tab**: true +6. **profile_name**: A name for the discrete copy number data, e.g., "Putative copy-number alterations from GISTIC" +7. **profile_description**: A description of the copy number data, e.g., "Putative copy-number from GISTIC 2.0. Values: -2 = homozygous deletion; -1 = hemizygous deletion; 0 = neutral / no change; 1 = gain; 2 = high level amplification." +8. **data_filename**: <your datafile> +9. **gene_panel**: optional gene panel stable id + +##### Example +An example metadata file could be named meta_CNA.txt and its contents could be: +``` +cancer_study_identifier: brca_tcga_pub +genetic_alteration_type: COPY_NUMBER_ALTERATION +datatype: DISCRETE +stable_id: gistic +show_profile_in_analysis_tab: true +profile_name: Putative copy-number alterations from GISTIC +profile_description: Putative copy-number from GISTIC 2.0. Values: -2 = homozygous deletion; -1 = hemizygous deletion; 0 = neutral / no change; 1 = gain; 2 = high level amplification. +data_filename: data_CNA.txt +``` + +#### Data file + +For each gene (row) in the data file, the following columns are required in the order specified: + +One or both of: +- ***Hugo_Symbol***: A [HUGO](http://www.genenames.org/) gene symbol. +- ***Entrez_Gene_Id***: A [Entrez Gene](http://www.ncbi.nlm.nih.gov/gene) identifier. + +And: +- An additional column for each sample in the dataset using the sample id as the column header. 
+ +For each gene-sample combination, a copy number level is specified: +- "-2" is a deep loss, possibly a homozygous deletion +- "-1" is a single-copy loss (heterozygous deletion) +- "0" is diploid +- "1" indicates a low-level gain +- "2" is a high-level amplification. + +#### Example +An example data file which includes the required column header would look like: +``` +Hugo_SymbolEntrez_Gene_IdSAMPLE_ID_1SAMPLE_ID_2... +ACAP31169830-1... +AGRN37579020... +... +... +``` + +## Continuous Copy Number Data + +#### Meta file +The continuous copy number metadata file should contain the following fields: + +1. **cancer_study_identifier**: same value as specified in [study meta file](#cancer-study) +2. **genetic_alteration_type**: COPY_NUMBER_ALTERATION. +3. **datatype**: CONTINUOUS +4. **stable_id**: linear_CNA +5. **show_profile_in_analysis_tab**: false. +6. **profile_name**: A name for the copy number data, e.g., "copy-number values". +7. **profile_description**: A description of the copy number data, e.g., "copy-number values for each gene (from Affymetrix SNP6).". +8. **data_filename**: <your datafile> +9. **gene_panel**: optional gene panel stable id + +cBioPortal also supports log2 copy number data. If your data is in log2, change the following fields: + +3. **datatype**: LOG2-VALUE +4. **stable_id**: log2CNA + +_**TODO: In issue [#571](https://github.com/cBioPortal/cbioportal/issues/571) log2 is changed to linear. This means the information that it is a log value is now lost. It should be discussed, as this is probably not a good idea.**_ + +##### Example +An example metadata file, e.g. meta_CNA_log2.txt, would be: +``` +cancer_study_identifier: brca_tcga_pub +genetic_alteration_type: COPY_NUMBER_ALTERATION +datatype: LOG2-VALUE +stable_id: log2CNA +show_profile_in_analysis_tab: false +profile_description: Log2 copy-number values for each gene (from Affymetrix SNP6). 
+profile_name: Log2 copy-number values +data_filename: data_log2CNA.txt +``` + +#### Data file + +The log2 copy number data file follows the same format as expression data files. See [Expression Data](#expression-data) for a description of the expression data file format. + + +## Segmented Data + +A SEG file (segmented data; .seg or .cbs) is a tab-delimited text file that lists loci and associated numeric values. The segmented data file format is the output of the Circular Binary Segmentation algorithm (Olshen et al., 2004). **Segment data for import into the cBioPortal should be based on build 37 (hg19)**. This Segment data enables the 'CNA' lane in the Genomic overview of the Patient view (as [can be seen in this example](http://www.cbioportal.org/case.do?sample_id=TCGA-BH-A0E6-01&cancer_study_id=brca_tcga)). + +#### Meta file +The segmented metadata file should contain the following fields: + +1. **cancer_study_identifier**: same value as specified in [study meta file](#cancer-study) +2. **genetic_alteration_type**: COPY_NUMBER_ALTERATION +3. **datatype**: SEG +4. **reference_genome_id**: Reference genome version. Supported values: "hg19" +5. **description**: A description of the segmented data, e.g., "Segment data for the XYZ cancer study.". +6. **data_filename**: <your datafile> +7. **gene_panel**: optional gene panel stable id + +#### Example: +An example metadata file, e.g. meta_cna_seg.txt, would be: +``` +cancer_study_identifier: brca_tcga_pub +genetic_alteration_type: COPY_NUMBER_ALTERATION +datatype: SEG +reference_genome_id: hg19 +description: Somatic CNA data (copy number ratio from tumor samples minus ratio from matched normals) from TCGA. +data_filename: brca_tcga_data_cna_hg19.seg +``` + +#### Data file + +The first row contains column headings and each subsequent row contains a locus and an associated numeric value. See also the [Broad IGV page on this format](https://www.broadinstitute.org/software/igv/SEG). 
+ +#### Example: + +An example data file which includes the required column header would look like: +``` +'IDchromloc.startloc.endnum.markseg.mean +SAMPLE_ID_1132084702458803291289230.0025 +SAMPLE_ID_2247422255054922639-0.0112 +SAMPLE_ID_22550607055062042-1.5012 +SAMPLE_ID_22551237415900477580678-0.0013 +... +... +``` + + + +## Expression Data + +An expression data file is a two dimensional matrix with a gene per row and a sample per column. For each gene-sample pair, a real number represents the gene expression in that sample. + +#### Meta file + +The expression metadata file should contain the following fields: + +1. **cancer_study_identifier**: same value as specified in [study meta file](#cancer-study) +2. **genetic_alteration_type**: MRNA_EXPRESSION +3. **datatype**: CONTINUOUS, DISCRETE or Z-SCORE +4. **stable_id**: see table below. +5. **show_profile_in_analysis_tab**: false (you can set to **true** if Z-SCORE to enable it in the oncoprint, for example). +6. **profile_name**: A name for the expression data, e.g., "mRNA expression (microarray)". +7. **profile_description**: A description of the expression data, e.g., "Expression levels (Agilent microarray).". +8. **data_filename**: <your datafile> +9. **gene_panel**: optional gene panel stable id + +#### Supported stable_id values for MRNA_EXPRESSION +For historical reasons, cBioPortal expects the `stable_id` to be one of those listed in the following static set. + +datatype | stable_id | description +--- | --- | --- +CONTINUOUS|mrna_U133|Affymetrix U133 Array +Z-SCORE|mrna_U133_Zscores|Affymetrix U133 Array +Z-SCORE|rna_seq_mrna_median_Zscores|RNA-seq data +Z-SCORE|mrna_median_Zscores|mRNA data +CONTINUOUS|rna_seq_mrna|RNA-seq data +CONTINUOUS|rna_seq_v2_mrna|RNA-seq data +Z-SCORE|rna_seq_v2_mrna_median_Zscores|RNA-seq data +CONTINUOUS|mirna|MicroRNA data +Z-SCORE|mirna_median_Zscores|MicroRNA data +Z-SCORE|mrna_merged_median_Zscores|? 
+CONTINUOUS|mrna|mRNA data +DISCRETE|mrna_outliers|mRNA data of outliers +Z-SCORE|mrna_zbynorm|? +CONTINUOUS|rna_seq_mrna_capture|data from Roche mRNA Capture Kit +Z-SCORE|rna_seq_mrna_capture_Zscores|data from Roche mRNA Capture Kit + + +#### Example + +An example metadata file, e.g. meta_expression_file.txt, would be: +``` +cancer_study_identifier: brca_tcga_pub +genetic_alteration_type: MRNA_EXPRESSION +datatype: CONTINUOUS +stable_id: rna_seq_mrna +show_profile_in_analysis_tab: false +profile_name: mRNA expression +profile_description: Expression levels +data_filename: data_expression_file.txt +``` + +#### Data file + +For each gene (row) in the data file, the following columns are required in the order specified: + +One or both of: +- ***Hugo_Symbol***: A [HUGO](http://www.genenames.org/) gene symbol. +- ***Entrez_Gene_Id***: An [Entrez Gene](http://www.ncbi.nlm.nih.gov/gene) identifier. + +And: +- An additional column for each sample in the dataset using the sample id as the column header. + +For each gene-sample combination, a value is specified: +- A real number for each sample id (column) in the dataset, representing the expression value for the gene in the respective sample. +- or `NA` for when the expression value for the gene in the respective sample could not be (or was not) measured (or detected). + +##### z-score instructions + +For mRNA expression data, we typically expect the relative expression of an individual gene and tumor to the gene's expression distribution in a reference population. That reference population is either all tumors that are diploid for the gene in question, or, when available, normal adjacent tissue. The returned value indicates the number of standard deviations away from the mean of expression in the reference population (Z-score). This measure is useful to determine whether a gene is up- or down-regulated relative to the normal samples or all other tumor samples. 
**Note, the importer tool can create normalized (z-score) expression data on your behalf. Please visit the [Z-Score normalization script](Z-Score-normalization-script.md) wiki page for more information. +A corresponding z-score metadata file would be something like**: +``` +cancer_study_identifier: brca_tcga_pub +genetic_alteration_type: MRNA_EXPRESSION +datatype: Z-SCORE +stable_id: rna_seq_mrna_median_Zscores +show_profile_in_analysis_tab: true +profile_name: mRNA expression z-scores +profile_description: Expression levels z-scores +data_filename: data_expression_zscores_file.txt +``` + +#### Examples of data files: + +An example data file which includes the required column header and *leaves out Hugo_Symbol* (recommended) would look like: + +``` +Entrez_Gene_Id<TAB>SAMPLE_ID_1<TAB>SAMPLE_ID_2<TAB>... +116983<TAB>-0.005<TAB>-0.550<TAB>... +375790<TAB>0.142<TAB>0.091<TAB>... +... +... +``` + +An example data file which includes both Hugo_Symbol and Entrez_Gene_Id would look like (supported, but not recommended as it increases the chances of errors regarding [ambiguous Hugo symbols](Data-Loading-How-the-loader-deals-with-Hugo-symbols.md)): +``` +Hugo_Symbol<TAB>Entrez_Gene_Id<TAB>SAMPLE_ID_1<TAB>SAMPLE_ID_2<TAB>... +ACAP3<TAB>116983<TAB>-0.005<TAB>-0.550<TAB>... +AGRN<TAB>375790<TAB>0.142<TAB>0.091<TAB>... +... +... +``` + +An example data file with only Hugo_Symbol column (supported, but not recommended as it increases the chances of errors regarding [ambiguous Hugo symbols](Data-Loading-How-the-loader-deals-with-Hugo-symbols.md)): +``` +Hugo_Symbol<TAB>SAMPLE_ID_1<TAB>SAMPLE_ID_2<TAB>... +ACAP3<TAB>-0.005<TAB>-0.550<TAB>... +AGRN<TAB>0.142<TAB>0.091<TAB>... +... +... +``` + + +## Mutation Data +The mutation data file extends the [Mutation Annotation Format](https://wiki.nci.nih.gov/display/TCGA/Mutation+Annotation+Format+%28MAF%29+Specification) (MAF) created as part of the [Cancer Genome Atlas](https://wiki.nci.nih.gov/display/TCGA/TCGA+Home) project, by adding *extra annotations* to each mutation record. 
If your mutation data is already in [VCF](http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41) format (which by default most variant callers produce) you can use this [vcf2maf](https://github.com/ckandoth/vcf2maf) converter. +**Please note that all data should be mapped to UniProt canonical isoforms.** This can be done by calling the vcf2maf or maf2maf with the ```--custom-enst``` flag and the mapping file available [here](https://github.com/mskcc/vcf2maf/blob/master/data/isoform_overrides_uniprot). This will ensure the SWISSPROT column, which contains the UniProt canonical isoform, can be used correctly by cBioPortal. + +#### Meta file +The mutation metadata file should contain the following fields: + +1. **cancer_study_identifier**: same value as specified in [study meta file](#cancer-study) +2. **genetic_alteration_type**: MUTATION_EXTENDED +3. **datatype**: MAF +4. **stable_id**: mutations +5. **show_profile_in_analysis_tab**: true +6. **profile_name**: A name for the mutation data, e.g., "Mutations". +7. **profile_description**: A description of the mutation data, e.g., "Mutation data from whole exome sequencing.". +8. **data_filename**: <your data file> +9. **gene_panel**: optional gene panel stable id +10. **swissprot_identifier (optional)**: either `accession` or `name`, indicating the type of identifier in the `SWISSPROT` column + +An example metadata file would be: + +``` +cancer_study_identifier: brca_tcga_pub +genetic_alteration_type: MUTATION_EXTENDED +datatype: MAF +stable_id: mutations +show_profile_in_analysis_tab: true +profile_description: Mutation data from whole exome sequencing. +profile_name: Mutations +data_filename: brca_tcga_pub.maf +``` + +#### Data file +A minimal mutation annotations file can contain just three of the MAF columns plus one annotation column, which is normally added to the end of each MAF row: + +* **Hugo_Symbol**: (MAF column) A [HUGO](http://www.genenames.org/) gene symbol. 
+* **Tumor_Sample_Barcode**: (MAF column) This is the sample ID as listed in the clinical data file. +* **Variant_Classification**: (MAF column) Translational effect of variant allele, e.g. Missense_Mutation, Silent, etc. cBioPortal skips the following types during the import: _Silent_, _Intron_, _3'UTR_, _3'Flank_, _5'UTR_, _5'Flank_, _IGR_ and _RNA_ +* **HGVSp_Short**: (annotation column) Amino Acid Change, e.g. p.V600E. + +Note: next to Hugo_Symbol, it is recommended to have the Entrez gene ID: + +* **Entrez_Gene_Id (Optional, but recommended)**: An [Entrez Gene](http://www.ncbi.nlm.nih.gov/gene) identifier. + +:information_source: special case for **Entrez_Gene_Id=0** and **Hugo_Symbol=Unknown**: when this combination is given, the record is parsed in the same way as **Variant_Classification=IGR** and therefore filtered out. + +The following extra annotation columns are also important for making sure mutation specific UI functionality works well in the portal: + +* **Protein_position**: (annotation column) Required to initialize the 3D viewer in [mutations view](http://www.cbioportal.org/index.do?cancer_study_list=brca_tcga_pub&cancer_study_id=brca_tcga_pub&genetic_profile_ids_PROFILE_MUTATION_EXTENDED=brca_tcga_pub_mutations&genetic_profile_ids_PROFILE_COPY_NUMBER_ALTERATION=brca_tcga_pub_gistic&genetic_profile_ids_PROFILE_MRNA_EXPRESSION=brca_tcga_pub_mrna_median_Zscores&Z_SCORE_THRESHOLD=2.0&RPPA_SCORE_THRESHOLD=2.0&data_priority=0&case_set_id=brca_tcga_pub_complete&case_ids=&patient_case_select=sample&gene_set_choice=prostate-cancer%3A-ar-signaling-%2810-genes%29&gene_list=TP53&clinical_param_selection=null&tab_index=tab_visualize&Action=Submit#mutation_details) +* **SWISSPROT**: (annotation column) UniProtKB/SWISS-PROT name (formerly called ID) or accession code depending on the value of the `swissprot_identifier` metadatum, e.g. O11H1_HUMAN or Q8NG94. 
It is not absolutely required, but not having it may result in inconsistent PDB structure matching in [mutations view](http://www.cbioportal.org/index.do?cancer_study_list=brca_tcga_pub&cancer_study_id=brca_tcga_pub&genetic_profile_ids_PROFILE_MUTATION_EXTENDED=brca_tcga_pub_mutations&genetic_profile_ids_PROFILE_COPY_NUMBER_ALTERATION=brca_tcga_pub_gistic&genetic_profile_ids_PROFILE_MRNA_EXPRESSION=brca_tcga_pub_mrna_median_Zscores&Z_SCORE_THRESHOLD=2.0&RPPA_SCORE_THRESHOLD=2.0&data_priority=0&case_set_id=brca_tcga_pub_complete&case_ids=&patient_case_select=sample&gene_set_choice=prostate-cancer%3A-ar-signaling-%2810-genes%29&gene_list=TP53&clinical_param_selection=null&tab_index=tab_visualize&Action=Submit#mutation_details). + +##### Extending the MAF format +:warning: **Attention**: for the list of ***required*** and ***recommended*** fields, check the subsection above :arrow_up:. The section below :arrow_down: only describes some of the *extra* fields you can have in your mutations file. + +*Adding your mutation annotation columns to the complete MAF rows* can also be done. In this way, the portal will parse and store the MAF fields as well. For example, mutation data that you find on cBioPortal.org comes from MAF files that have been further enriched with information from [mutationassessor.org](http://mutationassessor.org/), which leads to a "Mutation Assessor" column in the [mutation table](http://www.cbioportal.org/index.do?cancer_study_list=acc_tcga&cancer_study_id=acc_tcga&genetic_profile_ids_PROFILE_MUTATION_EXTENDED=acc_tcga_mutations&Z_SCORE_THRESHOLD=2.0&RPPA_SCORE_THRESHOLD=2.0&data_priority=0&case_set_id=acc_tcga_sequenced&case_ids=&patient_case_select=sample&gene_set_choice=user-defined-list&gene_list=ZFPM1&clinical_param_selection=null&tab_index=tab_visualize&Action=Submit). 
+ +The MAF format recognized by the portal (excluding the annotation columns already mentioned above) has 32 columns + 4 columns with information on reference and variant allele counts in tumor and normal samples. A more detailed example MAF can be found on our [Downloads](Downloads.md#maf-example) page. Description of each column is provided below: + +1. **Hugo_Symbol (Required)**: A [HUGO](http://www.genenames.org/) gene symbol. +2. **Entrez_Gene_Id (Optional, but desired)**: A [Entrez Gene](http://www.ncbi.nlm.nih.gov/gene) identifier. +3. **Center (Optional)**: The sequencing center. +4. **NCBI_Build (Optional)**: Must be "37". +5. **Chromosome (Optional)**: A chromosome number, e.g., "7". +6. **Start_Position (Optional)**: Start position of event. +7. **End_Position (Optional)**: End position of event. +8. **Strand (Optional)**: We assume that the mutation is reported for the + strand. +9. **Variant_Classification (Required)**: Translational effect of variant allele, e.g. Missense_Mutation, Silent, etc. +10. **Variant_Type (Optional)**: Variant Type, e.g. SNP, DNP, etc. +11. **Reference_Allele (Optional)**: The plus strand reference allele at this position. +12. **Tumor_Seq_Allele1 (Optional)**: Primary data genotype. +13. **Tumor_Seq_Allele2 (Optional)**: Primary data genotype. +14. **dbSNP_RS (Optional)**: Latest dbSNP rs ID. +15. **dbSNP_Val_Status (Optional)**: dbSNP validation status. +16. **Tumor_Sample_Barcode (Required)**: This is the sample ID. Either a TCGA barcode (patient identifier will be extracted), or for non-TCGA data, a literal SAMPLE_ID as listed in the clinical data file. +17. **Matched_Norm_Sample_Barcode (Optional)**: The sample ID for the matched normal sample. +18. **Match_Norm_Seq_Allele1 (Optional)**: Primary data. +19. **Match_Norm_Seq_Allele2 (Optional)**: Primary data. +20. **Tumor_Validation_Allele1 (Optional)**: Secondary data from orthogonal technology. +21. 
**Tumor_Validation_Allele2 (Optional)**: Secondary data from orthogonal technology. +22. **Match_Norm_Validation_Allele1 (Optional)**: Secondary data from orthogonal technology. +23. **Match_Norm_Validation_Allele2 (Optional)**: Secondary data from orthogonal technology. +24. **Verification_Status (Optional)**: Second pass results from independent attempt using same methods as primary data source. +25. **Validation_Status (Optional)**: -- "Valid" or "Unknown". +26. **Mutation_Status (Optional)**: Ideally "Somatic". +27. **Sequencing_Phase (Optional)**: Indicates current sequencing phase. +28. **Sequence_Source (Optional)**: Molecular assay type used to produce the analytes used for sequencing. +29. **Validation_Method (Optional)**: The assay platforms used for the validation call. +30. **Score (Optional)**: Not in use. +31. **BAM_File (Optional)**: Not used. +32. **Sequencer (Optional)**: Instrument used to produce primary data. +33. **t_alt_count (Optional)**: Variant allele count (tumor). +34. **t_ref_count (Optional)**: Reference allele count (tumor). +35. **n_alt_count (Optional)**: Variant allele count (normal). +36. **n_ref_count (Optional)**: Reference allele count (normal). + +⚠️ Please make sure that, even if you are using the MAF format, all required columns specified in the beginning of the Data File section are present! + +## Methylation Data +The Portal expects a single value for each gene in each sample, usually a beta-value from the Infinium methylation array platform. + +#### Meta file + +The methylation metadata file should contain the following fields: + +1. **cancer_study_identifier**: same value as specified in [study meta file](#cancer-study) +2. **genetic_alteration_type**: METHYLATION +3. **datatype**: CONTINUOUS +4. **stable_id**: "methylation_hm27" or "methylation_hm450" (depending on platform). +5. **show_profile_in_analysis_tab**: false +6. **profile_name**: A name for the methylation data, e.g., "Methlytation (HM27)". +7. 
**profile_description**: A description of the methylation data
**gene_panel**: optional gene panel stable id + +An example metadata file would be: +``` +cancer_study_identifier: brca_tcga +genetic_alteration_type: PROTEIN_LEVEL +datatype: LOG2-VALUE +stable_id: rppa +show_profile_in_analysis_tab: false +profile_description: Protein expression measured by reverse-phase protein array +profile_name: Protein expression (RPPA) +data_filename: data_rppa.txt +``` + +**NB:** You also need a Z-SCORE file if you want RPPA to be available in query UI and in [Oncoprint visualization](http://www.cbioportal.org/index.do?cancer_study_list=brca_tcga_pub&cancer_study_id=brca_tcga_pub&genetic_profile_ids_PROFILE_MUTATION_EXTENDED=brca_tcga_pub_mutations&genetic_profile_ids_PROFILE_COPY_NUMBER_ALTERATION=brca_tcga_pub_gistic&genetic_profile_ids_PROFILE_MRNA_EXPRESSION=brca_tcga_pub_mrna_median_Zscores&Z_SCORE_THRESHOLD=2.0&genetic_profile_ids_PROFILE_PROTEIN_EXPRESSION=brca_tcga_pub_rppa_Zscores&RPPA_SCORE_THRESHOLD=2.0&data_priority=0&case_set_id=brca_tcga_pub_complete&case_ids=&patient_case_select=sample&gene_set_choice=prostate-cancer%3A-ar-signaling-%2810-genes%29&gene_list=TP53+SOX9+RAN+TNK2+EP300+PXN+NCOA2+AR+NRIP1+NCOR1+NCOR2&clinical_param_selection=null&tab_index=tab_visualize&Action=Submit). E.g.: + +``` +cancer_study_identifier: brca_tcga +genetic_alteration_type: PROTEIN_LEVEL +datatype: Z-SCORE +data_filename: data_rppa.txt +stable_id: rppa_Zscores +show_profile_in_analysis_tab: true +profile_description: Protein expression Z-scores (RPPA) +profile_name: Protein expression Z-scores (RPPA) +``` + +#### Data file + +An RPPA data file is a two dimensional matrix with an antibody per row and a sample per column. For each antibody-sample pair, a real number represents the RPPA level for that sample. The antibody information can contain one or more HUGO gene symbols and/or entrez gene identifiers, separated by a space, and an antibody ID pair separated by the "|" symbol. 
+ +#### Example + +An example data file which includes the required column header would look like: + +``` +Composite.Element.REFSAMPLE_ID_1SAMPLE_ID_2... +BRAF|B-Raf-M-NA1.095066763250.5843256495... +MAPK1 MAPK3|MAPK_PT202_Y2041.704445820251.0982864685... +AKT1 AKT2 10000|AKT0.170714927250.264067254391 +... +``` + + +## Fusion Data +This type data is not yet being validated. It can, however, be uploaded. + +#### Meta file +The fusion metadata file should contain the following fields: + +1. **cancer_study_identifier**: same value as specified in [study meta file](#cancer-study) +2. **genetic_alteration_type**: FUSION +3. **datatype**: FUSION +4. **stable_id**: mutations +5. **show_profile_in_analysis_tab**: true. +6. **profile_name**: A name for the fusion data, e.g., "Fusions.". +7. **profile_description**: A description of the fusion data. +8. **data_filename**: <your datafile> +9. **gene_panel**: optional gene panel stable id + +#### Example +An example metadata file would be: +``` +cancer_study_identifier: brca_tcga_pub +genetic_alteration_type: FUSION +datatype: FUSION +stable_id: mutations +profile_description: Fusions. +show_profile_in_analysis_tab: true +profile_name: Fusions +data_filename: data_fusions.txt +``` + +#### Data file +A fusion data file is a two dimensional matrix with one gene per row. For each gene (row) in the data file, the following tab-delimited values are required in the order specified: + +1. **Hugo_Symbol**: A [HUGO](http://www.genenames.org/) gene symbol. +2. **Entrez_Gene_Id**: A [Entrez Gene](http://www.ncbi.nlm.nih.gov/gene) identifier. +3. **Center**: The sequencing center. +4. **Tumor_Sample_Barcode**: This is the sample ID. +5. **Fusion**: A description of the fusion, e.g., "TMPRSS2-ERG fusion". +6. **DNA support**: Fusion detected from DNA sequence data, "yes" or "no". +7. **RNA support**: Fusion detected from RNA sequence data, "yes" or "no". +8. **Method**: Fusion detected algorithm/tool. +9. 
**Frame**: "in-frame" or "frameshift". + +An example data file which includes the required column header would look like: +``` +Hugo_SymbolEntrez_Gene_IdCenterTumor_Sample_BarcodeFusionDNA supportRNA supportMethodFrame> +ALK238center.eduSAMPLE_ID_1Fusionunknownyesunknownin-frame +ALK238center.eduSAMPLE_ID_2Fusionunknownyesunknownin-frame +RET5979center.eduSAMPLE_ID_3Fusionunknownyesunknownin-frame +... +... +``` + +## Case Lists + +There should be 1 or more case lists associated with each cancer study. You should provide **at least one case list which contains all sample ids** (the importer can generate this for your if you set the attribute *add_global_case_list* to 'true' in the [Study metadata](#cancer-study). + +When **not** using the *add_global_case_list* attribute in [Study metadata](#cancer-study), or if you want to add custom case lists: +- the case list files should be placed in a sub-directory called "case_lists" which exists alongside all the other cancer study data. + +The case list file should contain the following fields: + +1. **cancer_study_identifier**: same value as specified in [study meta file](#cancer-study) +2. **stable_id**: it must contain the cancer_study_identifier followed by an underscore. Typically, after this a relevant suffix, e.g., "_custom", is added. There are some naming rules to follow if you want the case list to be selected automatically in the query UI base on the selected sample profiles. See subsection below. +3. **case_list_name**: A name for the patient list, e.g., "All Tumors". +4. **case_list_description**: A description of the patient list, e.g., "All tumor samples (825 samples).". +5. **case_list_ids**: A tab-delimited list of sample ids from the dataset. +6. **case_list_category**: Optional *alternative* way of linking your case list to a specific molecular profile. E.g. 
setting this to `all_cases_with_cna_data` will signal to the portal that this is the list of samples to be associated with CNA data in *some* of the analyses.
+ + +## Timeline Data +The timeline data is a representation of the various events that occur during the course of treatment for a patient from initial diagnosis. In cBioPortal timeline data is represented as one or more tracks in the patient view. Each main track is based on an event type, such as "Specimen", "Imaging", "Lab_test", etc. + +:warning: some clinical attributes affect the timeline visualization. Please check the [Clinical Data](#clinical-data) section for more information. + +This type data is not yet being validated. It can, however, be uploaded. + +#### Meta file +Each event type requires its own meta file. A timeline meta file should contain the following fields: + +1. **cancer_study_identifier**: same value as specified in [study meta file](#cancer-study) +2. **genetic_alteration_type**: CLINICAL +3. **datatype**: TIMELINE +4. **data_filename**: <your datafile> + +An example metadata file would be: +``` +cancer_study_identifier: brca_tcga +genetic_alteration_type: CLINICAL +datatype: TIMELINE +data_filename: data_timeline_imaging.txt +``` + + +#### Data file +Each event type requires its own data file, which contains all the events that each patient undergoes. The data format used for timeline data is extremely flexible. There are three required columns: + +1. **PATIENT_ID**: the patient ID from the dataset +2. **START_DATE**: the start point of any event, calculated in **_days_* from the date of diagnosis (which will act as point zero on the timeline scale) +3. **STOP_DATE**: The end date of the event is calculated in days from the date of diagnosis (which will act as point zero on the timeline scale). If the event occurs over time (e.g. a Treatment, ...) the STOP_DATE column should have values. If the event occurs at a time point (e.g. a Lab_test, Imaging, ...) the STOP_DATE is still mandatory, but the values should be blanks. +4. **EVENT_TYPE**: the category of the event. You are free to define any type of event here. 
For several event types cBioPortal has column naming suggestions and for several events there are column names which have special effects. See [event types](#event-types) for more information. + +And one optional columns with a special effect: + +1. **SPECIMEN_REFERENCE_NUMBER**: when this column has values that match the SAMPLE_ID/OTHER_SAMPLE_ID (defined in the clinical data file), the timeline will show case labels with black/red/etc 1, 2, 3, 4 circles. This only works for the first track and only if no STOP_DATE is set. + +#####Event Types +As previously mentioned, the EVENT_TYPE can be anything. However, several event types have columns with special effects. Furthermore, for some event types cBioPortal has column naming suggestions. + +_**EVENT_TYPE: TREATMENT**_ + +Suggested columns + + * **TREATMENT_TYPE**: This can be either Medical Therapy or Radiation Therapy. + * **SUBTYPE**: Depending upon the TREATMENT_TYPE, this can either be Chemotherapy, Hormone Therapy, Targeted Therapy etc. (for Medical Therapies) or WPRT, IVRT etc. (for Radiation Therapies). + * **AGENT**: for medical therapies, the agent is defined with number of cycles if applicable and for radiation therapy, the agent is defined as standard dose given to the patient during the course. + * Based on different cancer types you can add additional data here. + +Special: When using the AGENT and SUBTYPE columns, each agent and subtype will be split into its own track. + +_**EVENT_TYPE: LAB_TEST**_ + +Suggested columns + * **TEST**: type of test performed + * **RESULT**: corresponding value of the test + * Based on different cancer types you can add additional data here. + +Special: When using the TEST and RESULT columns, each test gets its own track and the dots are sized by the values of the RESULT if the TEST is PSA, ALK, TEST, HGB, PHOS or LDH. 
+ +_**EVENT_TYPE: IMAGING**_ + +Suggested columns + * **DIAGNOSTIC_TYPE**: This attribute will cover the different diagnostics tools used (for example: MRI, CT scan etc.) + * **DIAGNOSTIC_TYPE_DETAILED**: Detailed description of the event type. + * **RESULT**: Results of the diagnostic tests + * **SOURCE**: Where was the Imaging done. + * Based on different cancer types you can add additional data here. + +Special: all dots in the IMAGING track are squares. + +_**EVENT_TYPE: STATUS**_ + +Suggested columns + + * **STATUS**: If the EVENT_TYPE is status, data is entered under STATUS to define either the best response from the treatment or if there is a diagnosis of any stage progression etc. + * **SOURCE**: Where the status was monitored. + * Based on different cancer types you can add additional data here. + +_**EVENT_TYPE: SPECIMEN**_ + +Suggested columns + + * **SPECIMEN_REFERENCE_NUMBER**: This corresponds to the SAMPLE_ID/OTHER_SAMPLE_ID + * **SPECIMEN_SITE**: This is the site from where the specimen was collected. + * **SPECIMEN_TYPE**: This can either be tissue or blood. + * **SOURCE**: Where was the specimen collection done. + * Based on different cancer types you can add additional data here. + +Special: when the SPECIMEN_REFERENCE_NUMBER column has values that match the SAMPLE_ID/OTHER_SAMPLE_ID (defined in the clinical data file), the timeline will show case labels with black/red/etc 1, 2, 3, 4 circles. This only works for the first track and only if no STOP_DATE is set. + +#####Clinical Track Ordering +Clinical tracks are ordered as follows (if available): + +1. Specimen +2. Surgery +3. Status +4. Diagnostics +5. Diagnostic +6. Imaging +7. Lab_test +8. Treatment +9. First custom event +10. etc. 
+ +#### Example + +An example timeline file for SPECIMEN would be: +``` +PATIENT_IDSTART_DATEEVENT_TYPESPECIMEN_REFERENCE_NUMBERSPECIMEN_SITESPECIMEN_TYPESOURCEMyCustomColumn +CACO20SPECIMENCACO2_S1livertissuehospitalT1 +CACO2100SPECIMENCACO2_S2lungtissuehospitalT2 +... +``` +Assuming the sample identifiers were also defined in the clinical file, this will lead to a timeline track with numbered specimen samples. + +An example timeline file for Lab_test would be: +``` +PATIENT_IDSTART_DATEEVENT_TYPETESTRESULT +CACO2100LAB_TESTPSA10 +CACO2250LAB_TESTPSA100 +... +``` +This will lead to a timeline track for Lab_test with an additional subtrack specifically for PSA. PSA's events will be sized based on the result. + + +## Gistic Data +Running GISTIC 2.0 on e.g. GenePattern not only provides the [Discrete Copy Number Data](#discrete-copy-number-data), but also provides an amp_genes and a del_genes file. These cannot be directly imported into cBioPortal, but first have to be converted to a different file format. Currently, there is no easy way available to do this. However, the cBioPortal team is aiming to make the necessary [cbioportal_pipelines](https://github.com/cBioPortal/cbioportal-pipelines/blob/master/importer/src/main/java/org/mskcc/cbio/importer/converter/internal/GisticGenesConverterImpl.java) functionality available via issue [#873](https://github.com/cBioPortal/cbioportal/issues/873). + +After uploading a gistic_amp and/or gistic_del file, a new button becomes available in the Enter Gene Set section, called "Select Genes from Recurrent CNAs (Gistic)". + +#### Meta file +The Gistic metadata file should contain the following fields: + +1. **cancer_study_identifier**: same value as specified in [study meta file](#cancer-study) +2. **genetic_alteration_type**: GISTIC_GENES_AMP or GISTIC_GENES_DEL +3. **datatype**: Q-VALUE +4. **reference_genome_id**: reference genome version. Supported values: "hg19" +5. 
**data_filename**: <your datafile> + +An example metadata file would be: +``` +cancer_study_identifier: brca_tcga +genetic_alteration_type: GISTIC_GENES_AMP +datatype: Q-VALUE +reference_genome_id: hg19 +data_filename: data_gistic_genes_amp.txt +``` + +#### Data file + +The following fields from the generated Gistic file are used by the cBioPortal importer: + +* **chromosome**: chromosome on which the region was found, without the `chr` prefix +* **peak_start**: start coordinate of the region of maximal amplification or deletion within the significant region +* **peak_end**: end coordinate of the region of maximal amplification or deletion within the significant region +* **genes_in_region**: comma-separated list of HUGO gene symbols in the `wide peak' (allowing for single-sample errors in the peak boundaries) +* **amp**: 1 for amp, 0 for del +* **cytoband**: cytogenetic band specification of the region, including chromosome (Giemsa stain) +* **q_value**: the q-value of the peak region + +#### Example +An example data file which includes the required column header would look like: + +``` +chromosomepeak_startpeak_endgenes_in_regionampcytobandq_value +1150563314150621176SNORA40|ENSG00000253047.1,RN7SL600P,RN7SL473P,C1orf138,LINC00568,CTSS,ECM1,ENSA,MCL1,RPRD2,ADAMTSL4,GOLPH3L,TARS2,HORMAD1,MIR4257,11q21.32.7818E-43 +18598856485991712DDAH1,11p22.34.1251E-13 +... +``` + + +## MutSig Data +MutSig stands for "Mutation Significance". MutSig analyzes lists of mutations discovered in DNA sequencing, to identify genes that were mutated more often than expected by chance given background mutation processes. You can download mutsig from [broadinstitute](https://www.broadinstitute.org/cancer/cga/mutsig_download) (MutSigCV 1.4 is available) or run mutsig (MutSigCV 1.2 is available) using [GenePattern](http://genepattern.broadinstitute.org/gp/pages/login.jsf). + +**Note:** The tcga files that are uploaded to cBioPortal are generated using MutSig2.0. 
This version is not available outside broadinstitute. + +_**The MutSigCV 1.2 output is different from the MutSig2.0 header. TODO: test the 1.4 version. Requires > 10GB of memory**_ + +After uploading a MutSig file, a new button becomes available in the Enter Gene Set section, called "Select From Recurrently Mutated Genes (MutSig)". + +This type data is not yet being validated. It can, however, be uploaded. + +#### Meta file +The MutSig metadata file should contain the following fields: + +1. **cancer_study_identifier**: same value as specified in [study meta file](#cancer-study) +2. **genetic_alteration_type**: MUTSIG +3. **datatype**: Q-VALUE +4. **data_filename**: <your datafile> + +An example metadata file would be: +``` +cancer_study_identifier: brca_tcga +genetic_alteration_type: MUTSIG +datatype: Q-VALUE +data_filename: data_mutsig.txt +``` + + +#### Data file + +The following fields from a MutSig file are used by the cBioPortal importer: +* **rank** +* **gene**: this is the HUGO symbol +* **N (or Nnon)**: bases covered +* **n (or nnon)**: number of mutations +* **p**: result of testing the hypothesis that all of the observed mutations in this gene are a consequence of random background mutation processes, taking into account the list of bases that are successfully interrogated by sequencing (i.e., “covered”) and the list of observed somatic mutations, as well as the length and composition of the gene in addition to the background mutation rates in different sequence contexts (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3059829/) +* **q**: p value correct for multiple testing + + +#### Example + +An example data file which includes the required column header would look like: + +``` +rankgeneNnpq +1RUNX11051659291.11E-161.88E-12 +2PIK3CA3200341351<1.00e-15<2.36e-12 +... +``` + +## Gene Panel Data +Gene panel information can assign a list of genes that a genetic profile should consist of for a specific sample. 
+ +#### Gene Panel File +The gene panel file follows the format of a meta file with the following fields: +1. **stable_id**: The name of the gene panel. This should be unique across all studies, as gene panels can be globally applied to any sample and any genetic profile. +2. **description**: A descripion of the gene panel. +3. **gene_list**: Tab separated genes, represented either by all hugo symbols or all entrez_gene_ids. + +An example gene panel file would be: +``` +stable_id: IMPACT410 +description: Targeted (410 cancer genes) sequencing of various tumor types via MSK-IMPACT on Illumina HiSeq sequencers. +gene_list: ABL1 ACVR1 AKT1 AKT3 ... +``` + +#### Sample-Profile Matrix + +The second component to gene panel data is associating samples and profile to the panel which applies. The following column is required : + +- ***SAMPLE_ID***: Sample Id from the study + +And: +- An additional column for each profile in the dataset using the stable_id as the column header. + +For each sample-profile combination, a gene panel should be specified by using the stable_id from the gene panel file, or NA to reflect whole exome. + +#### Example +An example sample-profile matrix file would look like: + +``` +SAMPLE_IDcnamutations... +SAMPLE_ID_1IMPACT410IMPACT410 ... +SAMPLE_ID_2NANA ... +... +``` + +#### Meta file +The sample-profile matrix requires a meta file should contain the following fields: + +1. **cancer_study_identifier**: same value as specified in [study meta file](#cancer-study) +2. **data_filename**: <your datafile> + +If all samples in a genetic profile will have the same gene panel associated with them, an optional field can be specified in the meta data file of that datatype called **gene_panel**. If this is present, the sample-profile matrix will automatically be generated and the gene panel applied if it exists in the database already. 
If all profiles for a sample will have the same gene panel
+ +## [Job Configuration Specs](#job-configuration-specs): +* `JobExecutionListener`: Used for letting user know whether cancer study data successfully imported or not. +* `JobExecutionDecider`: Checks the DB schema version and stops the job if the DB version does not match what is expected for the portal. If the DB schema does not match what is expected then the `JobExecutionDecider` sets the `FlowExecutionStatus` to **STOPPED**. Otherwise, the `FlowExecutionStatus` is set to **CONTINUE**. +* `Flow`: Defines the conditional flow for importing a cancer study based on the `FlowExecutionStatus` from the `JobExecutionDecider`. On **STOPPED**, the job execution is halted and the application closes. On **CONTINUE**, the job attempts to import a cancer study based on the staging directory passed on the command line. + +## [Step Configuration Specs](#step-configuration-specs): +* [Cancer Study Step](#cancer-study-step) +* [Cancer Study Datatype Metadata](#cancer-study-datatype-metadata) +* [Clinical Data Step](#clinical-data-step) +* [Timeline Data Step](#timeline-data-step) +* [Mutsig Data Step](#mutsig-data-step) +* [Copy Number Segment Data Step](#copy-number-segment-data-step) +* [Gistic Data Step](#gistic-data-step) +* [Profile Datatypes](#profile-datatypes): + * [Protein-level Data Step](#protein-level-data-step) + * [CNA Data Step](#cna-data-step) + * [Gene Expression Data Step](#gene-expression-data-step) + * [Methylation Data Step](#methylation-data-step) + * [Mutation Data Step](#mutation-data-step) + * [Fusion Data Step](#fusion-data-step) + * [Structural Variant Data Step](#structural-variant-data-step) +* [Case List Step](#case-list-step) + +### [Cancer Study Step](#cancer-study-step): +Importing cancer study data will be executed by the `importCancerStudy` Step (executed by the main batch importer job described above. 
+* `StepExecutionListener`: Used for setting the `ExitStatus` to **STOPPED** after the step has executed if the cancer study metadata could not be imported or if the cancer study metadata could not be loaded from `meta_study.txt`. +* `Tasklets`: A tasklet loads cancer study metadata from `meta_study.txt`. If a cancer study already exists by the cancer study identifier then the existing study is deleted and the new study is imported. Another tasklet searches the cancer study directory for metafiles for all datatypes before executing any datatype import steps. If metafiles and datafiles are found for a datatype, then its import status is set to **true**, otherwise it is set to **false**. The import status will be used by each datatype's `JobExecutionDecider` to determine whether or not the datatype's import step should execute. +* `JobExecutionDecider`: Used to determine whether the import job should continue or not. If the cancer study metadata was successfully loaded and imported into the DB then the `FlowExecutionStatus` is set to **CONTINUE**, otherwise the `FlowExecutionStatus` is set to **STOPPED**. +* `Flow`: Defines the conditional flow for importing other datatypes for a cancer study based on the `FlowExecutionStatus` from the `JobExecutionDecider`. If the `FlowExecutionStatus` is set to **CONTINUE** then the job will continue executing by first loading and importing clinical data and then executing the remaining import steps in parallel. + +### [Cancer Study Datatype Metadata](#cancer-study-datatype-metadata) +Before data is imported for each datatype, a tasklet will be executed by the `loadCancerStudyDatatypeMetadata` Step, which will: + +1. iterate through each datatype and search for a datatype's meta filename +2. load properties from a datatype's metafile +3. 
search for data filenames matching the `data_filename` specified in the metafile properties + +A MultiKeyMap is used to store whether a datatype's metafile and datafile(s) exist in the current cancer study path. If so, then the key `importData` is set to **true**, otherwise it is set to **false**. If **true** then the properties and a list of datafile(s) for the datatype are stored in the MultiKeyMap as well. For datatypes such as `clinical-supp`, which shares the same meta filename as `clinical`, the tasklet will search the cancer study path for a default data filename pattern (`data_clinical_supp*.txt`). Other cases include `mutation-germline` and `mutation-manual`, which share the same meta filename as `mutation`. After checking the cancer study path for each datatype, the MultiKeyMap is then added to the `JobExecutionContext` to be used by the individual datatype `JobExecutionDecider`s described below. + +### [Common Step Components for Datatypes](#common-step-components): +Each Step for all datatypes will have a `JobExecutionDecider` that will search the cancer study path for a datatype's meta filename and data filename pattern. If the meta file exists and data files can be found then the `FlowExecutionStatus` is set to **RUN** and the current step name is injected into the `JobExecutionContext`, otherwise the `FlowExecutionStatus` is set to **SKIP**.
+ +### [Clinical Data Step](#clinical-data-step) +The following datatype(s) are loaded and imported sequentially if metafiles are found for them: + +| Datatype | Meta Filename | Data Filename | Status | Step Name | +| -------------------- | ----------------------------- | ---------------------------------- | ------------- | ---------------------- | +| clinical | meta_clinical.txt | data_clinical.txt | **Complete** | clinicalStep | +| clinical-patient | meta_clinical_patient.txt | data_clinical_patient.txt | **Complete** | clinicalPatientStep | +| clinical-sample | meta_clinical_sample.txt | data_clinical_sample.txt | **Complete** | clinicalSampleStep | +| bcr-clinical | meta_bcr_clinical.txt | data_bcr_clinical.txt | **Complete** | bcrClinicalStep | +| bcr-clinical-patient | meta_bcr_clinical_patient.txt | data_bcr_clinical_data_patient.txt | **Complete** | bcrClinicalPatientStep | +| bcr-clinical-sample | meta_bcr_clinical_sample.txt | data_bcr_clinical_data_sample.txt | **Complete** | bcrClinicalSampleStep | +| clinical-supp | meta_clinical.txt | data_clinical_supp*.txt | **Complete** | clinicalSuppStep | +| clinical-caises | meta_clinical_caises.txt | data_clinical_caises.xml | _In Progress_ | -- | + +A main Flow (`clinicalStepFlow`) will execute the Step(s) for importing each clinical datatype sequentially. All clinical datatype steps will have the following components in common: +* `Tasklet`: A universal tasklet loads clinical attributes from a list of clinical datafiles based on the data filename corresponding to the `currentStep` stored in the `JobExecutionContext`. It is assumed that metadata for clinical attributes are stored at the top of the file and that the clinical datafile(s) have passed validation. A HashMap of data filenames and clinical attributes loaded are injected into the `JobExecutionContext` to be used by the clinical data readers, processors, and writers. 
+* `Step`: A universal step builder for clinical data implements the reading, processing, and writing of clinical data using the HashMap of datafile clinical attributes loaded from the `Tasklet` above. +* `StepExecutionListener`: Before the clinical datatype step executes, the listener will add the cancer study imported from [Cancer Study Step](#cancer-study-step) to the `StepExecutionContext`, as well as the datafile clinical attributes loaded from the `Tasklet` described above. After the clinical datatype step executes, the listener will report the total records imported into PATIENT, SAMPLE, CLINICAL_PATIENT, and CLINICAL_SAMPLE, how many new clinical attributes were imported into CLINICAL_ATTRIBUTE, and how many rollbacks or skips occurred during import. + +### [Timeline Data Step](#timeline-data-step) +The following datatype(s) are loaded and imported sequentially if metafiles are found for them: + +| Datatype | Meta Filename | Data Filename | Status | Step Name | +| -------------- | ----------------- | ------------------ | ------------ | ------------ | +| time-line-data | meta_timeline.txt | data_timeline*.txt | **Complete** | timelineStep | + +A main Flow (`timelineStepFlow`) will execute the Step(s) for importing timeline data. Components for the timeline step flow include: +* `Tasklet`: A tasklet loads timeline metadata (header) from the datafile and checks if header is valid. +* `JobExecutionDecider`: In addition to the `JobExecutionDecider` described above, an additional `JobExecutionDecider` is used to determine whether the timeline metadata was loaded properly. If so then the `FlowExecutionStatus` is set to **CONTINUE**, otherwise the `FlowExecutionStatus` is set to **STOPPED**. +* `Step`: A Step to load timeline metadata is executed if both the timeline metafile and datafiles exist. A second Step implements the importing of timeline data from the datafiles. 
+* `StepExecutionListener`: Before the timeline data import step executes, the listener will add the cancer study imported from [Cancer Study Step](#cancer-study-step) to the `StepExecutionContext`, datafile list, and timeline metadata to the `StepExecutionContext`. After the timeline data import step executes, the listener will report how many genes were loaded from the datafile, the number of entries skipped, and total records imported into CLINICAL_EVENT and CLINICAL_EVENT_DATA. + +### [Mutsig Data Step](#mutsig-data-step) +The following datatype(s) are loaded and imported sequentially if metafiles are found for them: + +| Datatype | Meta Filename | Data Filename | Status | Step Name | +| ------------------------ | --------------- | --------------- | ------------ | ---------- | +| mutation-significance-v2 | meta_mutsig.txt | data_mutsig.txt | **Complete** | mutSigStep | + +A main Flow (`mutSigStepFlow`) will execute the Step(s) for importing mutsig data. Components for the mutsig step flow include: +* `Tasklet`: A tasklet loads mutsig metadata (header, number of records) from the datafile and adds the metadata to the `JobExecutionContext`. +* `JobExecutionDecider`: In addition to the `JobExecutionDecider` described above, an additional `JobExecutionDecider` is used to determine whether the mutsig metadata was loaded properly. If so then the `FlowExecutionStatus` is set to **CONTINUE**, otherwise the `FlowExecutionStatus` is set to **STOPPED**. +* `Step`: A Step to load mutsig metadata is executed if both the mutsig metafile and datafile exist. A second Step implements the reading, processing, and importing of mutsig data from the datafile. +* `StepExecutionListener`: Before the mutsig data import step executes, the listener will add the cancer study imported to the `StepExecutionContext`, datafile, and mutsig metadata to the `StepExecutionContext`. 
After the mutsig data import step executes, the listener will report how many genes were loaded from the datafile, the number of entries skipped, and total records imported into MUT_SIG. + +### [Copy Number Segment Data Step](#copy-number-segment-data-step): +The following datatype(s) are loaded and imported sequentially if metafiles are found for them: + +| Datatype | Meta Filename | Data Filename | Status | Step Name | +| ------------ | ------------------------------------------------- | ------------------------------------------------- | ------------ | ----------------------------- | +| cna-hg19-seg | `<cancer_study_identifier>`_meta_cna_hg19_seg.txt | `<cancer_study_identifier>`_data_cna_hg19_seg.txt | **Complete** | copyNumberSegmentHg19DataStep | +| cna-hg18-seg | `<cancer_study_identifier>`_meta_cna_hg18_seg.txt | `<cancer_study_identifier>`_data_cna_hg18_seg.txt | **Complete** | copyNumberSegmentHg18DataStep | + +A main Flow (`copyNumberSegStepFlow`) will execute the Step(s) for importing copy number segment data. Components for the copy number segment flow include: +* `Tasklet`: A tasklet imports a copy number segment file record loaded from the current datatype's metafile into COPY_NUMBER_SEG_FILE and also loads copy number segment metadata (header, number of records) from the datafile and adds the metadata to the `JobExecutionContext`. This information is used by the copy number segment data reader and writers. +* `Step`: A universal step builder for copy number segment data implements the reading, processing, and writing of copy number segment data using the MultiKeyMap of the datafile metadata (header, number of records) loaded from the `Tasklet` above. A report of records loaded is printed after each file is read. +* `StepExecutionListener`: Before the copy number segment datatype step executes, the listener will add the data loaded from the `Tasklet` above to the `StepExecutionContext`.
After the copy number segment datatype step executes, the listener will report the total samples and entries skipped, as well as the total records imported into COPY_NUMBER_SEG for the current datatype. + +### [Gistic Data Step](#gistic-data-step) +The following datatype(s) are loaded and imported sequentially if metafiles are found for them: + +| Datatype | Meta Filename | Data Filename | Status | Step Name | +| ---------------- | ------------------------- | ------------------------- | ------------ | ------------------ | +| gistic-genes-amp | meta_gistic_genes_amp.txt | data_gistic_genes_amp.txt | **Complete** | gisticGenesAmpStep | +| gistic-genes-del | meta_gistic_genes_del.txt | data_gistic_genes_del.txt | **Complete** | gisticGenesDelStep | + +A main Flow (`gisticGenesStepFlow`) will execute the Step(s) for importing gistic genes data. Components for the gistic genes flow include: +* `Tasklet`: A tasklet loads gistic genes metadata (header, number of records) from the datafile and adds the metadata to the `JobExecutionContext`. This information is used by the gistic genes data reader and writers. +* `Step`: A universal step builder for gistic genes data implements the reading and writing of gistic genes data using the MultiKeyMap of the datafile metadata (header, number of records) loaded from the `Tasklet` above. A report of records loaded is printed after each file is read. +* `StepExecutionListener`: Before the gistic genes datatype step executes, the listener will add the data loaded from the `Tasklet` above to the `StepExecutionContext`. After the gistic genes datatype step executes, the listener will report the total entries skipped, total genes loaded, and the total records imported into GISTIC and GISTIC_TO_GENE for the current datatype.
+ + +## [Profile Datatypes](#profile-datatypes): + +### [Protein-level Data Step](#protein-level-data-step) +The following datatype(s) are loaded and imported sequentially if metafiles are found for them: + +| Datatype | Meta Filename | Data Filename | Status | Step Name | +| ---------------------- | ------------------------------- | ------------------------------- | ------------ | ------------------------- | +| rppa | meta_rppa.txt | data_rppa.txt | **Complete** | rppaStep | +| rppa-zscores | meta_rppa_Zscores.txt | data_rppa_Zscores.txt | **Complete** | rppaZscoresStep | +| protein-quantification | meta_protein_quantification.txt | data_protein_quantification.txt | **Complete** | proteinQuantificationStep | + +A main Flow (`proteinLevelStepFlow`) will execute the Step(s) for importing each protein-level datatype sequentially. All protein-level datatype steps will have the following components in common: +* `Tasklets`: Two universal tasklets are used for protein-level data steps. One tasklet loads and imports a genetic profile from a datatype's metafile and injects the genetic profile into the `JobExecutionContext`. The second tasklet injects the list of datafiles into the `JobExecutionContext`, as well as a MultiKeyMap of the profile metadata (header, non-case id columns, case id maps, normal case ids). These information are used by the profile data reader, as well as the protein-level data processors and writers. +* `Step`: A universal step builder for protein-level data implements the reading and writing of protein-level data using the genetic profile and the MultiKeyMap of datafile profile metadata (header, non-case id columns, case id maps, normal case ids) loaded from the `Tasklet`s above. A report of records loaded is printed after each file is read. +* `StepExecutionListener`: Before the protein-level datatype step executes, the listener will add the data loaded from the `Tasklet`s above to the `StepExecutionContext`. 
After the protein-level datatype step executes, the listener will report the total genes loaded, samples skipped, and entries skipped, as well as the total records imported into GENETIC_ALTERATION for the current datatype. + +### [CNA Data Step](#cna-data-step) +The following datatype(s) are loaded and imported sequentially if metafiles are found for them: + +| Datatype | Meta Filename | Data Filename | Status | Step Name | +| ----------------- | ----------------------- | ----------------------- | ------------ | ----------------- | +| cna-gistic | meta_CNA.txt | data_CNA.txt | **Complete** | cnaStep | +| cna-foundation | meta_CNA_foundation.txt | data_CNA_foundation.txt | **Complete** | cnaFoundationStep | +| cna-rae | meta_CNA_RAE.txt | data_CNA_RAE.txt | **Complete** | cnaRaeStep | +| cna-consensus | meta_CNA_consensus.txt | data_CNA_consensus.txt | **Complete** | cnaConsensusStep | +| linear-cna-gistic | meta_linear_CNA.txt | data_linear_CNA.txt | **Complete** | cnaLinearStep | +| log2-cna | meta_log2CNA.txt | data_log2CNA.txt | **Complete** | cnaLog2Step | + +A main Flow (`cnaStepFlow`) will execute the Step(s) for importing each CNA data type sequentially. All CNA datatype steps will have the following components in common: +* `Tasklets`: Two universal tasklets are used for CNA data steps. One tasklet loads and imports a genetic profile from a datatype's metafile and injects the genetic profile into the `JobExecutionContext`. The second tasklet injects the list of datafiles into the `JobExecutionContext`, as well as a MultiKeyMap of the profile metadata (header, non-case id columns, case id maps, normal case ids). These information are used by the profile data reader, and writers, as well as the CNA data processors. 
+* `Step`: A universal step builder for CNA data implements the reading, processing, and writing of CNA data using the genetic profile and the MultiKeyMap of datafile profile metadata (header, non-case id columns, case id maps, normal case ids) loaded from the `Tasklet`s above. A report of records loaded is printed after each file is read. +* `StepExecutionListener`: Before the CNA datatype step executes, the listener will add the data loaded from the `Tasklet`s above to the `StepExecutionContext`. After the CNA datatype step executes, the listener will report the total genes loaded, samples skipped, and entries skipped, as well as the total records imported into GENETIC_ALTERATION, CNA_EVENT, SAMPLE_CNA_EVENT for the current datatype. + +### [Gene Expression Data Step](#gene-expression-data-step) +The following datatype(s) are loaded and imported sequentially if metafiles are found for them: + +| Datatype | Meta Filename | Data Filename | Status | Step Name | +| ---------------------------------- | ------------------------------------------- | ------------------------------------------- | ------------- | ------------------------------------------ | +| affymetrix-gene-expression | meta_expression.txt | data_expression.txt | **Complete** | geneExpressionAffymetrixStep | +| affymetrix-gene-expression-zscores | meta_expression_Zscores.txt | data_expression_Zscores.txt | **Complete** | geneExpressionAffymetrixZscoresStep | +| gene-expression-merged | meta_expression_merged.txt | data_expression_merged.txt | **Complete** | geneExpressionMergedStep | +| gene-expression-merged-zscores | meta_expression_merged_Zscores.txt | data_expression_merged_Zscores.txt | **Complete** | geneExpressionMergedZscoresStep | +| rnaseq-gene-expression | meta_RNA_Seq_expression_median.txt | data_RNA_Seq_expression_median.txt | **Complete** | geneExpressionRnaSeqStep | +| rnaseq-gene-expression-zscores | meta_RNA_Seq_mRNA_median_Zscores.txt | data_RNA_Seq_mRNA_median_Zscores.txt | 
**Complete** | geneExpressionRnaSeqZscoresStep | +| agilent-gene-expression | meta_expression_median.txt | data_expression_median.txt | **Complete** | geneExpressionAgilentStep | +| agilent-gene-expression-zscores | meta_mRNA_median_Zscores.txt | data_mRNA_median_Zscores.txt | **Complete** | geneExpressionAgilentZscoresStep | +| rnaseq-v2-gene-expression | meta_RNA_Seq_v2_expression_median.txt | data_RNA_Seq_v2_expression_median.txt | **Complete** | geneExpressionRnaSeqV2Step | +| rnaseq-v2-gene-expression-zscores | meta_RNA_Seq_v2_mRNA_median_Zscores.txt | data_RNA_Seq_v2_mRNA_median_Zscores.txt | **Complete** | geneExpressionRnaSeqV2ZscoresStep | +| mirna-expression | meta_expression_miRNA.txt | data_expression_miRNA.txt | **Complete** | geneExpressionMiRnaStep | +| mirna-median-zscores | meta_miRNA_median_Zscores.txt | data_miRNA_median_Zscores.txt | **Complete** | geneExpressionMiRnaMedianZscoresStep | +| mirna-merged-median-zscores | meta_expression_merged_median_Zscores.txt | data_expression_merged_median_Zscores.txt | **Complete** | geneExpressionMiRnaMergedMedianZscoresStep | +| mrna-outliers | meta_mRNA_outliers.txt | data_mRNA_outliers.txt | **Complete** | geneExpressionMRnaOutliersStep | +| capture-gene-expression | meta_RNA_Seq_expression_capture.txt | data_RNA_Seq_expression_capture.txt | **Complete** | geneExpressionCaptureStep | +| capture-gene-expression-zscores | meta_RNA_Seq_expression_capture_Zscores.txt | data_RNA_Seq_expression_capture_Zscores.txt | **Complete** | geneExpressionCaptureZscoresStep | +| other-gene-expression-zscores | meta_expression_other_Zscores.txt | data_expression_other_Zscores.txt | **Complete** | geneExpressionOtherZscoresStep | +| mrna-seq-fpkm | meta_mRNA_seq_fpkm.txt | data_mRNA_seq_fpkm.txt | **Complete** | geneExpressionMRnaSeqFpkmStep | +| mrna-seq-rsem | meta_mrnaseq_rsem.txt | data_mrnaseq_rsem.txt | _In Progress_ | -- | +| mrna-seq-fcount | meta_mrnaseq_fcount.txt | data_mrnaseq_fcount.txt | _In Progress_ | -- | 
+_* **Note:** Support for_ `mrna-seq-rsem` _and_ `mrna-seq-fcount` _will be added after meta datatypes are resolved._ + +A main Flow (`geneExpressionFlow`) will execute the Step(s) for importing each gene expression datatype sequentially. All gene expression datatype steps will have the following components in common: +* `Tasklets`: Two universal tasklets are used for gene expression data steps. One tasklet loads and imports a genetic profile from a datatype's metafile and injects the genetic profile into the `JobExecutionContext`. The second tasklet injects the list of datafiles into the `JobExecutionContext`, as well as a MultiKeyMap of the profile metadata (header, non-case id columns, case id maps, normal case ids). This information is used by the profile data reader, as well as the gene expression data processors and writers. +* `Step`: A universal step builder for gene expression data implements the reading and writing of gene expression data using the genetic profile and the MultiKeyMap of datafile profile metadata (header, non-case id columns, case id maps, normal case ids) loaded from the `Tasklet`s above. A report of records loaded is printed after each file is read. +* `StepExecutionListener`: Before the gene expression datatype step executes, the listener will add the data loaded from the `Tasklet`s above to the `StepExecutionContext`. After the gene expression datatype step executes, the listener will report the total genes loaded, samples skipped, and entries skipped, as well as the total records imported into GENETIC_ALTERATION for the current datatype. + +As of now, dependencies are being ignored for any gene expression datatypes that require them as the importer is assuming that any conversions have already taken place and that z-scores have been generated.
+ +### [Methylation Data Step](#methylation-data-step) +The following datatype(s) are loaded and imported sequentially if metafiles are found for them: + +| Datatype | Meta Filename | Data Filename | Status | Step Name | +| ------------------ | --------------------------- | --------------------------- | ------------- | --------------------- | +| methylation-hm27 | meta_methylation_hm27.txt | data_methylation_hm27.txt | **Complete** | methylationHm27Step | +| methylation-hm450 | meta_methylation_hm450.txt | data_methylation_hm450.txt | **Complete** | methylationHm450Step | +| methylation-binary | meta_methylation_binary.txt | data_methylation_binary.txt | _In Progress_ | -- | +_* **Note:** Support for_ `methylation-binary` _will be added after meta datatype is resolved._ + +A main Flow (`methylationStepFlow`) will execute the Step(s) for importing each methylation data type sequentially. All methylation datatype steps will have the following components in common: +* `Tasklets`: Two universal tasklets are used for methylation data steps. One tasklet loads and imports a genetic profile from a datatype's metafile and injects the genetic profile into the `JobExecutionContext`. The second tasklet injects the list of datafiles into the `JobExecutionContext`, as well as a MultiKeyMap of the profile metadata (header, non-case id columns, case id maps, normal case ids). These information are used by the profile data reader and writer. +* `Step`: A universal step builder for methylation data implements the reading, processing, and writing of methylation data using the genetic profile and the MultiKeyMap of datafile profile metadata (header, non-case id columns, case id maps, normal case ids) loaded from the `Tasklet`s above. A report of records loaded is printed after each file is read. +* `StepExecutionListener`: Before the methylation datatype step executes, the listener will add the data loaded from the `Tasklet`s above to the `StepExecutionContext`. 
After the methylation datatype step executes, the listener will report the total genes loaded, samples skipped, and entries skipped, as well as the total records imported into GENETIC_ALTERATION for the current datatype. + +### [Mutation Data Step](#mutation-data-step) +The following datatype(s) are loaded and imported sequentially if metafiles are found for them: + +| Datatype | Meta Filename | Data Filename | Status | Step Name | +| ------------------- | --------------------------- | -------------------------------------- | ------------ | ---------------------- | +| mutation | meta_mutations_extended.txt | data_mutations_extended*.txt | **Complete** | mutationStep | +| mutation-germline | meta_mutations_extended.txt | data_mutations_germline.txt | **Complete** | mutationGermlineStep | +| mutation-manual | meta_mutations_extended.txt | data_mutations_manual.txt | **Complete** | mutationManualStep | + +A main Flow (`mutationStepFlow`) will execute the Step(s) for importing each mutation datatype sequentially. All mutation datatype steps will have the following components in common: +* `Tasklets`: Two universal tasklets are used for mutation data steps. One tasklet loads and imports a genetic profile from a datatype's metafile and injects the genetic profile into the `JobExecutionContext`. The second tasklet injects the list of datafiles into the `JobExecutionContext`, as well as a MultiKeyMap of the headers of each MAF file and number of records in the file (row count). These information are used by the mutation data readers, processors, and writers. +* `Step`: A universal step builder for mutation data implements the reading, processing, and writing of mutation data using the genetic profile and the MultiKeyMap of MAF file metadata (headers and number of records in file) loaded from the `Tasklet`s above. A report of mutations filtered out is printed after each file is read. 
+* `StepExecutionListener`: Before the mutation datatype step executes, the listener will add the data loaded from the `Tasklet`s above to the `StepExecutionContext`. After the mutation datatype step executes, the listener will report the total samples loaded, total genes loaded, samples skipped, and entries skipped, as well as the total records imported into MUTATION and MUTATION_EVENT for the current datatype. + + +### [Fusion Data Step](#fusion-data-step) +The following datatype(s) are loaded and imported sequentially if metafiles are found for them: + +| Datatype | Meta Filename | Data Filename | Status | Step Name | +| -------- | ---------------- | ---------------- | ------------ | ---------- | +| fusion | meta_fusions.txt | data_fusions.txt | **Complete** | fusionStep | + +A main Flow (`fusionStepFlow`) will execute the Step(s) for importing each fusion datatype sequentially. All fusion datatype steps will have the following components in common: +* `Tasklets`: Two universal tasklets are used for fusion data steps. One tasklet loads and imports a genetic profile from a datatype's metafile and injects the genetic profile into the `JobExecutionContext`. The second tasklet injects the list of datafiles into the `JobExecutionContext`, as well as a MultiKeyMap of the headers of each fusion file and number of records in the file (row count). This information is used by the fusion data readers, processors, and writers. +* `Step`: A step for fusion data implements the reading, processing, and writing of fusion data using the genetic profile and the MultiKeyMap of fusion file metadata (headers and number of records in file) loaded from the `Tasklet`s above. +* `StepExecutionListener`: Before the fusion datatype step executes, the listener will add the data loaded from the `Tasklet`s above to the `StepExecutionContext`.
After the fusion datatype step executes, the listener will report the total samples loaded, total genes loaded, samples skipped, and entries skipped, as well as the total records imported into MUTATION and MUTATION_EVENT for the current datatype. + +### [Structural Variant Data Step](#structural-variant-data-step) +The following datatype(s) are loaded and imported sequentially if metafiles are found for them: + +| Datatype | Meta Filename | Data Filename | Status | Step Name | +| ------------------ | ------------- | ------------- | ------------ | --------------------- | +| structural-variant | meta_SV.txt | data_SV.txt | **Complete** | structuralVariantStep | + +A main Flow (`structuralVariantStepFlow`) will execute the Step(s) for importing structural variant data. All structural variant datatype steps will have the following components in common: +* `Tasklet`: Two universal tasklets are used for structural variant data steps. One tasklet loads and imports a genetic profile from a datatype's metafile and injects the genetic profile into the `JobExecutionContext`. The second tasklet injects the list of datafiles into the `JobExecutionContext`, as well as a MultiKeyMap of the headers of each datafile and number of records in the file (row count). These information are used by the structural variant data reader, processor, and writer. +* `Step`: A step for structural variant data implements the reading and writing of structural variant data using the genetic profile and the MultiKeyMap of structural variant file metadata (headers and number of records in file) loaded from the `Tasklet` above. +* `StepExecutionListener`: Before the structural variant datatype step executes, the listener will add the data loaded from the `Tasklet` above to the `StepExecutionContext`. 
After the structural variant datatype step executes, the listener will report the total number of samples and entries skipped, as well as the total records imported into STRUCTURAL_VARIANT for the current datatype. + +### [Case List Step](#case-list-step) +Case lists are imported after every datatype importing step is executed. The following components make up the case list Step: +* `Reader`: A reader looks for the `case_lists` directory and will search for default (standard) case list filenames. If the case list file does not have the `case_list_ids` property set then the list of case ids that were imported for the case list staging files will be used instead (i.e., for the case list file `cases_sequenced.txt`, if the `case_list_ids` property is not defined then the set of case ids loaded during the `mutations` step will be used instead). +* `Writer`: A writer imports the case lists found by the reader. +* `StepExecutionListener`: The listener will add datatype metadata and the cancer study imported to the `StepExecutionContext` before the Step executes. After the Step executes, the listener reports how many records were imported into SAMPLE_LIST and SAMPLE_LIST_LIST. 
+ + diff --git a/docs/ImporterWorkflowDiagram.pdf b/docs/ImporterWorkflowDiagram.pdf new file mode 100644 index 0000000..9295606 Binary files /dev/null and b/docs/ImporterWorkflowDiagram.pdf differ diff --git a/importer/pom.xml b/importer/pom.xml new file mode 100644 index 0000000..eee9302 --- /dev/null +++ b/importer/pom.xml @@ -0,0 +1,135 @@ + + + 4.0.0 + cBioPortal Importer Pipeline + Spring Batch importer pipeline + importer + 0.1.0 + + + org.cbio.portal.pipelines + master + 0.1.0 + + + + + + org.mskcc.cbio + model + 0.1.0 + jar + + + org.mskcc.cbio + persistence-jdbc + 0.1.0 + jar + + + + org.springframework + spring-web + + + commons-lang + commons-lang + 2.4 + + + commons-cli + commons-cli + 1.3 + + + commons-collections + commons-collections + jar + + + + + org.springframework + spring-jdbc + jar + + + commons-dbcp + commons-dbcp + jar + + + mysql + mysql-connector-java + + + + com.google.guava + guava + 19.0 + jar + + + + + + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.5.1 + + 1.8 + 1.8 + + + + + + + + model-mvn-repo + https://raw.github.com/angelicaochoa/persistence/model-0.1.0-mvn-repo/ + + true + always + + + + persistence-mybatis-mvn-repo + https://raw.github.com/angelicaochoa/persistence/persistence-mybatis-0.1.0-mvn-repo/ + + true + always + + + + persistence-jdbc-mvn-repo + https://raw.github.com/angelicaochoa/persistence/persistence-jdbc-0.1.0-mvn-repo/ + + true + always + + + + \ No newline at end of file diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/ImporterPipeline.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/ImporterPipeline.java new file mode 100644 index 0000000..bba5d54 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/ImporterPipeline.java @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. 
+ * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package org.cbio.portal.pipelines.importer;
+
+import org.cbio.portal.pipelines.importer.config.BatchConfiguration;
+
+import java.io.*;
+import java.util.*;
+import org.apache.commons.cli.*;
+import org.apache.commons.logging.*;
+
+import org.springframework.boot.SpringApplication;
+import org.springframework.boot.autoconfigure.SpringBootApplication;
+import org.springframework.context.ConfigurableApplicationContext;
+import org.springframework.batch.core.*;
+import org.springframework.batch.core.launch.JobLauncher;
+
+/**
+ * Command-line entry point of the importer pipeline.
+ * Parses the command line and launches the study import job and/or the
+ * study deletion job inside a freshly started Spring application context.
+ *
+ * @author ochoaa
+ */
+@SpringBootApplication
+public class ImporterPipeline {
+
+    private static final Log LOG = LogFactory.getLog(ImporterPipeline.class);
+
+    /**
+     * Builds the command-line options understood by the pipeline.
+     *
+     * @param args raw command-line arguments (not consulted)
+     * @return Options for help (-h), import (-i) and delete (-d)
+     */
+    private static Options getOptions(String[] args) {
+        Options gnuOptions = new Options();
+        gnuOptions.addOption("h", "help", false, "shows this help document and quits.")
+            .addOption("i", "import_study", true, "Cancer study directory to import")
+            .addOption("d", "delete_study", true, "Cancer study identifier for deleting study");
+        return gnuOptions;
+    }
+
+    /**
+     * Prints the usage text and terminates the JVM.
+     *
+     * @param gnuOptions options to describe
+     * @param exitStatus process exit status
+     */
+    private static void help(Options gnuOptions, int exitStatus) {
+        HelpFormatter helpFormatter = new HelpFormatter();
+        helpFormatter.printHelp("ImporterPipeline", gnuOptions);
+        System.exit(exitStatus);
+    }
+
+    /**
+     * Runs the batch study importer job for the given staging directory.
+     * An error is logged when the job finishes with exit code "STOPPED".
+     *
+     * @param args raw command-line arguments, handed to Spring Boot
+     * @param stagingDirectory cancer study directory to import
+     * @throws Exception if the context cannot start or the job cannot be launched
+     */
+    private static void launchImporterJob(String[] args, String stagingDirectory) throws Exception {
+        SpringApplication app = new SpringApplication(ImporterPipeline.class);
+        // try-with-resources: the context is closed even if bean lookup or the job launch throws
+        try (ConfigurableApplicationContext ctx = app.run(args)) {
+            JobLauncher jobLauncher = ctx.getBean(JobLauncher.class);
+            Job batchImporterJob = ctx.getBean(BatchConfiguration.BATCH_STUDY_IMPORTER_JOB, Job.class);
+
+            // the date parameter makes every launch a distinct set of job parameters
+            JobParameters jobParameters = new JobParametersBuilder()
+                    .addString("stagingDirectory", stagingDirectory)
+                    .addDate("date", new Date())
+                    .toJobParameters();
+
+            JobExecution jobExecution = jobLauncher.run(batchImporterJob, jobParameters);
+            if ("STOPPED".equals(jobExecution.getExitStatus().getExitCode())) {
+                LOG.error("Error importing cancer study.");
+            }
+        }
+    }
+
+    /**
+     * Runs the delete cancer study job for the given study identifier.
+     * An error is logged when the job finishes with exit code FAILED.
+     *
+     * @param args raw command-line arguments, handed to Spring Boot
+     * @param cancerStudyIdentifier identifier of the study to delete
+     * @throws Exception if the context cannot start or the job cannot be launched
+     */
+    private static void launchDeleteStudyJob(String[] args, String cancerStudyIdentifier) throws Exception {
+        SpringApplication app = new SpringApplication(ImporterPipeline.class);
+        // try-with-resources: the context is closed even if bean lookup or the job launch throws
+        try (ConfigurableApplicationContext ctx = app.run(args)) {
+            JobLauncher jobLauncher = ctx.getBean(JobLauncher.class);
+            Job deleteStudyJob = ctx.getBean(BatchConfiguration.DELETE_CANCER_STUDY_JOB, Job.class);
+
+            JobParameters jobParameters = new JobParametersBuilder()
+                    .addString("cancerStudyIdentifier", cancerStudyIdentifier)
+                    .toJobParameters();
+
+            JobExecution jobExecution = jobLauncher.run(deleteStudyJob, jobParameters);
+            // report a failed deletion instead of silently discarding the execution result
+            if (ExitStatus.FAILED.getExitCode().equals(jobExecution.getExitStatus().getExitCode())) {
+                LOG.error("Error deleting cancer study: " + cancerStudyIdentifier);
+            }
+        }
+    }
+
+    /**
+     * Parses the command line and dispatches to the requested job(s).
+     * When both -d and -i are given, the deletion runs before the import.
+     *
+     * @param args command-line arguments
+     * @throws Exception if argument parsing or a job launch fails
+     */
+    public static void main(String[] args) throws Exception {
+        Options gnuOptions = ImporterPipeline.getOptions(args);
+        CommandLineParser parser = new GnuParser();
+        CommandLine commandLine = parser.parse(gnuOptions, args);
+        if (commandLine.hasOption("h") ||
+            (!commandLine.hasOption("i") && !commandLine.hasOption("d"))) {
+            help(gnuOptions, 0);
+        }
+
+        if (commandLine.hasOption("d")) {
+            String cancerStudyIdentifier = commandLine.getOptionValue("d");
+            launchDeleteStudyJob(args, cancerStudyIdentifier);
+        }
+
+        if (commandLine.hasOption("i")) {
+            String stagingDirectory = commandLine.getOptionValue("i");
+            if (!(new File(stagingDirectory).exists())) {
+                LOG.error("Staging directory does not exist - please check argument: " + stagingDirectory);
+                System.exit(2);
+            }
+            launchImporterJob(args, stagingDirectory);
+        }
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/BatchConfiguration.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/BatchConfiguration.java
new file mode 100644
index 0000000..0b1f110
--- /dev/null
+++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/BatchConfiguration.java
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center.
+ * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package org.cbio.portal.pipelines.importer.config;
+
+import org.mskcc.cbio.persistence.jdbc.InfoJdbcDaoImpl;
+import org.cbio.portal.pipelines.importer.config.listener.BatchStudyImporterListener;
+
+import javax.annotation.Resource;
+import org.apache.commons.logging.*;
+
+import org.springframework.batch.core.*;
+import org.springframework.batch.core.configuration.annotation.*;
+import org.springframework.batch.core.job.builder.FlowBuilder;
+import org.springframework.batch.core.job.flow.*;
+import org.springframework.batch.core.launch.support.RunIdIncrementer;
+
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.context.annotation.*;
+
+/**
+ * Spring Batch configuration declaring the study import job, the study
+ * deletion job, and the flow, listener and decider used by the import job.
+ *
+ * @author ochoaa
+ */
+@Configuration
+@EnableBatchProcessing
+@ComponentScan("org.mskcc.cbio.persistence.jdbc")
+public class BatchConfiguration {
+
+    public static final String BATCH_STUDY_IMPORTER_JOB = "batchStudyImporterJob";
+    public static final String DELETE_CANCER_STUDY_JOB = "deleteCancerStudyJob";
+
+    @Autowired
+    public JobBuilderFactory jobBuilderFactory;
+
+    @Autowired
+    public StepBuilderFactory stepBuilderFactory;
+
+    // DB schema version the portal expects; only used in the decider's error message
+    @Value("${db.version}")
+    private String dbVersion;
+
+    @Resource(name="importCancerStudy")
+    public Step importCancerStudy;
+
+    @Resource(name="deleteCancerStudy")
+    public Step deleteCancerStudy;
+
+    @Autowired
+    InfoJdbcDaoImpl infoJdbcDaoImpl;
+
+    private static final Log LOG = LogFactory.getLog(BatchConfiguration.class);
+
+    /**
+     * Job for importing a cancer study.
+     * Runs the batch study importer flow with a run id incrementer and
+     * the batch study importer listener attached.
+     *
+     * @return Job
+     * @throws Exception
+     */
+    @Bean
+    public Job batchStudyImporterJob() throws Exception {
+        return jobBuilderFactory.get(BATCH_STUDY_IMPORTER_JOB)
+                .incrementer(new RunIdIncrementer())
+                .listener(batchStudyImporterListener())
+                .start(batchStudyImporterFlow())
+                .build()   // closes the flow definition
+                .build();  // builds the job itself
+    }
+
+    /**
+     * Job for deleting a cancer study given a cancer study identifier.
+     * Consists of the single "deleteCancerStudy" step.
+     *
+     * @return Job
+     * @throws Exception
+     */
+    @Bean
+    public Job deleteCancerStudyJob() throws Exception {
+        return jobBuilderFactory.get(DELETE_CANCER_STUDY_JOB)
+                .start(deleteCancerStudy)
+                .build();
+    }
+
+    /**
+     * Flow for initiating cancer study batch import.
+     * The decider runs first: on "STOPPED" the flow ends, on "CONTINUE"
+     * the "importCancerStudy" step is executed.
+     *
+     * @return Flow
+     */
+    @Bean
+    public Flow batchStudyImporterFlow() {
+        // explicit <Flow> type argument: a raw FlowBuilder would make build() return Object
+        return new FlowBuilder<Flow>("batchStudyImporterFlow")
+                .start(batchStudyImporterDecider())
+                    .on("STOPPED").end()
+                .from(batchStudyImporterDecider())
+                    .on("CONTINUE")
+                        .to(importCancerStudy)
+                .build();
+    }
+
+    /**
+     * Listener for batch study importer job execution.
+     *
+     * @return JobExecutionListener
+     */
+    @Bean
+    public JobExecutionListener batchStudyImporterListener() {
+        return new BatchStudyImporterListener();
+    }
+
+    /**
+     * Decider for checking DB schema compatibility.
+     * Returns STOPPED (after logging the expected and found versions) when
+     * the DAO reports a version mismatch, and "CONTINUE" otherwise.
+     *
+     * @return JobExecutionDecider
+     */
+    @Bean
+    public JobExecutionDecider batchStudyImporterDecider() {
+        return (JobExecution jobExecution, StepExecution stepExecution) -> {
+            LOG.info("Checking DB schema compatibility");
+            if (!infoJdbcDaoImpl.checkPortalDbVersion()) {
+                LOG.error("DB version expected by portal: " + dbVersion +
+                        ". DB schema version found: " + infoJdbcDaoImpl.getDbSchemaVersion());
+                return FlowExecutionStatus.STOPPED;
+            }
+            else {
+                LOG.info("DB schema version matches version expected by portal");
+                return new FlowExecutionStatus("CONTINUE");
+            }
+        };
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/CaseListMetadata.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/CaseListMetadata.java
new file mode 100644
index 0000000..8e9968b
--- /dev/null
+++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/CaseListMetadata.java
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS
+ * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder
+ * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no
+ * obligations to provide maintenance, support, updates, enhancements or
+ * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be
+ * liable to any party for direct, indirect, special, incidental or
+ * consequential damages, including lost profits, arising out of the use of this
+ * software and its documentation, even if Memorial Sloan-Kettering Cancer
+ * Center has been advised of the possibility of such damage.
+ */
+
+/*
+ * This file is part of cBioPortal.
+ *
+ * cBioPortal is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package org.cbio.portal.pipelines.importer.config;
+
+import java.util.*;
+import org.springframework.context.annotation.*;
+
+/**
+ * Configuration exposing the standard case list definitions as a bean.
+ *
+ * @author ochoaa
+ */
+@Configuration
+public class CaseListMetadata {
+
+    /**
+     * Standard case list meta data map.
+     * key = case list filename
+     * value = |-delimited datatypes
+     * & indicates intersection of 1+ datatypes
+     *
+     * Entries are kept in insertion order (LinkedHashMap).
+     *
+     * @return map of case list filename to datatype expression
+     */
+    @Bean(name="caseListMetadataMap")
+    public Map<String, String> caseListMetadataMap() {
+        // parameterized instead of raw: every key and value stored here is a String
+        Map<String, String> caseListMetadataMap = new LinkedHashMap<>();
+        caseListMetadataMap.put("cases_all.txt", "cna-gistic|rnaseq-gene-expression-zscores|rnaseq-v2-gene-expression-zscores|agilent-gene-expression-zscores|mutation|methylation-hm27|methylation-hm450|rppa|fusion|structural-variant|clinical|clinical-sample|bcr-clinical-sample");
+        caseListMetadataMap.put("cases_cna.txt", "cna-gistic");
+        caseListMetadataMap.put("cases_cnaseq.txt", "cna-gistic&mutation");
+        caseListMetadataMap.put("cases_complete.txt", "rnaseq-v2-gene-expression-zscores&cna-gistic&mutation|rnaseq-gene-expression&cna-gistic&mutation|agilent-gene-expression-zscores&cna-gistic&mutation|affymetrix-gene-expression-zscores&cna-gistic&mutation");
+        caseListMetadataMap.put("cases_log2CNA.txt", "log2-cna");
+        caseListMetadataMap.put("cases_methylation_hm27.txt", "methylation-hm27");
+        caseListMetadataMap.put("cases_methylation_hm450.txt", "methylation-hm450");
+        caseListMetadataMap.put("cases_miRNA.txt", "mirna-median-zscores");
+        caseListMetadataMap.put("cases_mRNA_U133.txt", "affymetrix-gene-expression-zscores");
+        caseListMetadataMap.put("cases_mRNA.txt", "agilent-gene-expression-zscores");
+        caseListMetadataMap.put("cases_RNA_Seq_mRNA.txt", "rnaseq-gene-expression");
+        caseListMetadataMap.put("cases_RNA_Seq_v2_mRNA.txt", "rnaseq-v2-gene-expression-zscores");
+        caseListMetadataMap.put("cases_rppa.txt", "rppa");
+        caseListMetadataMap.put("cases_sequenced.txt", "mutation");
+
+        return caseListMetadataMap;
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/DatatypeMetadata.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/DatatypeMetadata.java
new file mode 100644
index 0000000..6618139
--- /dev/null
+++
b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/DatatypeMetadata.java
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS
+ * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder
+ * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no
+ * obligations to provide maintenance, support, updates, enhancements or
+ * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be
+ * liable to any party for direct, indirect, special, incidental or
+ * consequential damages, including lost profits, arising out of the use of this
+ * software and its documentation, even if Memorial Sloan-Kettering Cancer
+ * Center has been advised of the possibility of such damage.
+ */
+
+/*
+ * This file is part of cBioPortal.
+ *
+ * cBioPortal is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package org.cbio.portal.pipelines.importer.config;
+
+import org.apache.commons.collections.map.*;
+import org.springframework.context.annotation.*;
+
+/**
+ * Configuration mapping each supported datatype to its meta filename.
+ * Every entry is stored under the key pair (datatype, "meta_filename").
+ *
+ * @author ochoaa
+ */
+@Configuration
+public class DatatypeMetadata {
+
+    /**
+     * Builds an insertion-ordered multi-key map from {datatype, meta filename} rows.
+     *
+     * @param rows two-element arrays: datatype, then its meta filename
+     * @return MultiKeyMap
+     */
+    private static MultiKeyMap metaFilenames(String[][] rows) {
+        MultiKeyMap metadataMap = MultiKeyMap.decorate(new LinkedMap());
+        for (String[] row : rows) {
+            metadataMap.put(row[0], "meta_filename", row[1]);
+        }
+        return metadataMap;
+    }
+
+    /**
+     * All datatype metadata configurations, merged group by group.
+     *
+     * @return MultiKeyMap
+     */
+    @Bean(name="datatypeMetadataMap")
+    public MultiKeyMap datatypeMetadataMap() {
+        MultiKeyMap[] groups = {
+            clinicalMetadata(),
+            timelineMetadata(),
+            mutSigMetadata(),
+            copyNumberSegmentMetadata(),
+            gisticMetadata(),
+            proteinLevelMetadata(),
+            cnaMetadata(),
+            geneExpressionMetadata(),
+            methylationMetadata(),
+            mutationMetadata(),
+            fusionMetadata(),
+            structuralVariantMetadata()
+        };
+
+        MultiKeyMap combined = MultiKeyMap.decorate(new LinkedMap());
+        for (MultiKeyMap group : groups) {
+            combined.putAll(group);
+        }
+        return combined;
+    }
+
+    /**
+     * Clinical datatype metadata configuration.
+     *
+     * @return MultiKeyMap
+     */
+    public MultiKeyMap clinicalMetadata() {
+        return metaFilenames(new String[][] {
+            {"clinical", "meta_clinical.txt"},
+            {"clinical-patient", "meta_clinical_patient.txt"},
+            {"clinical-sample", "meta_clinical_sample.txt"},
+            {"bcr-clinical", "meta_bcr_clinical.txt"},
+            {"bcr-clinical-patient", "meta_bcr_clinical_patient.txt"},
+            {"bcr-clinical-sample", "meta_bcr_clinical_sample.txt"},
+            {"clinical-supp", "meta_clinical_supp.txt"}
+        });
+    }
+
+    /**
+     * Timeline datatype metadata configuration.
+     *
+     * @return MultiKeyMap
+     */
+    public MultiKeyMap timelineMetadata() {
+        return metaFilenames(new String[][] {
+            {"time-line-data", "meta_timeline.txt"}
+        });
+    }
+
+    /**
+     * MutSig (mutation significance) datatype metadata configuration.
+     *
+     * @return MultiKeyMap
+     */
+    public MultiKeyMap mutSigMetadata() {
+        return metaFilenames(new String[][] {
+            {"mutation-significance-v2", "meta_mutsig.txt"}
+        });
+    }
+
+    /**
+     * Copy number segment datatype metadata configuration
+     * (one entry each for reference genomes hg19 and hg18).
+     *
+     * @return MultiKeyMap
+     */
+    public MultiKeyMap copyNumberSegmentMetadata() {
+        return metaFilenames(new String[][] {
+            {"cna-hg19-seg", "_meta_cna_hg19_seg.txt"},
+            {"cna-hg18-seg", "_meta_cna_hg18_seg.txt"}
+        });
+    }
+
+    /**
+     * Gistic datatype metadata configuration.
+     *
+     * @return MultiKeyMap
+     */
+    public MultiKeyMap gisticMetadata() {
+        return metaFilenames(new String[][] {
+            {"gistic-genes-amp", "meta_gistic_genes_amp.txt"},
+            {"gistic-genes-del", "meta_gistic_genes_del.txt"}
+        });
+    }
+
+    /**
+     * Protein-level datatype metadata configuration.
+     *
+     * @return MultiKeyMap
+     */
+    public MultiKeyMap proteinLevelMetadata() {
+        return metaFilenames(new String[][] {
+            {"rppa", "meta_rppa.txt"},
+            {"rppa-zscores", "meta_rppa_Zscores.txt"},
+            {"protein-quantification", "meta_protein_quantification.txt"}
+        });
+    }
+
+    /**
+     * CNA datatype metadata configuration.
+     *
+     * @return MultiKeyMap
+     */
+    public MultiKeyMap cnaMetadata() {
+        return metaFilenames(new String[][] {
+            {"cna-gistic", "meta_CNA.txt"},
+            {"cna-foundation", "meta_CNA_foundation.txt"},
+            {"cna-rae", "meta_CNA_RAE.txt"},
+            {"cna-consensus", "meta_CNA_consensus.txt"},
+            {"linear-cna-gistic", "meta_linear_CNA.txt"},
+            {"log2-cna", "meta_log2CNA.txt"}
+        });
+    }
+
+    /**
+     * Gene expression datatype metadata configuration.
+     *
+     * @return MultiKeyMap
+     */
+    public MultiKeyMap geneExpressionMetadata() {
+        return metaFilenames(new String[][] {
+            {"affymetrix-gene-expression", "meta_expression.txt"},
+            {"affymetrix-gene-expression-zscores", "meta_expression_Zscores.txt"},
+            {"gene-expression-merged", "meta_expression_merged.txt"},
+            {"gene-expression-merged-zscores", "meta_expression_merged_Zscores.txt"},
+            {"rnaseq-gene-expression", "meta_RNA_Seq_expression_median.txt"},
+            {"rnaseq-gene-expression-zscores", "meta_RNA_Seq_mRNA_median_Zscores.txt"},
+            {"agilent-gene-expression", "meta_expression_median.txt"},
+            {"agilent-gene-expression-zscores", "meta_mRNA_median_Zscores.txt"},
+            {"rnaseq-v2-gene-expression", "meta_RNA_Seq_v2_expression_median.txt"},
+            {"rnaseq-v2-gene-expression-zscores", "meta_RNA_Seq_v2_mRNA_median_Zscores.txt"},
+            {"mirna-expression", "meta_expression_miRNA.txt"},
+            {"mirna-median-zscores", "meta_miRNA_median_Zscores.txt"},
+            {"mirna-merged-median-zscores", "meta_expression_merged_median_Zscores.txt"},
+            {"mrna-outliers", "meta_mRNA_outliers.txt"},
+            {"capture-gene-expression", "meta_RNA_Seq_expression_capture.txt"},
+            {"capture-gene-expression-zscores", "meta_RNA_Seq_expression_capture_Zscores.txt"},
+            {"other-gene-expression-zscores", "meta_expression_other_Zscores.txt"},
+            {"mrna-seq-fpkm", "meta_mRNA_seq_fpkm.txt"}
+        });
+    }
+
+    /**
+     * Methylation datatype metadata configuration.
+     *
+     * @return MultiKeyMap
+     */
+    public MultiKeyMap methylationMetadata() {
+        return metaFilenames(new String[][] {
+            {"methylation-hm27", "meta_methylation_hm27.txt"},
+            {"methylation-hm450", "meta_methylation_hm450.txt"}
+        });
+    }
+
+    /**
+     * Mutation datatype metadata configuration.
+     *
+     * @return MultiKeyMap
+     */
+    public MultiKeyMap mutationMetadata() {
+        return metaFilenames(new String[][] {
+            {"mutation", "meta_mutations_extended.txt"},
+            {"mutation-germline", "meta_mutations_germline.txt"},
+            {"mutation-manual", "meta_mutations_manual.txt"}
+        });
+    }
+
+    /**
+     * Fusion datatype metadata configuration.
+     *
+     * @return MultiKeyMap
+     */
+    public MultiKeyMap fusionMetadata() {
+        return metaFilenames(new String[][] {
+            {"fusion", "meta_fusions.txt"}
+        });
+    }
+
+    /**
+     * Structural variant datatype metadata configuration.
+     *
+     * @return MultiKeyMap
+     */
+    public MultiKeyMap structuralVariantMetadata() {
+        return metaFilenames(new String[][] {
+            {"structural-variant", "meta_SV.txt"}
+        });
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/JdbcConfiguration.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/JdbcConfiguration.java
new file mode 100644
index 0000000..552cd3e
--- /dev/null
+++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/JdbcConfiguration.java
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS
+ * FOR A PARTICULAR PURPOSE.
The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package org.cbio.portal.pipelines.importer.config;
+
+import java.sql.SQLException;
+import javax.sql.DataSource;
+import org.apache.commons.dbcp.BasicDataSource;
+
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.context.annotation.*;
+import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
+import org.springframework.transaction.*;
+
+/**
+ * JDBC configuration built from the db.* properties.
+ *
+ * @author ochoaa
+ */
+@Configuration
+public class JdbcConfiguration {
+
+    @Value("${db.user}")
+    private String dbUser;
+
+    @Value("${db.password}")
+    private String dbPassword;
+
+    @Value("${db.driver}")
+    private String dbDriver;
+
+    @Value("${db.url}")
+    private String dbUrl;
+
+    /**
+     * Creates a new DBCP data source from the configured driver, url,
+     * user and password, with READ_UNCOMMITTED as the default transaction
+     * isolation. This method carries no bean annotation, so each call
+     * builds a separate data source.
+     *
+     * @return DataSource
+     * @throws SQLException
+     */
+    public DataSource mainDataSource() throws SQLException {
+        BasicDataSource pool = new BasicDataSource();
+        pool.setDriverClassName(dbDriver);
+        pool.setUrl(dbUrl);
+        pool.setUsername(dbUser);
+        pool.setPassword(dbPassword);
+        pool.setDefaultTransactionIsolation(TransactionDefinition.ISOLATION_READ_UNCOMMITTED);
+
+        return pool;
+    }
+
+    /**
+     * Named parameter jdbc template for the persistence layer, backed by
+     * the main data source.
+     *
+     * @return NamedParameterJdbcTemplate
+     * @throws SQLException
+     */
+    @Bean(name="namedParameterJdbcTemplate")
+    public NamedParameterJdbcTemplate namedParameterJdbcTemplate() throws SQLException {
+        DataSource dataSource = mainDataSource();
+        return new NamedParameterJdbcTemplate(dataSource);
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/composite/CompositeClinicalData.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/composite/CompositeClinicalData.java
new file mode 100644
index 0000000..658f5cd
--- /dev/null
+++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/composite/CompositeClinicalData.java
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center.
+ * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package org.cbio.portal.pipelines.importer.config.composite;
+
+import org.mskcc.cbio.model.*;
+
+import java.util.*;
+import java.io.Serializable;
+
+
+/**
+ * Composite clinical data object bundling a patient, a sample and the
+ * clinical data maps attached to them.
+ *
+ * @author ochoaa
+ */
+public class CompositeClinicalData implements Serializable {
+
+    private Patient patient;
+    private Sample sample;
+    // NOTE(review): the type arguments of these maps are not visible in this
+    // view of the file; the update methods below require the values of the
+    // patient/sample maps to expose setInternalId(int) - confirm against the
+    // original declaration.
+    private Map patientClinicalData;
+    private Map sampleClinicalData;
+    private Map compositeClinicalDataMap;
+
+    /**
+     * @return the patient
+     */
+    public Patient getPatient() {
+        return patient;
+    }
+
+    /**
+     * @param patient the patient to set
+     */
+    public void setPatient(Patient patient) {
+        this.patient = patient;
+    }
+
+    /**
+     * @return the sample
+     */
+    public Sample getSample() {
+        return sample;
+    }
+
+    /**
+     * @param sample the sample to set
+     */
+    public void setSample(Sample sample) {
+        this.sample = sample;
+    }
+
+    /**
+     * @return the patientClinicalData
+     */
+    public Map getPatientClinicalData() {
+        return patientClinicalData;
+    }
+
+    /**
+     * @param patientClinicalData the patientClinicalData to set
+     */
+    public void setPatientClinicalData(Map patientClinicalData) {
+        this.patientClinicalData = patientClinicalData;
+    }
+
+    /**
+     * @return the sampleClinicalData
+     */
+    public Map getSampleClinicalData() {
+        return sampleClinicalData;
+    }
+
+    /**
+     * @param sampleClinicalData the sampleClinicalData to set
+     */
+    public void setSampleClinicalData(Map sampleClinicalData) {
+        this.sampleClinicalData = sampleClinicalData;
+    }
+
+    /**
+     * @return the compositeClinicalDataMap
+     */
+    public Map getCompositeClinicalDataMap() {
+        return compositeClinicalDataMap;
+    }
+
+    /**
+     * @param compositeClinicalDataMap the compositeClinicalDataMap to set
+     */
+    public void setCompositeClinicalDataMap(Map compositeClinicalDataMap) {
+        this.compositeClinicalDataMap = compositeClinicalDataMap;
+    }
+
+    /**
+     * Propagate internal id update to patient clinical data.
+     * Sets the id on the patient, on the sample's patient id when that is
+     * still -1, and on every value of the patient clinical data map.
+     * The patient, sample and patient clinical data map must have been set.
+     *
+     * @param internalId
+     */
+    public void updatePatientInternalId(int internalId) {
+        this.patient.setInternalId(internalId);
+        if (this.sample.getPatientId() == -1) {
+            this.sample.setPatientId(internalId);
+        }
+        // iterate the values directly: no per-key lookup, and an empty map is a no-op
+        this.patientClinicalData.values().forEach((clinicalData) -> {
+            clinicalData.setInternalId(internalId);
+        });
+    }
+
+    /**
+     * Propagate internal id update to sample clinical data.
+     * Sets the id on the sample, copies the sample's patient id to the
+     * patient when the patient's internal id is still -1, and sets the id
+     * on every value of the sample clinical data map.
+     * The patient, sample and sample clinical data map must have been set.
+     *
+     * @param internalId
+     */
+    public void updateSampleInternalId(int internalId) {
+        this.sample.setInternalId(internalId);
+        if (this.patient.getInternalId() == -1) {
+            this.patient.setInternalId(this.sample.getPatientId());
+        }
+        // iterate the values directly: no per-key lookup, and an empty map is a no-op
+        this.sampleClinicalData.values().forEach((clinicalData) -> {
+            clinicalData.setInternalId(internalId);
+        });
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/composite/CompositeMutationData.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/composite/CompositeMutationData.java
new file mode 100644
index 0000000..7c1c829
--- /dev/null
+++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/composite/CompositeMutationData.java
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS
+ * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder
+ * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no
+ * obligations to provide maintenance, support, updates, enhancements or
+ * modifications.
In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +package org.cbio.portal.pipelines.importer.config.composite; + +import org.mskcc.cbio.model.*; + +/** + * + * @author ochoaa + */ +public class CompositeMutationData { + + private Integer mutationEventId; + private Integer geneticProfileId; + private Mutation mutation; + + /** + * @return the mutationEventId + */ + public Integer getMutationEventId() { + return mutationEventId; + } + + /** + * @param mutationEventId the mutationEventId to set + */ + public void setMutationEventId(Integer mutationEventId) { + this.mutationEventId = mutationEventId; + this.mutation.setMutationEventId(mutationEventId); + this.mutation.getMutationEvent().setMutationEventId(mutationEventId); + } + + /** + * @return the geneticProfileId + */ + public Integer getGeneticProfileId() { + return geneticProfileId; + } + + /** + * @param geneticProfileId the geneticProfileId to set + */ + public void setGeneticProfileId(Integer geneticProfileId) { + this.geneticProfileId = geneticProfileId; + this.mutation.setGeneticProfileId(geneticProfileId); + } + + /** + * @return the mutation + */ + public Mutation getMutation() { + return mutation; + } + + /** + * @param mutation the mutation to set + */ + public void setMutation(Mutation mutation) { + this.mutation = mutation; + } + + /** + * @param mutationEvent the mutationEvent to update + */ + public void updateMutationEvent(MutationEvent mutationEvent) { + this.setMutationEventId(mutationEvent.getMutationEventId()); + this.mutation.setMutationEvent(mutationEvent); + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/composite/CompositeProfileData.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/composite/CompositeProfileData.java new file mode 100644 index 0000000..c757801 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/composite/CompositeProfileData.java @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. 
+ * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +package org.cbio.portal.pipelines.importer.config.composite; + +import org.mskcc.cbio.model.*; + +import java.util.*; + +/** + * + * @author ochoaa + */ +public class CompositeProfileData { + + private GeneticProfile geneticProfile; + private Gene gene; + private Map caseProfileDataRecords; + private List profileCnaEvents; + + /** + * @return the geneticProfile + */ + public GeneticProfile getGeneticProfile() { + return geneticProfile; + } + + /** + * @param geneticProfile the geneticProfile to set + */ + public void setGeneticProfile(GeneticProfile geneticProfile) { + this.geneticProfile = geneticProfile; + } + + /** + * @return the gene + */ + public Gene getGene() { + return gene; + } + + /** + * @param gene the gene to set + */ + public void setGene(Gene gene) { + this.gene = gene; + } + + /** + * @return the caseProfileDataRecords + */ + public Map getCaseProfileDataRecords() { + return caseProfileDataRecords; + } + + /** + * @param caseProfileDataRecords the caseProfileDataRecords to set + */ + public void setCaseProfileDataRecords(Map caseProfileDataRecords) { + this.caseProfileDataRecords = caseProfileDataRecords; + } + + /** + * @return the profileCnaEvents + */ + public List getProfileCnaEvents() { + return profileCnaEvents; + } + + /** + * @param profileCnaEvents the profileCnaEvents to set + */ + public void setProfileCnaEvents(List profileCnaEvents) { + this.profileCnaEvents = profileCnaEvents; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/BatchStudyImporterListener.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/BatchStudyImporterListener.java new file mode 100644 index 0000000..d15ece3 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/BatchStudyImporterListener.java @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. 
+ * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.cbio.portal.pipelines.importer.config.listener; + +import org.mskcc.cbio.model.CancerStudy; +import org.mskcc.cbio.persistence.jdbc.CancerStudyJdbcDaoImpl; + +import org.apache.commons.logging.*; +import org.springframework.batch.core.*; +import org.springframework.beans.factory.annotation.Autowired; + +/** + * Listener for batch study importer job execution. 
+ * + * @author ochoaa + */ +public class BatchStudyImporterListener implements JobExecutionListener { + + @Autowired + CancerStudyJdbcDaoImpl cancerStudyJdbcDaoImpl; + + private static final Log LOG = LogFactory.getLog(BatchStudyImporterListener.class); + + @Override + public void beforeJob(JobExecution jobExecution) {} + + @Override + public void afterJob(JobExecution jobExecution) { + CancerStudy cancerStudy = (CancerStudy) jobExecution.getExecutionContext().get("cancerStudy"); + boolean rollbackCancerStudyState = (boolean) jobExecution.getExecutionContext().get("rollbackCancerStudyState"); + + // delete cancer study if rollback cancer study state is true or if exit status not COMPLETED + if (jobExecution.getExitStatus().equals(ExitStatus.COMPLETED) && !rollbackCancerStudyState) { + LOG.info("Import complete for cancer study: " + cancerStudy.getCancerStudyIdentifier()); + } + else { + LOG.error("Job STOPPED or FAILED for study: " + cancerStudy.getCancerStudyIdentifier()); + if (rollbackCancerStudyState) { + LOG.error("Cancer study import contains errors - deleting imported data for study: " + cancerStudy.getCancerStudyIdentifier()); + } + cancerStudyJdbcDaoImpl.deleteCancerStudy(cancerStudy.getCancerStudyId()); + } + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/CancerStudyListener.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/CancerStudyListener.java new file mode 100644 index 0000000..b42084d --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/CancerStudyListener.java @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. 
The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +package org.cbio.portal.pipelines.importer.config.listener; + +import org.springframework.batch.core.*; + +/** + * Simple step listener for cancer study import + * @author ochoaa + */ +public class CancerStudyListener implements StepExecutionListener { + + @Override + public void beforeStep(StepExecution stepExecution) {} + + @Override + public ExitStatus afterStep(StepExecution stepExecution) { + return ExitStatus.COMPLETED; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/CaseListListener.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/CaseListListener.java new file mode 100644 index 0000000..c05c5ee --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/CaseListListener.java @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.cbio.portal.pipelines.importer.config.listener; + +import org.apache.commons.logging.*; +import org.springframework.batch.core.*; + +/** + * + * @author ochoaa + */ +public class CaseListListener implements StepExecutionListener { + + private static final Log LOG = LogFactory.getLog(CaseListListener.class); + + @Override + public void beforeStep(StepExecution stepExecution) { + // add cancer study and datatype metadata to step execution context + stepExecution.getExecutionContext().put("cancerStudy", stepExecution.getJobExecution().getExecutionContext().get("cancerStudy")); + stepExecution.getExecutionContext().put("datatypeMetadata", stepExecution.getJobExecution().getExecutionContext().get("datatypeMetadata")); + } + + @Override + public ExitStatus afterStep(StepExecution stepExecution) { + String stepName = stepExecution.getStepName(); + + // get counts for sample list and sample list list + int sampleListDataCount = (int) stepExecution.getExecutionContext().get("sampleListDataCount"); + int sampleListListDataCount = (int) stepExecution.getExecutionContext().get("sampleListListDataCount"); + + if (sampleListDataCount == 0 && sampleListListDataCount == 0) { + LOG.error("Error import sample list records into SAMPLE_LIST and SAMPLE_LIST_LIST"); + // change rollback state for cancer study to true to return to saved state + stepExecution.getJobExecution().getExecutionContext().put("rollbackCancerStudyState", true); + } + else { + // log the sample list and sample list list record counts + LOG.info("Sample list records imported: " + sampleListDataCount); + LOG.info("Sample 
list list records imported: " + sampleListListDataCount); + } + + // log rollbacks and number of items skipped during current step execution + if (stepExecution.getRollbackCount() > 0) { + LOG.info("Rollbacks during " + stepName + ": " + stepExecution.getRollbackCount()); + } + if (stepExecution.getSkipCount() > 0) { + LOG.info("Items skipped " + stepName + ": " + stepExecution.getSkipCount()); + } + + return ExitStatus.COMPLETED; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/ClinicalDataListener.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/ClinicalDataListener.java new file mode 100644 index 0000000..6ef5fcd --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/ClinicalDataListener.java @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.cbio.portal.pipelines.importer.config.listener; + +import java.util.*; +import org.apache.commons.collections.map.MultiKeyMap; +import org.apache.commons.logging.*; +import org.springframework.batch.core.*; + +/** + * + * @author ochoaa + */ +public class ClinicalDataListener implements StepExecutionListener { + + private static final Log LOG = LogFactory.getLog(ClinicalDataListener.class); + + @Override + public void beforeStep(StepExecution stepExecution) { + // add cancer study, datafile list, and clinical file metadata to step execution context + stepExecution.getExecutionContext().put("cancerStudy", stepExecution.getJobExecution().getExecutionContext().get("cancerStudy")); + stepExecution.getExecutionContext().put("clinicalMetadata", stepExecution.getJobExecution().getExecutionContext().get("clinicalMetadata")); + stepExecution.getExecutionContext().put("dataFileList", stepExecution.getJobExecution().getExecutionContext().get("dataFileList")); + } + + @Override + public ExitStatus afterStep(StepExecution stepExecution) { + MultiKeyMap datatypeMetadata = (MultiKeyMap) stepExecution.getJobExecution().getExecutionContext().get("datatypeMetadata"); + String datatype = (String) stepExecution.getJobExecution().getExecutionContext().get("currentDatatype"); + String stepName = stepExecution.getStepName(); + + // log new attributes count + int newClinicalAttributes = stepExecution.getJobExecution().getExecutionContext().getInt("newClinicalAttributes", 0); + if (newClinicalAttributes > 0) { + LOG.info("New clinical attributes imported: " + newClinicalAttributes); + } + + // 
get counts for patient, sample, and clinical data records inserted + int patientCount = (int) stepExecution.getExecutionContext().get("patientCount"); + int sampleCount = (int) stepExecution.getExecutionContext().get("sampleCount"); + int patientDataCount = (int) stepExecution.getExecutionContext().get("patientDataCount"); + int sampleDataCount = (int) stepExecution.getExecutionContext().get("sampleDataCount"); + + int totalCount = Arrays.stream(new int[]{patientCount, sampleCount, patientDataCount, sampleDataCount}).sum(); + if (totalCount == 0) { + LOG.error("No records were imported for datatype: " + datatype); + // change rollback state for cancer study to true to return to saved state + stepExecution.getJobExecution().getExecutionContext().put("rollbackCancerStudyState", true); + } + else { + // log the record counts + LOG.info("Patient records imported: " + patientCount); + LOG.info("Patient clinical data records imported: " + patientDataCount); + LOG.info("Sample records imported: " + sampleCount); + LOG.info("Sample clinical data records imported: " + sampleDataCount); + + // update the case list for datatype metadata add update execution context + datatypeMetadata.put(datatype, "caseList", stepExecution.getExecutionContext().get("caseList")); + stepExecution.getJobExecution().getExecutionContext().put("datatypeMetadata", datatypeMetadata); + } + + // log rollbacks and number of items skipped during current step execution + if (stepExecution.getRollbackCount() > 0) { + LOG.info("Rollbacks during " + stepName + ": " + stepExecution.getRollbackCount()); + } + if (stepExecution.getSkipCount() > 0) { + LOG.info("Items skipped " + stepName + ": " + stepExecution.getSkipCount()); + } + + return ExitStatus.COMPLETED; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/CopyNumberSegmentDataListener.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/CopyNumberSegmentDataListener.java new file 
mode 100644 index 0000000..0c9dde5 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/CopyNumberSegmentDataListener.java @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +package org.cbio.portal.pipelines.importer.config.listener; + +import java.util.*; +import org.apache.commons.collections.map.MultiKeyMap; +import org.apache.commons.logging.*; +import org.springframework.batch.core.*; + +/** + * + * @author ochoaa + */ +public class CopyNumberSegmentDataListener implements StepExecutionListener { + + private static final Log LOG = LogFactory.getLog(CopyNumberSegmentDataListener.class); + + @Override + public void beforeStep(StepExecution stepExecution) { + // add cancer study, datafile list, and seg file metadata to step execution context + stepExecution.getExecutionContext().put("cancerStudy", stepExecution.getJobExecution().getExecutionContext().get("cancerStudy")); + stepExecution.getExecutionContext().put("dataFile", stepExecution.getJobExecution().getExecutionContext().get("dataFile")); + stepExecution.getExecutionContext().put("copyNumberSegmentMetadata", stepExecution.getJobExecution().getExecutionContext().get("copyNumberSegmentMetadata")); + } + + @Override + public ExitStatus afterStep(StepExecution stepExecution) { + MultiKeyMap datatypeMetadata = (MultiKeyMap) stepExecution.getJobExecution().getExecutionContext().get("datatypeMetadata"); + String datatype = (String) stepExecution.getJobExecution().getExecutionContext().get("currentDatatype"); + String stepName = stepExecution.getStepName(); + + // get number of samples skipped, entries skipped, and total records imported + int samplesSkipped = (int) stepExecution.getExecutionContext().get("samplesSkipped"); + int entriesSkipped = stepExecution.getFilterCount(); // filter count = # records returned as null from processor + int copyNumberSegmentDataCount = (int) stepExecution.getExecutionContext().get("copyNumberSegmentDataCount"); + + if (copyNumberSegmentDataCount == 0) { + LOG.error("Error importing copy number segment data for datatype: " + datatype); + // change rollback state for cancer study to true to return to saved state + 
stepExecution.getJobExecution().getExecutionContext().put("rollbackCancerStudyState", true); + } + else { + // get case list + Set caseList = (Set) stepExecution.getExecutionContext().get("caseList"); + + // log counts during data loading and importing + LOG.info("Total samples loaded for datatype: " + caseList.size()); + LOG.info("Normal samples skipped during data loading: " + samplesSkipped); + LOG.info("Total entries skipped during data loading: " + entriesSkipped); + LOG.info("Total records imported into COPY_NUMBER_SEG: " + copyNumberSegmentDataCount); + + // update the case list for datatype metadata add update execution context + datatypeMetadata.put(datatype, "caseList", caseList); + stepExecution.getJobExecution().getExecutionContext().put("datatypeMetadata", datatypeMetadata); + } + + // log rollbacks and number of items skipped during current step execution + if (stepExecution.getRollbackCount() > 0) { + LOG.info("Rollbacks during " + stepName + ": " + stepExecution.getRollbackCount()); + } + if (stepExecution.getSkipCount() > 0) { + LOG.info("Items skipped " + stepName + ": " + stepExecution.getSkipCount()); + } + + return ExitStatus.COMPLETED; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/GisticDataListener.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/GisticDataListener.java new file mode 100644 index 0000000..2236a92 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/GisticDataListener.java @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. 
The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +package org.cbio.portal.pipelines.importer.config.listener; + +import org.apache.commons.logging.*; +import org.springframework.batch.core.*; + +/** + * + * @author ochoaa + */ +public class GisticDataListener implements StepExecutionListener { + + private static final Log LOG = LogFactory.getLog(GisticDataListener.class); + + @Override + public void beforeStep(StepExecution stepExecution) { + // add cancer study, datafile, and gistic file metadata from tasklet to step execution context + stepExecution.getExecutionContext().put("cancerStudy", stepExecution.getJobExecution().getExecutionContext().get("cancerStudy")); + stepExecution.getExecutionContext().put("dataFile", stepExecution.getJobExecution().getExecutionContext().get("dataFile")); + stepExecution.getExecutionContext().put("gisticMetadata", stepExecution.getJobExecution().getExecutionContext().get("gisticMetadata")); + } + + @Override + public ExitStatus afterStep(StepExecution stepExecution) { + String stepName = stepExecution.getStepName(); + + // get number of entries skipped, total genes loaded, and total records imported + int totalGenes = (int) stepExecution.getExecutionContext().get("totalGeneCount"); + int entriesSkipped = stepExecution.getFilterCount(); // filter count = # records returned as null from processor + int gisticDataCount = (int) stepExecution.getExecutionContext().get("gisticDataCount"); + int gisticGeneDataCount = (int) stepExecution.getExecutionContext().get("gisticGeneDataCount"); + + if ((gisticDataCount + gisticGeneDataCount) == 0) { + LOG.error("No records were imported into GISTIC or GISTIC_TO_GENE"); + // change rollback state for cancer study to true to return to saved state + stepExecution.getJobExecution().getExecutionContext().put("rollbackCancerStudyState", true); + } + else { + // log counts during data loading and importing + LOG.info("Total genes loaded for datatype: " + totalGenes); + LOG.info("Entries skipped during data loading: " + entriesSkipped); + 
LOG.info("Total records imported into GISTIC: " + gisticDataCount); + LOG.info("Total records imported into GISTIC_TO_GENE: " + gisticGeneDataCount); + } + + // log rollbacks and number of items skipped during current step execution + if (stepExecution.getRollbackCount() > 0) { + LOG.info("Rollbacks during " + stepName + ": " + stepExecution.getRollbackCount()); + } + if (stepExecution.getSkipCount() > 0) { + LOG.info("Items skipped " + stepName + ": " + stepExecution.getSkipCount()); + } + + return ExitStatus.COMPLETED; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/MutSigDataListener.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/MutSigDataListener.java new file mode 100644 index 0000000..59cd390 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/MutSigDataListener.java @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. 
+ * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.cbio.portal.pipelines.importer.config.listener; + +import org.apache.commons.logging.*; +import org.springframework.batch.core.*; + +/** + * + * @author ochoaa + */ +public class MutSigDataListener implements StepExecutionListener { + + private static final Log LOG = LogFactory.getLog(MutSigDataListener.class); + + @Override + public void beforeStep(StepExecution stepExecution) { + // add cancer study, datafile, and mutsig file metadata from tasklet to step execution context + stepExecution.getExecutionContext().put("cancerStudy", stepExecution.getJobExecution().getExecutionContext().get("cancerStudy")); + stepExecution.getExecutionContext().put("dataFile", stepExecution.getJobExecution().getExecutionContext().get("dataFile")); + stepExecution.getExecutionContext().put("mutSigMetadata", stepExecution.getJobExecution().getExecutionContext().get("mutSigMetadata")); + } + + @Override + public ExitStatus afterStep(StepExecution stepExecution) { + String stepName = stepExecution.getStepName(); + + // get total gene count, high q-value rejects, number of entries skipped, and total records imported + int totalGenes = (int) stepExecution.getExecutionContext().get("totalGeneCount"); + int qValueRejects = (int) stepExecution.getExecutionContext().get("qValueRejects"); + int entriesSkipped = stepExecution.getFilterCount(); // filter count = # records returned 
as null from processor + int mutSigDataCount = (int) stepExecution.getExecutionContext().get("mutSigDataCount"); + + if (mutSigDataCount == 0) { + LOG.error("No records were imported into MUT_SIG - " + + (totalGenes==0?"data could not be loaded from file":("expected number of records: " + totalGenes))); + // change rollback state for cancer study to true to return to saved state + stepExecution.getJobExecution().getExecutionContext().put("rollbackCancerStudyState", true); + } + else { + // log counts during data loading and importing + LOG.info("Total genes loaded from mutsig file: " + totalGenes); + LOG.info("Total records skipped because of q-value >= 0.1 during data loading: " + qValueRejects); + LOG.info("Total entries skipped during data loading: " + entriesSkipped); + LOG.info("Total records imported into MUT_SIG: " + mutSigDataCount); + } + + // log rollbacks and number of items skipped during current step execution + if (stepExecution.getRollbackCount() > 0) { + LOG.info("Rollbacks during " + stepName + ": " + stepExecution.getRollbackCount()); + } + if (stepExecution.getSkipCount() > 0) { + LOG.info("Items skipped " + stepName + ": " + stepExecution.getSkipCount()); + } + + return ExitStatus.COMPLETED; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/MutationDataListener.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/MutationDataListener.java new file mode 100644 index 0000000..23bdde6 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/MutationDataListener.java @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. 
The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+*/ + +package org.cbio.portal.pipelines.importer.config.listener; + +import org.mskcc.cbio.model.GeneticProfile; +import org.mskcc.cbio.persistence.jdbc.MutationJdbcDaoImpl; +import org.cbio.portal.pipelines.importer.util.MutationFilter; + +import java.util.*; +import org.apache.commons.collections.map.MultiKeyMap; +import org.apache.commons.logging.*; +import org.springframework.batch.core.*; +import org.springframework.beans.factory.annotation.Autowired; + +/** + * + * @author ochoaa + */ +public class MutationDataListener implements StepExecutionListener { + + @Autowired + MutationJdbcDaoImpl mutationJdbcDaoImpl; + + private static final Log LOG = LogFactory.getLog(MutationDataListener.class); + + @Override + public void beforeStep(StepExecution stepExecution) { + // add genetic profile, gene panel id, data file list, and mutation file metadata to execution context + stepExecution.getExecutionContext().put("geneticProfile", stepExecution.getJobExecution().getExecutionContext().get("geneticProfile")); + stepExecution.getExecutionContext().put("genePanelId", stepExecution.getJobExecution().getExecutionContext().get("genePanelId")); + stepExecution.getExecutionContext().put("dataFileList", stepExecution.getJobExecution().getExecutionContext().get("dataFileList")); + stepExecution.getExecutionContext().put("mutationFileMetadata", stepExecution.getJobExecution().getExecutionContext().get("mutationFileMetadata")); + } + + @Override + public ExitStatus afterStep(StepExecution stepExecution) { + GeneticProfile geneticProfile = (GeneticProfile) stepExecution.getJobExecution().getExecutionContext().get("geneticProfile"); + MultiKeyMap datatypeMetadata = (MultiKeyMap) stepExecution.getJobExecution().getExecutionContext().get("datatypeMetadata"); + String datatype = (String) stepExecution.getJobExecution().getExecutionContext().get("currentDatatype"); + String stepName = stepExecution.getStepName(); + + // get counts for total samples and entries skipped for datatype + int 
samplesSkipped = (int) stepExecution.getExecutionContext().get("samplesSkipped"); + int entriesSkipped = stepExecution.getFilterCount(); // filter count = # records returned as null from processor + + // get counts of total mutation records imported for datatype + int mutationDataCount = (int) stepExecution.getExecutionContext().get("mutationDataCount"); + int mutationEventDataCount = (int) stepExecution.getExecutionContext().get("mutationEventDataCount"); + + // check total count for records imported before calculating statistics + if ((mutationDataCount + mutationEventDataCount) == 0) { + LOG.error("Error importing mutation data for datatype: " + datatype); + // change rollback state for cancer study to true to return to saved state + stepExecution.getJobExecution().getExecutionContext().put("rollbackCancerStudyState", true); + } + else { + // if datatype is mutation datatype then print summary statistics for mutation filter + // and calculate mutation count for every sample + MutationFilter mutationFilter = (MutationFilter) stepExecution.getExecutionContext().get("mutationFilter"); + if (datatype.contains("mutation")) { + LOG.info("Calculating mutation count for every sample for genetic profile: " + geneticProfile.getStableId()); + mutationJdbcDaoImpl.calculateMutationCount(geneticProfile.getGeneticProfileId()); + mutationFilter.printSummaryStatistics(); + } + + // get case list and total genes count + Set caseList = (Set) stepExecution.getExecutionContext().get("caseList"); + int totalGenes = (int) stepExecution.getExecutionContext().get("totalGeneCount"); + + // log counts during data loading and import + LOG.info("Total samples loaded for datatype: " + caseList.size()); + LOG.info("Total genes loaded for datatype: " + totalGenes); + LOG.info("Samples skipped during data loading: " + samplesSkipped); + LOG.info("Entries skipped during data loading: " + entriesSkipped); + LOG.info("Total records imported into MUTATION: " + mutationDataCount); + LOG.info("Total 
records imported into MUTATION_EVENT: " + mutationEventDataCount); + + + + // update the case list for datatype metadata and update execution context + datatypeMetadata.put(datatype, "caseList", caseList); + stepExecution.getJobExecution().getExecutionContext().put("datatypeMetadata", datatypeMetadata); + } + + // log rollbacks and number of items skipped during current step execution + if (stepExecution.getRollbackCount() > 0) { + LOG.info("Rollbacks during " + stepName + ": " + stepExecution.getRollbackCount()); + } + if (stepExecution.getSkipCount() > 0) { + LOG.info("Items skipped " + stepName + ": " + stepExecution.getSkipCount()); + } + + return ExitStatus.COMPLETED; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/ProfileDataListener.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/ProfileDataListener.java new file mode 100644 index 0000000..c4f5eb2 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/ProfileDataListener.java @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. 
+ * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.cbio.portal.pipelines.importer.config.listener; + +import java.util.*; +import org.apache.commons.collections.map.MultiKeyMap; +import org.apache.commons.logging.*; +import org.springframework.batch.core.*; + +/** + * + * @author ochoaa + */ +public class ProfileDataListener implements StepExecutionListener { + + private String datatype; + private boolean isRppaProfile; + private boolean isCnaData; + + private static final Log LOG = LogFactory.getLog(ProfileDataListener.class); + + @Override + public void beforeStep(StepExecution stepExecution) { + this.datatype = (String) stepExecution.getJobExecution().getExecutionContext().get("currentDatatype"); + + // add genetic profile and data file list to execution context + stepExecution.getExecutionContext().put("geneticProfile", stepExecution.getJobExecution().getExecutionContext().get("geneticProfile")); + stepExecution.getExecutionContext().put("dataFileList", stepExecution.getJobExecution().getExecutionContext().get("dataFileList")); + stepExecution.getExecutionContext().put("profileMetadata", stepExecution.getJobExecution().getExecutionContext().get("profileMetadata")); + + // set booleans for determing if datatype is RPPA profile or CNA data + this.isRppaProfile = datatype.startsWith("rppa"); + this.isCnaData = datatype.contains("cna"); + stepExecution.getExecutionContext().put("isRppaProfile", 
isRppaProfile); + stepExecution.getExecutionContext().put("isCnaData", isCnaData); + } + + @Override + public ExitStatus afterStep(StepExecution stepExecution) { + MultiKeyMap datatypeMetadata = (MultiKeyMap) stepExecution.getJobExecution().getExecutionContext().get("datatypeMetadata"); + String stepName = stepExecution.getStepName(); + + // get counts for total samples and genes loaded for datatype + int totalGenes = (int) stepExecution.getExecutionContext().get("totalGeneCount"); + int totalGeneticAlterationCount = (int) stepExecution.getExecutionContext().get("geneticAlterationCount"); + + // get counts for total samples and entries skipped for datatype + int samplesSkipped = (int) stepExecution.getExecutionContext().get("samplesSkipped"); + int additionalEntriesSkipped = (int) stepExecution.getExecutionContext().get("additionalEntriesSkipped"); + int entriesSkipped = additionalEntriesSkipped + stepExecution.getFilterCount(); // filter count = # records returned as null from processor + + // get RPPA profile status and counts of extra records + boolean isRppaProfile = (boolean) stepExecution.getExecutionContext().get("isRppaProfile"); + int validExtraRecords = (int) stepExecution.getExecutionContext().get("validExtraRecords"); + int skippedExtraRecords = (int) stepExecution.getExecutionContext().get("skippedExtraRecords"); + + if (totalGeneticAlterationCount == 0) { + LOG.error("Error importing profile data for datatype: " + datatype); + // change rollback state for cancer study to true to return to saved state + stepExecution.getJobExecution().getExecutionContext().put("rollbackCancerStudyState", true); + } + else { + // get case list + Set caseList = (Set) stepExecution.getExecutionContext().get("caseList"); + + // log counts during data loading and import + LOG.info("Total samples loaded for datatype: " + caseList.size()); + LOG.info("Total genes loaded for datatype: " + totalGenes); + LOG.info("Normal samples skipped during data loading: " + samplesSkipped); 
+ LOG.info("Total entries skipped during data loading: " + entriesSkipped); + + LOG.info("Total records imported into GENETIC_ALTERATION: " + totalGeneticAlterationCount); + if (isCnaData) { + LOG.info("Total records imported into CNA_EVENT: " + stepExecution.getExecutionContext().get("cnaEventCount")); + LOG.info("Total records imported into SAMPLE_CNA_EVENT: " + stepExecution.getExecutionContext().get("sampleCnaEventCount")); + } + + // log counts of extra records loaded during data loading + if (isRppaProfile && validExtraRecords > 0) { + LOG.info("Total number of extra records added because of multiple genes in one line: " + validExtraRecords); + } + if (skippedExtraRecords > 0) { + LOG.info("Total number of extra records skipped because of ambiguous gene symbols: " + skippedExtraRecords); + } + + // update the case list for datatype metadata add update execution context + datatypeMetadata.put(datatype, "caseList", caseList); + stepExecution.getJobExecution().getExecutionContext().put("datatypeMetadata", datatypeMetadata); + } + + // log rollbacks and number of items skipped during current step execution + if (stepExecution.getRollbackCount() > 0) { + LOG.info("Rollbacks during " + stepName + ": " + stepExecution.getRollbackCount()); + } + if (stepExecution.getSkipCount() > 0) { + LOG.info("Items skipped " + stepName + ": " + stepExecution.getSkipCount()); + } + + return ExitStatus.COMPLETED; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/StructuralVariantDataListener.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/StructuralVariantDataListener.java new file mode 100644 index 0000000..f89c923 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/StructuralVariantDataListener.java @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. 
+ * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+*/ + +package org.cbio.portal.pipelines.importer.config.listener; + +import java.util.*; +import org.apache.commons.collections.map.MultiKeyMap; +import org.apache.commons.logging.*; +import org.springframework.batch.core.*; + +/** + * + * @author ochoaa + */ +public class StructuralVariantDataListener implements StepExecutionListener { + + private static final Log LOG = LogFactory.getLog(StructuralVariantDataListener.class); + + @Override + public void beforeStep(StepExecution stepExecution) { + // add cancer study, gene panel id, datafile, and structural variant file metadata from tasklet to step execution context + stepExecution.getExecutionContext().put("cancerStudy", stepExecution.getJobExecution().getExecutionContext().get("cancerStudy")); + stepExecution.getExecutionContext().put("geneticProfile", stepExecution.getJobExecution().getExecutionContext().get("geneticProfile")); + stepExecution.getExecutionContext().put("genePanelId", stepExecution.getJobExecution().getExecutionContext().get("genePanelId")); + stepExecution.getExecutionContext().put("dataFile", stepExecution.getJobExecution().getExecutionContext().get("dataFile")); + stepExecution.getExecutionContext().put("structuralVariantMetadata", stepExecution.getJobExecution().getExecutionContext().get("structuralVariantMetadata")); + } + + @Override + public ExitStatus afterStep(StepExecution stepExecution) { + MultiKeyMap datatypeMetadata = (MultiKeyMap) stepExecution.getJobExecution().getExecutionContext().get("datatypeMetadata"); + String datatype = (String) stepExecution.getJobExecution().getExecutionContext().get("currentDatatype"); + String stepName = stepExecution.getStepName(); + + // get number of samples and entries skipped, and total records imported + int samplesSkipped = (int) stepExecution.getExecutionContext().get("samplesSkipped"); + int entriesSkipped = stepExecution.getFilterCount(); // filter count = # records returned as null from processor + int structuralVariantDataCount = (int) 
stepExecution.getExecutionContext().get("structuralVariantDataCount"); + + if (structuralVariantDataCount == 0) { + LOG.error("Error importing profile data for datatype: " + datatype); + // change rollback state for cancer study to true to return to saved state + stepExecution.getJobExecution().getExecutionContext().put("rollbackCancerStudyState", true); + } + else { + // get case list + Set caseList = (Set) stepExecution.getExecutionContext().get("caseList"); + + // log counts during data loading and importing + LOG.info("Total samples loaded for datatype: " + caseList.size()); + LOG.info("Total samples skipped during data loading: " + samplesSkipped); + LOG.info("Total entries skipped during data loading: " + entriesSkipped); + LOG.info("Total records imported into STRUCTURAL_VARIANT: " + structuralVariantDataCount); + + // update the case list for datatype metadata add update execution context + datatypeMetadata.put(datatype, "caseList", caseList); + stepExecution.getJobExecution().getExecutionContext().put("datatypeMetadata", datatypeMetadata); + } + + // log rollbacks and number of items skipped during current step execution + if (stepExecution.getRollbackCount() > 0) { + LOG.info("Rollbacks during " + stepName + ": " + stepExecution.getRollbackCount()); + } + if (stepExecution.getSkipCount() > 0) { + LOG.info("Items skipped " + stepName + ": " + stepExecution.getSkipCount()); + } + + return ExitStatus.COMPLETED; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/TimelineDataListener.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/TimelineDataListener.java new file mode 100644 index 0000000..dcd7379 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/listener/TimelineDataListener.java @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. 
+ * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+*/ + +package org.cbio.portal.pipelines.importer.config.listener; + +import org.apache.commons.logging.*; +import org.springframework.batch.core.*; + +/** + * + * @author ochoaa + */ +public class TimelineDataListener implements StepExecutionListener { + + private static final Log LOG = LogFactory.getLog(TimelineDataListener.class); + + @Override + public void beforeStep(StepExecution stepExecution) { + // add cancer study, data file list, and timeline file metadata from tasklet to step execution context + stepExecution.getExecutionContext().put("cancerStudy", stepExecution.getJobExecution().getExecutionContext().get("cancerStudy")); + stepExecution.getExecutionContext().put("dataFileList", stepExecution.getJobExecution().getExecutionContext().get("dataFileList")); + stepExecution.getExecutionContext().put("timelineMetadata", stepExecution.getJobExecution().getExecutionContext().get("timelineMetadata")); + } + + @Override + public ExitStatus afterStep(StepExecution stepExecution) { + String stepName = stepExecution.getStepName(); + + // get number of patients skipped and total records imported + int patientsSkipped = (int) stepExecution.getExecutionContext().get("patientsSkipped"); + int clinicalEventCount = (int) stepExecution.getExecutionContext().get("clinicalEventCount"); + int clinicalEventDataCount = (int) stepExecution.getExecutionContext().get("clinicalEventDataCount"); + + if ((clinicalEventCount + clinicalEventDataCount) == 0) { + LOG.error("No records were imported into CLINICAL_EVENT or CLINICAL_EVENT_DATA"); + // change rollback state for cancer study to true to return to saved state + stepExecution.getJobExecution().getExecutionContext().put("rollbackCancerStudyState", true); + } + else { + // log counts during data loading and importing + LOG.info("Total patients skipped during data loading: " + patientsSkipped); + LOG.info("Total records imported into CLINICAL_EVENT: " + clinicalEventCount); + LOG.info("Total records imported into 
CLINICAL_EVENT_DATA: " + clinicalEventDataCount); + } + + // log rollbacks and number of items skipped during current step execution + if (stepExecution.getRollbackCount() > 0) { + LOG.info("Rollbacks during " + stepName + ": " + stepExecution.getRollbackCount()); + } + if (stepExecution.getSkipCount() > 0) { + LOG.info("Items skipped " + stepName + ": " + stepExecution.getSkipCount()); + } + + return ExitStatus.COMPLETED; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/ClinicalDataProcessor.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/ClinicalDataProcessor.java new file mode 100644 index 0000000..43ce35f --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/ClinicalDataProcessor.java @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.cbio.portal.pipelines.importer.config.processor; + +import org.mskcc.cbio.model.*; +import org.mskcc.cbio.persistence.jdbc.*; +import org.cbio.portal.pipelines.importer.config.composite.CompositeClinicalData; + +import java.util.*; +import org.springframework.batch.item.ItemProcessor; +import org.springframework.beans.factory.annotation.Autowired; + +/** + * Processor for composite clinical data + * @author ochoaa + */ +public class ClinicalDataProcessor implements ItemProcessor { + + @Autowired + PatientJdbcDaoImpl patientJdbcDaoImpl; + + @Autowired + SampleJdbcDaoImpl sampleJdbcDaoImpl; + + @Override + public CompositeClinicalData process(CompositeClinicalData composite) throws Exception { + // update patient and sample internal ids if possible + Patient existingPatient = patientJdbcDaoImpl.getPatient(composite.getPatient().getStableId(), composite.getPatient().getCancerStudyId()); + if (existingPatient != null) { + composite.getPatient().setInternalId(existingPatient.getInternalId()); + + // if patient exists then sample should also exist by patient internal id + Sample existingSample = sampleJdbcDaoImpl.getSampleByPatient(composite.getSample().getStableId(), composite.getPatient().getInternalId()); + if (existingSample != null) { + composite.getSample().setInternalId(existingSample.getInternalId()); + } + } + + // sort filtered clinical data into patient and sample clinical data + Map patientClinicalData = new HashMap<>(); + Map sampleClinicalData = new HashMap<>(); + Map filteredClinicalData = composite.getCompositeClinicalDataMap(); + for (ClinicalAttribute attr : 
filteredClinicalData.keySet()) { + // if attribute is patient attribute then add to patient clinical data list + // otherwise add to sample clinical data list + if (attr.getPatientAttribute()) { + PatientClinicalData newClinicalDatum = new PatientClinicalData(); + newClinicalDatum.setInternalId(composite.getPatient().getInternalId()); + newClinicalDatum.setAttrId(attr.getAttrId()); + newClinicalDatum.setAttrValue(filteredClinicalData.get(attr)); + patientClinicalData.put(attr.getAttrId(), newClinicalDatum); + } + else { + SampleClinicalData newClinicalDatum = new SampleClinicalData(); + newClinicalDatum.setInternalId(composite.getSample().getInternalId()); + newClinicalDatum.setAttrId(attr.getAttrId()); + newClinicalDatum.setAttrValue(filteredClinicalData.get(attr)); + sampleClinicalData.put(attr.getAttrId(), newClinicalDatum); + } + } + // set patient and sample clinical data for composite object + composite.setPatientClinicalData(patientClinicalData); + composite.setSampleClinicalData(sampleClinicalData); + + return composite; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/CopyNumberSegmentDataProcessor.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/CopyNumberSegmentDataProcessor.java new file mode 100644 index 0000000..120840e --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/CopyNumberSegmentDataProcessor.java @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. 
In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+*/ + +package org.cbio.portal.pipelines.importer.config.processor; + +import org.mskcc.cbio.model.CopyNumberSegment; +import org.cbio.portal.pipelines.importer.util.*; +import org.cbio.portal.pipelines.importer.model.CopyNumberSegmentRecord; + +import com.google.common.base.Strings; +import org.apache.commons.logging.*; + +import org.springframework.batch.item.ItemProcessor; +import org.springframework.beans.factory.annotation.Autowired; + +/** + * + * @author ochoaa + */ +public class CopyNumberSegmentDataProcessor implements ItemProcessor { + + @Autowired + GeneDataUtils geneDataUtils; + + private static final Log LOG = LogFactory.getLog(CopyNumberSegmentDataProcessor.class); + + @Override + public CopyNumberSegment process(CopyNumberSegmentRecord copyNumberSegmentRecord) throws Exception { + CopyNumberSegment screenedRecord = screenCopyNumberSegmentRecord(copyNumberSegmentRecord); + + return screenedRecord; + } + + /** + * Performs basic data screening to determine whether copy number segment record is acceptable or not. + * Returns null if copy number segment record does not pass screening + * + * @param copyNumberSegmentRecord + * @return CopyNumberSegment + */ + private CopyNumberSegment screenCopyNumberSegmentRecord(CopyNumberSegmentRecord copyNumberSegmentRecord) { + // check normalized chromosome value + String normalizedChromosome = geneDataUtils.getNormalizedChromosome(copyNumberSegmentRecord.getChrom()); + if (Strings.isNullOrEmpty(normalizedChromosome)) { + LOG.warn("Skipping entry with chromosome value: " + copyNumberSegmentRecord.getChrom()); + return null; + } + + // check start and end locations + Integer locStart = !DataFileUtils.isNullOrEmptyValue(copyNumberSegmentRecord.getLocStart())? + Integer.valueOf(copyNumberSegmentRecord.getLocStart()):-1; + Integer locEnd = !DataFileUtils.isNullOrEmptyValue(copyNumberSegmentRecord.getLocEnd())? 
+ Integer.valueOf(copyNumberSegmentRecord.getLocEnd()):-1; + if (locStart >= locEnd) { + LOG.warn("Skipping entry with start location >= end location: " + locStart + " >= " + locEnd); + return null; + } + + // create new CopyNumberSegment instance with processed/resolved data from above + CopyNumberSegment copyNumberSegment = new CopyNumberSegment(); + copyNumberSegment.setChr(normalizedChromosome); + copyNumberSegment.setStart(locStart); + copyNumberSegment.setEnd(locEnd); + copyNumberSegment.setNumProbes(Integer.valueOf(copyNumberSegmentRecord.getNumProbes())); + copyNumberSegment.setSegmentMean(Double.valueOf(copyNumberSegmentRecord.getSegMean())); + + return copyNumberSegment; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/FusionDataProcessor.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/FusionDataProcessor.java new file mode 100644 index 0000000..a619195 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/FusionDataProcessor.java @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. 
+ * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.cbio.portal.pipelines.importer.config.processor; + +import org.mskcc.cbio.model.*; +import org.cbio.portal.pipelines.importer.model.*; +import org.cbio.portal.pipelines.importer.util.*; +import org.cbio.portal.pipelines.importer.config.composite.CompositeMutationData; + +import org.apache.commons.logging.*; +import org.springframework.batch.item.ItemProcessor; +import org.springframework.beans.factory.annotation.Autowired; + +/** + * + * @author ochoaa + */ +public class FusionDataProcessor implements ItemProcessor { + + @Autowired + GeneDataUtils geneDataUtils; + + private static final Log LOG = LogFactory.getLog(FusionDataProcessor.class); + + @Override + public CompositeMutationData process(FusionRecord fusionRecord) throws Exception { + // screen fusion record and return null if does not pass screening + FusionRecord screenedRecord = screenFusionRecord(fusionRecord); + if (screenedRecord == null) { + return null; + } + + // extract mutation and mutation event data from screened fusion record + Mutation mutation = transformFusionRecordToMutation(fusionRecord); + MutationEvent mutationEvent = transformFusionRecordToMutationEvent(fusionRecord); + mutation.setMutationEvent(mutationEvent); + + // create instance of composite mutation data + CompositeMutationData cmd = new CompositeMutationData(); + cmd.setMutation(mutation); + + return cmd; + } + + 
/** + * Transform a fusion record to a mutation instance. + * + * @param fusionRecord + * @return Mutation + */ + private Mutation transformFusionRecordToMutation(FusionRecord fusionRecord) { + Mutation mutation = new Mutation(); + mutation.setSampleId(fusionRecord.getSampleId()); + mutation.setGeneticProfileId(fusionRecord.getGeneticProfileId()); + mutation.setEntrezGeneId(Integer.valueOf(fusionRecord.getEntrezGeneId())); + mutation.setCenter(fusionRecord.getCenter()); + mutation.setSequenceSource("NA"); + + return mutation; + } + + /** + * Transform a fusion record to a mutation event instance. + * + * @param fusionRecord + * @return MutationEvent + */ + private MutationEvent transformFusionRecordToMutationEvent(FusionRecord fusionRecord) { + MutationEvent mutationEvent = new MutationEvent(); + mutationEvent.setEntrezGeneId(Integer.valueOf(fusionRecord.getEntrezGeneId())); + mutationEvent.setProteinChange(fusionRecord.getFusion()); + mutationEvent.setMutationType("Fusion"); + + return mutationEvent; + } + + /** + * Performs basic data screening to determine whether fusion record is acceptable or not. + * Returns null if fusion record does not pass screening + * + * @param fusionRecord + * @return FusionRecord + */ + private FusionRecord screenFusionRecord(FusionRecord fusionRecord) { + // check gene symbol and entrez gene id values + String hugoGeneSymbol = fusionRecord.getHugoSymbol(); + Integer entrezGeneId = !DataFileUtils.isNullOrEmptyValue(fusionRecord.getEntrezGeneId())? 
+ Integer.valueOf(fusionRecord.getEntrezGeneId()):null; + if ((DataFileUtils.isNullOrEmptyValue(hugoGeneSymbol) || hugoGeneSymbol.equalsIgnoreCase("unknown")) + && (entrezGeneId == null || entrezGeneId <= 0)) { + LOG.warn("Skipping entry with invalid (Entrez_Gene_Id,Hugo_Symbol): " + + "(" + entrezGeneId + "," + hugoGeneSymbol + ")"); + return null; + } + + // check if gene can be resolved from fusion record first + Gene gene = geneDataUtils.resolveGeneFromRecordData(entrezGeneId, hugoGeneSymbol, null); + if (gene == null) { + LOG.warn("Could not resolve gene from (Entrez_Gene_Id,Hugo_Symbol): " + + "(" + entrezGeneId + "," + hugoGeneSymbol + ")"); + return null; + } + // update entrez gene id, hugo gene symbol, and chromosome (if necessary) with resolved gene data + entrezGeneId = gene.getEntrezGeneId(); + hugoGeneSymbol = gene.getHugoGeneSymbol(); + + // update fusion record with any resolved values from data checks above + fusionRecord.setEntrezGeneId(String.valueOf(entrezGeneId)); + fusionRecord.setHugoSymbol(hugoGeneSymbol); + + return fusionRecord; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/GisticDataProcessor.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/GisticDataProcessor.java new file mode 100644 index 0000000..78c9e25 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/GisticDataProcessor.java @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. 
In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.cbio.portal.pipelines.importer.config.processor; + + +import java.util.*; +import com.google.common.base.Strings; +import org.apache.commons.logging.*; +import org.cbio.portal.pipelines.importer.model.GisticRecord; +import org.cbio.portal.pipelines.importer.util.GeneDataUtils; +import org.mskcc.cbio.model.*; + +import org.springframework.batch.item.ItemProcessor; +import org.springframework.beans.factory.annotation.Autowired; + +/** + * + * @author ochoaa + */ +public class GisticDataProcessor implements ItemProcessor { + + @Autowired + GeneDataUtils geneDataUtils; + + private static final Log LOG = LogFactory.getLog(GisticDataProcessor.class); + + @Override + public Gistic process(GisticRecord gisticRecord) throws Exception { + Gistic screenedRecord = screenGisticRecord(gisticRecord); + + return screenedRecord; + } + + /** + * Performs basic data screening to determine whether gistic record is acceptable or not. 
+ * Returns null if gistic record does not pass screening + * + * @param gisticRecord + * @return Gistic + */ + private Gistic screenGisticRecord(GisticRecord gisticRecord) { + // check that normalized chromosome is valid + String normalizedChromosome = geneDataUtils.getNormalizedChromosome(gisticRecord.getChromosome()); + if (Strings.isNullOrEmpty(normalizedChromosome)) { + LOG.warn("Skipping entry with chromosome value: " + gisticRecord.getChromosome()); + return null; + } + + // go through list of genes - ignore genes that have already been processed and ignore miRNA's + List gisticToGeneList = new ArrayList(); + Set genesAdded = new HashSet<>(); + String[] genesInRegion = gisticRecord.getGenesInRegion().replaceAll("\\[|\\]", "").split(","); + for (String geneSymbol : genesInRegion) { + geneSymbol = geneSymbol.split("\\|")[0]; + Gene gene = geneDataUtils.resolveGeneFromRecordData(geneSymbol, normalizedChromosome); + if (gene == null) { + LOG.warn("Skipping ambiguous gene: " + geneSymbol); + continue; + } + + // skip miRNA's + if (gene.getType().equals(geneDataUtils.MIRNA_TYPE) ) { + LOG.warn("Ignoring miRNA: " + gene.getHugoGeneSymbol()); + continue; + } + + // skip genes that already have data loaded for them + if (genesAdded.contains(gene.getEntrezGeneId())) { + LOG.warn("Skipping gene that has been given as alias in your file: " + gene.getHugoGeneSymbol()); + continue; + } + + // add gistic gene to list + genesAdded.add(gene.getEntrezGeneId()); + GisticToGene gisticGene = new GisticToGene(); + gisticGene.setEntrezGeneId(gene.getEntrezGeneId()); + gisticToGeneList.add(gisticGene); + } + + // skip entry if genes could not be found in db + if (gisticToGeneList.isEmpty()) { + LOG.warn("Genes could not be found for entry - skipping gistic event"); + return null; + } + // create new Gistic instance with processed/resolved data from above + Gistic gistic = new Gistic(); + gistic.setCancerStudyId(gisticRecord.getCancerStudyId()); + 
gistic.setChromosome(Integer.valueOf(normalizedChromosome)); + gistic.setCytoband(gisticRecord.getCytoband()); + gistic.setWidePeakStart(Integer.valueOf(gisticRecord.getPeakStart())); + gistic.setWidePeakEnd(Integer.valueOf(gisticRecord.getPeakEnd())); + gistic.setqValue(Double.valueOf(gisticRecord.getqValue())); + gistic.setAmp(gisticRecord.getAmp().equals("1")); + gistic.setGenesInRegion(gisticToGeneList); + + return gistic; + } +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/MutSigDataProcessor.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/MutSigDataProcessor.java new file mode 100644 index 0000000..d0c0dfc --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/MutSigDataProcessor.java @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.cbio.portal.pipelines.importer.config.processor; + +import org.mskcc.cbio.model.*; +import org.cbio.portal.pipelines.importer.model.MutSigRecord; +import org.cbio.portal.pipelines.importer.util.GeneDataUtils; + +import org.apache.commons.logging.*; +import org.springframework.batch.item.ItemProcessor; +import org.springframework.beans.factory.annotation.Autowired; + +/** + * + * @author ochoaa + */ +public class MutSigDataProcessor implements ItemProcessor { + + @Autowired + GeneDataUtils geneDataUtils; + + private static final Log LOG = LogFactory.getLog(MutSigDataProcessor.class); + + @Override + public MutSig process(MutSigRecord mutSigRecord) throws Exception { + MutSig screenedRecord = screenMutSigRecord(mutSigRecord); + + return screenedRecord; + } + + /** + * Performs basic data screening to determine whether mutsig record is acceptable or not. 
+ * Returns null if mutsig record does not pass screening + * + * @param mutSigRecord + * @return MutSig + */ + private MutSig screenMutSigRecord(MutSigRecord mutSigRecord) { + // light-processing for p-value and q-value + Float pValue = Float.valueOf(mutSigRecord.getpValue().replace("<", "")); + Float qValue = Float.valueOf(mutSigRecord.getqValue().replace("<", "")); + + // check that hugo gene symbol value + String hugoGeneSymbol = mutSigRecord.getHugoSymbol(); + Gene gene = geneDataUtils.resolveGeneFromRecordData(hugoGeneSymbol, null); + if (gene == null) { + LOG.warn("Could not resolve gene from Hugo_Symbol: " + hugoGeneSymbol); + return null; + } + Integer entrezGeneId = gene.getEntrezGeneId(); + + // create new MutSig instance with processed/resolved data from above + MutSig mutSig = new MutSig(); + mutSig.setCancerStudyId(mutSigRecord.getCancerStudyId()); + mutSig.setEntrezGeneId(entrezGeneId); + mutSig.setRank(Integer.valueOf(mutSigRecord.getRank())); + mutSig.setNumBasesCovered(Integer.valueOf(mutSigRecord.getNumBasesCovered())); + mutSig.setNumMutations(Integer.valueOf(mutSigRecord.getNumMutations())); + mutSig.setPValue(pValue); + mutSig.setQValue(qValue); + + return mutSig; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/MutationDataProcessor.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/MutationDataProcessor.java new file mode 100644 index 0000000..b1d1afc --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/MutationDataProcessor.java @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. 
The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +package org.cbio.portal.pipelines.importer.config.processor; + +import org.mskcc.cbio.model.*; +import org.mskcc.cbio.persistence.jdbc.*; +import org.cbio.portal.pipelines.importer.model.*; +import org.cbio.portal.pipelines.importer.util.*; +import org.cbio.portal.pipelines.importer.config.composite.CompositeMutationData; + +import com.google.common.base.Strings; +import org.apache.commons.logging.*; +import org.springframework.batch.item.ItemProcessor; +import org.springframework.beans.factory.annotation.Autowired; + +/** + * + * @author ochoaa + */ +public class MutationDataProcessor implements ItemProcessor { + + @Autowired + GeneDataUtils geneDataUtils; + + @Autowired + UniProtIdMappingJdbcDaoImpl uniProtIdMappingJdbcDaoImpl; + + private static final Log LOG = LogFactory.getLog(MutationDataProcessor.class); + + @Override + public CompositeMutationData process(MafRecord mafRecord) throws Exception { + // screen maf record and return null of record does not pass screening + MafRecord screenedRecord = screenMafRecord(mafRecord); + if (screenedRecord == null) { + return null; + } + + // process the maf record and extract mutation and mutation event data from processed record + MafRecord processedRecord = processMafRecordData(screenedRecord); + Mutation mutation = transformMafRecordToMutation(processedRecord); + MutationEvent mutationEvent = transformMafRecordToMutationEvent(processedRecord); + mutation.setMutationEvent(mutationEvent); + + // create instance of composite mutation data + CompositeMutationData cmd = new CompositeMutationData(); + cmd.setMutation(mutation); + + return cmd; + } + + /** + * Transform a MAF record to a mutation instance. 
+ * + * @param mafRecord + * @return Mutation + */ + private Mutation transformMafRecordToMutation(MafRecord mafRecord) { + Mutation mutation = new Mutation(); + mutation.setSampleId(mafRecord.getSampleId()); + mutation.setGeneticProfileId(mafRecord.getGeneticProfileId()); + mutation.setEntrezGeneId(Integer.valueOf(mafRecord.getEntrezGeneId())); + mutation.setCenter(mafRecord.getCenter()); + mutation.setSequencer(mafRecord.getSequencer()); + mutation.setMutationStatus(mafRecord.getMutationStatus()); + mutation.setValidationStatus(mafRecord.getValidationStatus()); + mutation.setTumorSeqAllele1(mafRecord.getTumorSeqAllele1()); + mutation.setTumorSeqAllele2(mafRecord.getTumorSeqAllele2()); + mutation.setMatchedNormSampleBarcode(mafRecord.getMatchedNormSampleBarcode()); + mutation.setMatchNormSeqAllele1(mafRecord.getMatchNormSeqAllele1()); + mutation.setMatchNormSeqAllele2(mafRecord.getMatchNormSeqAllele2()); + mutation.setTumorValidationAllele1(mafRecord.getTumorValidationAllele1()); + mutation.setMatchNormValidationAllele1(mafRecord.getMatchNormValidationAllele1()); + mutation.setMatchNormValidationAllele2(mafRecord.getMatchNormValidationAllele2()); + mutation.setVerificationStatus(mafRecord.getVerificationStatus()); + mutation.setSequencingPhase(mafRecord.getSequencingPhase()); + mutation.setSequenceSource(mafRecord.getSequenceSource()); + mutation.setValidationMethod(mafRecord.getValidationMethod()); + mutation.setScore(mafRecord.getScore()); + mutation.setBamFile(mafRecord.getBamFile()); + mutation.setTumorAltCount(Integer.valueOf(mafRecord.gettAltCount())); + mutation.setTumorRefCount(Integer.valueOf(mafRecord.gettRefCount())); + mutation.setNormalAltCount(Integer.valueOf(mafRecord.getnAltCount())); + mutation.setNormalRefCount(Integer.valueOf(mafRecord.getnRefCount())); + + return mutation; + } + + /** + * Transform a MAF record to a mutation event instance. 
+ * + * @param mafRecord + * @return MutationEvent + */ + private MutationEvent transformMafRecordToMutationEvent(MafRecord mafRecord) { + MutationEvent mutationEvent = new MutationEvent(); + mutationEvent.setEntrezGeneId(Integer.valueOf(mafRecord.getEntrezGeneId())); + mutationEvent.setChr(mafRecord.getChromosome()); + mutationEvent.setStartPosition(Long.valueOf(mafRecord.getStartPosition())); + mutationEvent.setEndPosition(Long.valueOf(mafRecord.getEndPosition())); + mutationEvent.setReferenceAllele(mafRecord.getReferenceAllele()); + mutationEvent.setTumorSeqAllele(mafRecord.getTumorSeqAllele1()); + mutationEvent.setProteinChange(mafRecord.getHgvspShort()); + mutationEvent.setMutationType(mafRecord.getVariantClassification()); + mutationEvent.setFunctionalImpactScore(mafRecord.getMaFimpact()); + mutationEvent.setFisValue(Float.valueOf(mafRecord.getMaFis())); + mutationEvent.setLinkXvar(mafRecord.getMaLinkVar()); + mutationEvent.setLinkPdb(mafRecord.getMaLinkPdb()); + mutationEvent.setLinkMsa(mafRecord.getMaLinkMsa()); + mutationEvent.setNcbiBuild(mafRecord.getNcbiBuild()); + mutationEvent.setStrand(mafRecord.getStrand()); + mutationEvent.setVariantType(mafRecord.getVariantType()); + mutationEvent.setDbSnpRs(mafRecord.getDbsnpRs()); + mutationEvent.setDbSnpValStatus(mafRecord.getDbsnpValStatus()); + mutationEvent.setOncotatorDbsnpRs(mafRecord.getOncotatorDbsnpRs()); + mutationEvent.setOncotatorRefseqMrnaId(mafRecord.getOncotatorRefseqMrnaId()); + mutationEvent.setOncotatorCodonChange(mafRecord.getOncotatorCodonChange()); + mutationEvent.setOncotatorUniprotEntryName(mafRecord.getOncotatorUniprotEntryName()); + mutationEvent.setOncotatorUniprotAccession(mafRecord.getOncotatorUniprotAccession()); + mutationEvent.setOncotatorProteinPosStart(Integer.valueOf(mafRecord.getOncotatorProteinPosStart())); + mutationEvent.setOncotatorProteinPosEnd(Integer.valueOf(mafRecord.getOncotatorProteinPosEnd())); + 
mutationEvent.setCanonicalTranscript(MutationDataUtils.CANONICAL_TRANSCRIPT); + + return mutationEvent; + } + + /** + * Process the maf record data. + * + * @param mafRecord + * @return MafRecord + */ + private MafRecord processMafRecordData(MafRecord mafRecord) { + // resolve reference and tumor seq allele + String referenceAllele = DataFileUtils.isNullOrEmptyValue(mafRecord.getReferenceAllele())?"-":mafRecord.getReferenceAllele(); + String tumorSeqAllele = mafRecord.getTumorSeqAllele1(); + if (!DataFileUtils.isNullOrEmptyValue(mafRecord.getTumorSeqAllele2())) { + tumorSeqAllele = MutationDataUtils.resolveTumorSeqAllele(referenceAllele, mafRecord.getTumorSeqAllele1(), mafRecord.getTumorSeqAllele2()); + } + + // resolve start and stop positions + Long startPosition = (Long.valueOf(mafRecord.getStartPosition()) < 0)?0:Long.valueOf(mafRecord.getStartPosition()); + Long endPosition = MutationDataUtils.calculateEndPosition(referenceAllele, tumorSeqAllele, startPosition); + + // resolve functional impact score if applicable - if column not in maf then spring automatically makes value null + String functionalImpactScore = !DataFileUtils.isNullOrEmptyValue(mafRecord.getMaFimpact())?MutationDataUtils.transformOmaScore(mafRecord.getMaFimpact()):null; + String linkXVar = (!Strings.isNullOrEmpty(mafRecord.getMaLinkVar()))?mafRecord.getMaLinkVar().replace("\"", ""):""; + String fisValue = !DataFileUtils.isNullOrEmptyValue(mafRecord.getMaFis())?mafRecord.getMaFis():String.valueOf(Float.MIN_VALUE); + + // resolve the protein change, protein start position, and protein end position + String proteinChange = MutationDataUtils.resolveProteinChange(mafRecord.getHgvspShort(), mafRecord.getAminoAcidChange(), mafRecord.getMaProteinChange()); + Integer proteinStartPosition = 0; + Integer proteinEndPosition = 0; + if (!DataFileUtils.isNullOrEmptyValue(mafRecord.getProteinPosition())) { + proteinStartPosition = MutationDataUtils.resolveProteinStartPosition(mafRecord.getProteinPosition(), 
proteinChange); + proteinEndPosition = MutationDataUtils.resolveProteinEndPosition(mafRecord.getProteinPosition(), proteinChange); + } + String uniProtAccession = uniProtIdMappingJdbcDaoImpl.mapUniProtIdToAccession(mafRecord.getSwissprot()); + + // resolve the tumor alt/ref counts and normal alt/ref counts + Integer tAltCount = MutationDataUtils.resolveTumorAltCount(mafRecord); + Integer tRefCount = MutationDataUtils.resolveTumorRefCount(mafRecord); + Integer nAltCount = MutationDataUtils.resolveNormalAltCount(mafRecord); + Integer nRefCount = MutationDataUtils.resolveNormalRefCount(mafRecord); + + // resolve sequence source value + String sequenceSource = DataFileUtils.isNullOrEmptyValue(mafRecord.getSequenceSource())?"NA":mafRecord.getSequenceSource(); + + // update maf record with resolved data + mafRecord.setReferenceAllele(referenceAllele); + mafRecord.setTumorSeqAllele1(tumorSeqAllele); + mafRecord.setStartPosition(String.valueOf(startPosition)); + mafRecord.setEndPosition(String.valueOf(endPosition)); + mafRecord.setMaFimpact(functionalImpactScore); + mafRecord.setMaLinkVar(linkXVar); + mafRecord.setMaFis(fisValue); + mafRecord.setHgvspShort(proteinChange); + mafRecord.setOncotatorProteinPosStart(String.valueOf(proteinStartPosition)); + mafRecord.setOncotatorProteinPosEnd(String.valueOf(proteinEndPosition)); + mafRecord.setOncotatorUniprotAccession(uniProtAccession); + mafRecord.settAltCount(String.valueOf(tAltCount)); + mafRecord.settRefCount(String.valueOf(tRefCount)); + mafRecord.setnAltCount(String.valueOf(nAltCount)); + mafRecord.setnRefCount(String.valueOf(nRefCount)); + mafRecord.setSequenceSource(sequenceSource); + + return mafRecord; + } + + /** + * Performs basic data screening to determine whether MAF record is acceptable or not. 
+ * Returns null if MAF record does not pass screening + * + * @param mafRecord + * @return MafRecord + */ + private MafRecord screenMafRecord(MafRecord mafRecord) { + // check validation status value + if (mafRecord.getValidationStatus() == null || mafRecord.getValidationStatus().equalsIgnoreCase("Wildtype")) { + LOG.warn("Skipping entry with Validation_Status: Wildtype"); + return null; + } + + // check mutation type value + String mutationType = MutationDataUtils.resolveMutationType(mafRecord.getVariantClassification(), mafRecord.getOncotatorVariantClassification()); + if (!DataFileUtils.isNullOrEmptyValue(mutationType) && mutationType.equalsIgnoreCase("rna")) { + LOG.warn("Skipping entry with mutation type: RNA"); + return null; + } + + // check gene symbol and entrez gene id values + String hugoGeneSymbol = mafRecord.getHugoSymbol(); + Integer entrezGeneId = !DataFileUtils.isNullOrEmptyValue(mafRecord.getEntrezGeneId())? + Integer.valueOf(mafRecord.getEntrezGeneId()):null; + String normalizedChromosome = geneDataUtils.getNormalizedChromosome(mafRecord.getChromosome()); + if ((DataFileUtils.isNullOrEmptyValue(hugoGeneSymbol) || hugoGeneSymbol.equalsIgnoreCase("unknown")) + && (entrezGeneId == null || entrezGeneId <= 0)) { + LOG.warn("Skipping entry with invalid (Entrez_Gene_Id,Hugo_Symbol,Chromosome): " + + "(" + entrezGeneId + "," + hugoGeneSymbol + "," + normalizedChromosome + ")"); + + // treat records with unknown gene symbols and invalid entrez gene ids with valid mutation types as igr + if (hugoGeneSymbol.equalsIgnoreCase("unknown") && MutationDataUtils.isAcceptableMutation(mutationType)) { + LOG.warn("Treating mutation with gene symbol 'Unknown' as intergenic instead of " + + mutationType); + mutationType = "IGR"; + } + else { + // let mutation type IGR pass so that it can be counted by mutation filter correctly + return null; + } + } + + // check if gene can be resolved from MAF record first + Gene gene = 
geneDataUtils.resolveGeneFromRecordData(entrezGeneId, hugoGeneSymbol, normalizedChromosome); + if (gene == null) { + if (!hugoGeneSymbol.equalsIgnoreCase("unknown") && !mutationType.equals("IGR")) { + LOG.warn("Could not resolve gene from (Entrez_Gene_Id,Hugo_Symbol,Chromosome): " + + "(" + entrezGeneId + "," + hugoGeneSymbol + "," + normalizedChromosome + ")"); + return null; + } + } + else { + // update entrez gene id, hugo gene symbol, and chromosome (if necessary) with resolved gene data + entrezGeneId = gene.getEntrezGeneId(); + hugoGeneSymbol = gene.getHugoGeneSymbol(); + if (DataFileUtils.isNullOrEmptyValue(normalizedChromosome)) { + normalizedChromosome = geneDataUtils.getChromosomeFromCytoband(gene.getCytoband()); + } + // check normalized chromosome value + if (Strings.isNullOrEmpty(normalizedChromosome)) { + LOG.warn("Skipping entry with chromosome value: " + mafRecord.getChromosome()); + return null; + } + } + + // update maf record with any resolved values from data checks above + mafRecord.setChromosome(normalizedChromosome); + mafRecord.setVariantClassification(mutationType); + mafRecord.setEntrezGeneId(String.valueOf(entrezGeneId)); + mafRecord.setHugoSymbol(hugoGeneSymbol); + + return mafRecord; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/ProfileDataProcessor.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/ProfileDataProcessor.java new file mode 100644 index 0000000..244a2f3 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/ProfileDataProcessor.java @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. 
The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +package org.cbio.portal.pipelines.importer.config.processor; + +import org.mskcc.cbio.model.*; +import org.mskcc.cbio.persistence.jdbc.GeneJdbcDaoImpl; +import org.cbio.portal.pipelines.importer.model.ProfileDataRecord; +import org.cbio.portal.pipelines.importer.util.*; + +import java.util.*; +import java.util.regex.*; +import org.apache.commons.logging.*; +import org.springframework.batch.item.ItemProcessor; +import org.springframework.beans.factory.annotation.Autowired; + +/** + * + * @author ochoaa + */ +public class ProfileDataProcessor implements ItemProcessor { + + @Autowired + GeneJdbcDaoImpl geneJdbcDaoImpl; + + @Autowired + GeneDataUtils geneDataUtils; + + private static final Log LOG = LogFactory.getLog(ProfileDataProcessor.class); + + @Override + public ProfileDataRecord process(ProfileDataRecord profileDataRecord) throws Exception { + ProfileDataRecord screenedRecord = screenProfileDataRecord(profileDataRecord); + if (screenedRecord != null && profileDataRecord.isCnaData()) { + List cnaEvents = loadCnaEvents(screenedRecord); + screenedRecord.setCnaEvents(cnaEvents); + } + + return screenedRecord; + } + + /** + * Performs base data screening to determine whether profile record should be skipped or not. + * Returns null if gene information cannot be resolve from record. + * + * @param profileDataRecord + * @param caseIdsMap + * @param isRppaProfile + * @return CompositeProfileData + */ + private ProfileDataRecord screenProfileDataRecord(ProfileDataRecord profileDataRecord) { + // check that entrez gene id and hugo gene symbol are not both invalid + String hugoGeneSymbol = profileDataRecord.isRppaProfile()?profileDataRecord.getCompositeElementRef():profileDataRecord.getHugoSymbol(); + Integer entrezGeneId = !DataFileUtils.isNullOrEmptyValue(profileDataRecord.getEntrezGeneId())? 
+ Integer.valueOf(profileDataRecord.getEntrezGeneId()):null; + if ((DataFileUtils.isNullOrEmptyValue(hugoGeneSymbol) || hugoGeneSymbol.equalsIgnoreCase("unknown")) + && entrezGeneId == null) { + LOG.warn("Skipping entry with invalid (Entrez_Gene_Id,Hugo_Symbol): " + + "(" + profileDataRecord.getEntrezGeneId() + "," + (profileDataRecord.getHugoSymbol()) + ")"); + return null; + } + + // ignore entries with gene symbols separated by /// or gene symbols specified as --- + if (!DataFileUtils.isNullOrEmptyValue(hugoGeneSymbol) && + (hugoGeneSymbol.contains("///") || hugoGeneSymbol.contains("---"))) { + if (hugoGeneSymbol.contains("///")) { + LOG.warn("Skipping entry with information for multiple genes: " + hugoGeneSymbol); + } + else { + LOG.warn("Skipping entry with unknown gene information: " + hugoGeneSymbol); + } + return null; + } + + // generate list of genes (potentially multiple genes for microRNA or RPPA profile) + List geneList = new ArrayList(); + if (profileDataRecord.isRppaProfile()) { + // composite element ref value cannot be empty for rppa profile + if (DataFileUtils.isNullOrEmptyValue(profileDataRecord.getCompositeElementRef())) { + LOG.warn("Ignoring line with no Composite.Element.Ref value"); + return null; + } + + // try to parse composite element ref value + hugoGeneSymbol = profileDataRecord.getCompositeElementRef(); + String[] parts = hugoGeneSymbol.split("\\|"); + if (parts.length < 2) { + LOG.warn("Could not parse Composite.Element.Ref value: " + hugoGeneSymbol); + return null; + } + + // skip entries with array id's that have already been loaded + String[] geneSymbols = parts[0].split(" "); + String arrayId = parts[1]; + profileDataRecord.setArrayId(arrayId); + + // generate list of genes from gene symbols + geneList = geneDataUtils.resolveGeneFromCompositeElementRef(geneSymbols); + if (geneList.isEmpty()) { + LOG.warn("Gene symbols could not be resolved for Composite.Element.Ref: " + hugoGeneSymbol); + return null; + } + + // get list of 
phospho genes from array id + Pattern p = Pattern.compile("(p[STY][0-9]+(?:_[STY][0-9]+)*)"); + Matcher m = p.matcher(arrayId); + String residue; + if (m.find()) { + // import phospho genes + residue = m.group(1); + geneList = importPhosphoGene(geneList, residue); + } + } + else { + // check if gene can be resolved from the entrez gene id and/or hugo gene symbol + if (!DataFileUtils.isNullOrEmptyValue(hugoGeneSymbol)) { + hugoGeneSymbol = hugoGeneSymbol.split("\\|")[0]; // handle cases where gene symbol includes accession + } + Gene gene = geneDataUtils.resolveGeneFromRecordData(entrezGeneId, hugoGeneSymbol, null); + if (gene == null) { + LOG.warn("Could not resolve gene from (Entrez_Gene_Id,Hugo_Symbol): " + + "(" + entrezGeneId + "," + hugoGeneSymbol + ")"); + return null; + } + + // if entrez gene id doesn't match the resolved entrez gene id then + // switch the resolved entrez gene id to the value given by the file, + // otherwise set entrez gene id to the resolved value + if (entrezGeneId != null) { + // only update resolved entrez gene id if doesn't match value in file + if (!entrezGeneId.equals(gene.getEntrezGeneId())) { + gene.setEntrezGeneId(entrezGeneId); + } + } + else { + entrezGeneId = gene.getEntrezGeneId(); + } + + // if hugo symbol doesn't match the resolved hugo symbol then + // switch the resolved hugo symbol to the value given by the file, + // otherwise set hugo symbol to the resolved value + if (!DataFileUtils.isNullOrEmptyValue(hugoGeneSymbol)) { + // only update resolved hugo symbol if doesn't match value in file + if (!hugoGeneSymbol.equals(gene.getHugoGeneSymbol())) { + gene.setHugoGeneSymbol(hugoGeneSymbol); + } + } + else { + hugoGeneSymbol = gene.getHugoGeneSymbol(); + } + // add gene to geneList to generate composite profile data + geneList.add(gene); + } + + // check whether gene list is empty and, if so, log appropriate warning + if (geneList.isEmpty()) { + if (!DataFileUtils.isNullOrEmptyValue(hugoGeneSymbol) && 
!hugoGeneSymbol.toLowerCase().contains("-mir-")) { + LOG.warn("Skipping entry with unknown microRNA: " + hugoGeneSymbol); + return null; + } + else { + LOG.warn("Could not resolve gene from (Entrez_Gene_Id,Hugo_Symbol): " + + "(" + entrezGeneId + "," + hugoGeneSymbol + ")"); + return null; + } + } + // update gene list for profile data record + profileDataRecord.setEntrezGeneId(String.valueOf(geneList.get(0).getEntrezGeneId())); + profileDataRecord.setHugoSymbol(geneList.get(0).getHugoGeneSymbol()); + profileDataRecord.setCompositeGeneList(geneList); + + return profileDataRecord; + } + + /** + * Returns a list of phospho genes and adds phospho genes to db if not already exists. + * + * @param genes + * @param residue + * @return List + */ + private List importPhosphoGene(List genes, String residue) { + List phosphoGenes = new ArrayList(); + + // for each gene, find equivalent phospho gene and add to db if not already exists + for (Gene gene : genes) { + String phosphoGeneSymbol = gene.getHugoGeneSymbol().toUpperCase() + "_" + residue; + Gene existingPhosphoGene = geneJdbcDaoImpl.getGene(phosphoGeneSymbol); + if (existingPhosphoGene == null) { + // create a new phospho gene instance and add to db + Gene newPhosphoGene = geneJdbcDaoImpl.addGene(geneDataUtils.createPhosphoGene(gene, residue)); + phosphoGenes.add(newPhosphoGene); + } + else { + phosphoGenes.add(existingPhosphoGene); + } + } + + return phosphoGenes; + } + + /** + * Load CNA events for a given profile data record object. 
+ * + * @param profileDataRecord + * @return List + */ + private List loadCnaEvents(ProfileDataRecord profileDataRecord) { + Map caseProfileDataRecords = profileDataRecord.getCaseProfileDataMap(); + // generate list of cna events + List cnaEvents = new ArrayList(); + for (Integer sampleId : caseProfileDataRecords.keySet()) { + // change partial deletion to full deletion + String alteration = caseProfileDataRecords.get(sampleId); + if (alteration.equals(CnaEvent.AlterationType.PARTIAL_DELETION.getName())) { + alteration = CnaEvent.AlterationType.HOMOZYGOUS_DELETION.getName(); + } + + // skip cna events that are not homozygous deletions or amplifications + if (!alteration.equals(CnaEvent.AlterationType.HOMOZYGOUS_DELETION.getName()) && + !alteration.equals(CnaEvent.AlterationType.AMPLIFICATION)) { + continue; + } + CnaEvent.AlterationType alterationType = alteration.equals(CnaEvent.AlterationType.HOMOZYGOUS_DELETION.getName())? + CnaEvent.AlterationType.HOMOZYGOUS_DELETION:CnaEvent.AlterationType.AMPLIFICATION; + + // create sample cna event and cna event + SampleCnaEvent sampleCnaEvent = new SampleCnaEvent(); + sampleCnaEvent.setSampleId(sampleId); + sampleCnaEvent.setGeneticProfileId(profileDataRecord.getGeneticProfileId()); + + CnaEvent cnaEvent = new CnaEvent(); + cnaEvent.setEntrezGeneId(Integer.valueOf(profileDataRecord.getEntrezGeneId())); + cnaEvent.setAlterationType(alterationType); + cnaEvent.setSampleCnaEvent(sampleCnaEvent); + + // add cna event to list + cnaEvents.add(cnaEvent); + } + + return cnaEvents; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/StructuralVariantDataProcessor.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/StructuralVariantDataProcessor.java new file mode 100644 index 0000000..2ac026d --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/StructuralVariantDataProcessor.java @@ -0,0 +1,162 @@ +/* + * Copyright 
(c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +package org.cbio.portal.pipelines.importer.config.processor; + +import org.mskcc.cbio.model.*; +import org.cbio.portal.pipelines.importer.util.*; +import org.cbio.portal.pipelines.importer.model.StructuralVariantRecord; + +import com.google.common.base.Strings; +import org.apache.commons.logging.*; +import org.springframework.batch.item.ItemProcessor; +import org.springframework.beans.factory.annotation.Autowired; + +/** + * + * @author ochoaa + */ +public class StructuralVariantDataProcessor implements ItemProcessor { + + @Autowired + GeneDataUtils geneDataUtils; + + private static final Log LOG = LogFactory.getLog(StructuralVariantDataProcessor.class); + + @Override + public StructuralVariant process(StructuralVariantRecord structuralVariantRecord) throws Exception { + StructuralVariantRecord screenedRecord = screenStructuralVariantRecord(structuralVariantRecord); + if (screenedRecord == null) { + return null; + } + StructuralVariant structuralVariant = transformStructuralVariantRecord(screenedRecord); + + return structuralVariant; + } + + /** + * Transforms structural variant record to instance of StructuralVariant. 
+ * + * @param structuralVariantRecord + * @return StructuralVariant + */ + private StructuralVariant transformStructuralVariantRecord(StructuralVariantRecord structuralVariantRecord) { + // resolve integer values for structural variant fields + Integer mapQ = DataFileUtils.isNullOrEmptyValue(structuralVariantRecord.getMapQ())?null:Integer.valueOf(structuralVariantRecord.getMapQ()); + Integer normalReadCount = DataFileUtils.isNullOrEmptyValue(structuralVariantRecord.getNormalReadCount())?0:Integer.valueOf(structuralVariantRecord.getNormalReadCount()); + Integer normalVariantCount = DataFileUtils.isNullOrEmptyValue(structuralVariantRecord.getNormalVariantCount())?null:Integer.valueOf(structuralVariantRecord.getNormalVariantCount()); + Integer svLength = DataFileUtils.isNullOrEmptyValue(structuralVariantRecord.getSvLength())?0:Integer.valueOf(structuralVariantRecord.getSvLength()); + Integer tumorReadCount = DataFileUtils.isNullOrEmptyValue(structuralVariantRecord.getTumorReadCount())?null:Integer.valueOf(structuralVariantRecord.getTumorReadCount()); + Integer tumorVariantCount = DataFileUtils.isNullOrEmptyValue(structuralVariantRecord.getTumorVariantCount())?null:Integer.valueOf(structuralVariantRecord.getTumorVariantCount()); + + // create new StructuralVariant instance with processed/resolved data from above + StructuralVariant structuralVariant = new StructuralVariant(); + structuralVariant.setGeneticProfileId(structuralVariantRecord.getGeneticProfileId()); + structuralVariant.setSampleId(structuralVariantRecord.getSampleInternalId()); + structuralVariant.setSite1Gene(structuralVariantRecord.getSite1Gene()); + structuralVariant.setSite1Chrom(structuralVariantRecord.getSite1Chrom()); + structuralVariant.setSite2Gene(structuralVariantRecord.getSite2Gene()); + structuralVariant.setSite2Chrom(structuralVariantRecord.getSite2Chrom()); + structuralVariant.setBreakpointType(structuralVariantRecord.getBreakpointType()); + 
structuralVariant.setAnnotation(structuralVariantRecord.getAnnotation()); + structuralVariant.setComments(structuralVariantRecord.getComments()); + structuralVariant.setConfidenceClass(structuralVariantRecord.getConfidenceClass()); + structuralVariant.setConnectionType(structuralVariantRecord.getConnectionType()); + structuralVariant.setEventInfo(structuralVariantRecord.getEventInfo()); + structuralVariant.setMapQ(mapQ); + structuralVariant.setNormalReadCount(normalReadCount); + structuralVariant.setNormalVariantCount(normalVariantCount); + structuralVariant.setPairedEndReadSupport(structuralVariantRecord.getPairedEndReadSupport()); + structuralVariant.setSite1Desc(structuralVariantRecord.getSite1Desc()); + structuralVariant.setSite1Pos(Integer.valueOf(structuralVariantRecord.getSite1Pos())); + structuralVariant.setSite2Desc(structuralVariantRecord.getSite2Desc()); + structuralVariant.setSite2Pos(Integer.valueOf(structuralVariantRecord.getSite2Pos())); + structuralVariant.setSplitReadSupport(structuralVariantRecord.getSplitReadSupport()); + structuralVariant.setSvClassName(structuralVariantRecord.getSvClassName()); + structuralVariant.setSvDesc(structuralVariantRecord.getSvDesc()); + structuralVariant.setSvLength(svLength); + structuralVariant.setTumorReadCount(tumorReadCount); + structuralVariant.setTumorVariantCount(tumorVariantCount); + + return structuralVariant; + } + + /** + * Performs basic data screening to determine whether structural variant record is acceptable or not. 
+ * Returns null if structural variant record does not pass screening + * + * @param structuralVariantRecord + * @return StructuralVariantRecord + */ + private StructuralVariantRecord screenStructuralVariantRecord(StructuralVariantRecord structuralVariantRecord) { + // check if genes can be resolved from structural variant record + String geneSymbol1 = structuralVariantRecord.getSite1Gene(); + String normChrom1 = geneDataUtils.getNormalizedChromosome(structuralVariantRecord.getSite1Chrom()); + if (DataFileUtils.isNullOrEmptyValue(geneSymbol1)) { + LOG.warn("Skipping entry with invalid Site1_Gene: " + geneSymbol1); + return null; + } + Gene gene1 = geneDataUtils.resolveGeneFromRecordData(geneSymbol1, normChrom1); + if (gene1 == null) { + LOG.warn("Could not resolve gene from (Site1_Gene,Site1_Chrom): (" + geneSymbol1 + "," + normChrom1 + ")"); + return null; + } + // update normalized chromosome value if null or empty + if (Strings.isNullOrEmpty(normChrom1)) { + normChrom1 = gene1.getChromosome(); + } + + String geneSymbol2 = structuralVariantRecord.getSite2Gene(); + String normChrom2 = geneDataUtils.getNormalizedChromosome(structuralVariantRecord.getSite2Chrom()); + if (DataFileUtils.isNullOrEmptyValue(geneSymbol2)) { + LOG.warn("Skipping entry with invalid Site2_Gene: " + geneSymbol2); + return null; + } + Gene gene2 = geneDataUtils.resolveGeneFromRecordData(geneSymbol1, normChrom1); + if (gene2 == null) { + LOG.warn("Could not resolve gene from (Site2_Gene,Site2_Chrom): (" + geneSymbol2 + "," + normChrom2 + ")"); + return null; + } + // update normalized chromosome value if null or empty + if (Strings.isNullOrEmpty(normChrom2)) { + normChrom2 = gene2.getChromosome(); + } + // update gene and chromosome values for structural variant record + structuralVariantRecord.setSite1Gene(geneSymbol1); + structuralVariantRecord.setSite1Chrom(normChrom1); + structuralVariantRecord.setSite2Gene(geneSymbol2); + structuralVariantRecord.setSite2Chrom(normChrom2); + + return 
structuralVariantRecord; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/TimelineDataProcessor.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/TimelineDataProcessor.java new file mode 100644 index 0000000..a202b90 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/processor/TimelineDataProcessor.java @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+*/ + +package org.cbio.portal.pipelines.importer.config.processor; + +import org.mskcc.cbio.model.*; +import org.cbio.portal.pipelines.importer.model.*; +import org.cbio.portal.pipelines.importer.util.DataFileUtils; + +import java.util.*; +import org.springframework.batch.item.ItemProcessor; + +/** + * + * @author ochoaa + */ +public class TimelineDataProcessor implements ItemProcessor { + + @Override + public ClinicalEvent process(TimelineRecord timelineRecord) throws Exception { + // extract the clinical event and transform list of clinical event data from timeline record + ClinicalEvent clinicalEvent = transformTimelineRecordToClinicalEvent(timelineRecord); + List clinicalEventDataList = extractClinicalEventDataMap(timelineRecord.getClinicalEventDataMap()); + clinicalEvent.setClinicalEventData(clinicalEventDataList); + + return clinicalEvent; + } + + /** + * Transform timeline record into a clinical event instance. + * + * @param timelineRecord + * @return ClinicalEvent + */ + private ClinicalEvent transformTimelineRecordToClinicalEvent(TimelineRecord timelineRecord) { + ClinicalEvent clinicalEvent = new ClinicalEvent(); + clinicalEvent.setPatientId(timelineRecord.getPatientInternalId()); + clinicalEvent.setStartDate(Integer.valueOf(timelineRecord.getStartDate())); + clinicalEvent.setEventType(timelineRecord.getEventType()); + + // only set stop date if value not null or empty in file + if (!DataFileUtils.isNullOrEmptyValue(timelineRecord.getStartDate())) { + clinicalEvent.setStopDate(Integer.valueOf(timelineRecord.getStartDate())); + } + + return clinicalEvent; + } + + /** + * Extract clinical data event map from timeline record to list. 
+ * + * @param clinicalEventDataMap + * @return List + */ + private List extractClinicalEventDataMap(Map clinicalEventDataMap) { + List clinicalEventDataList = new ArrayList(); + for (String key : clinicalEventDataMap.keySet()) { + ClinicalEventData clinicalDataEvent = new ClinicalEventData(); + clinicalDataEvent.setKey(key); + clinicalDataEvent.setValue(clinicalEventDataMap.get(key)); + clinicalEventDataList.add(clinicalDataEvent); + } + + return clinicalEventDataList; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/CaseListReader.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/CaseListReader.java new file mode 100644 index 0000000..c3802eb --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/CaseListReader.java @@ -0,0 +1,262 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.cbio.portal.pipelines.importer.config.reader; + +import org.mskcc.cbio.model.*; +import org.mskcc.cbio.persistence.jdbc.SampleJdbcDaoImpl; +import org.cbio.portal.pipelines.importer.util.DataFileUtils; + +import java.io.*; +import java.util.*; +import javax.annotation.Resource; +import org.apache.commons.logging.*; +import com.google.common.collect.Sets; +import org.apache.commons.collections.map.MultiKeyMap; + +import org.springframework.batch.item.*; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; + +/** + * + * @author ochoaa + */ +public class CaseListReader implements ItemStreamReader { + + @Autowired + SampleJdbcDaoImpl sampleJdbcDaoImpl; + + @Value("#{jobParameters[stagingDirectory]}") + private String stagingDirectory; + + @Resource(name="caseListMetadataMap") + public Map caseListMetadataMap; + + private CancerStudy cancerStudy; + private List sampleListResults; + + private static final Log LOG = LogFactory.getLog(CaseListReader.class); + + @Override + public void open(ExecutionContext executionContext) throws ItemStreamException { + this.cancerStudy = (CancerStudy) executionContext.get("cancerStudy"); + MultiKeyMap datatypeMetadata = (MultiKeyMap) executionContext.get("datatypeMetadata"); + File caseListDirectory = new File(stagingDirectory, "case_lists"); + + // load standard sample lists (i.e., cases_all, cases_cna, cases_sequenced, cases_cnaseq) + List sampleLists = new ArrayList(); + try { + LOG.info("Loading standard case lists from: " + stagingDirectory + "/case_lists"); + 
sampleLists = loadCaseLists(caseListDirectory.getCanonicalPath(), datatypeMetadata); + } + catch (IOException ex) { + LOG.error("Error loading case lists from case list directory: " + stagingDirectory + "/case_lists"); + ex.printStackTrace(); + } + + // load custom case lists if any + List customCaseListFiles = new ArrayList(); + try { + List caseListFiles = DataFileUtils.listDataFiles(caseListDirectory.getCanonicalPath(), "case*"); + for (File caseListFile : caseListFiles) { + if (!caseListMetadataMap.containsKey(caseListFile.getName())) { + customCaseListFiles.add(caseListFile); + } + } + } + catch (IOException ex) {} + + if (!customCaseListFiles.isEmpty()) { + LOG.info("Loading custom case lists found"); + for (File caseListFile : customCaseListFiles) { + Properties properties = new Properties(); + try { + properties.load(new FileInputStream(caseListFile)); + Set caseIdSet = loadCaseIdSetFromCaseListFile(properties); + if (!caseIdSet.isEmpty()) { + SampleList sampleList = loadSampleListFromFile(properties, new ArrayList(caseIdSet)); + sampleLists.add(sampleList); + LOG.info("Loaded " + caseIdSet.size() + " cases from case list file: " + caseListFile.getName()); + } + else { + LOG.error("Error loading custom case list from: " + caseListFile.getName()); + } + } + catch (IOException ex) {} + } + } + + this.sampleListResults = sampleLists; + } + + /** + * Load case lists from the case list directory. 
+ * + * @param caseListDirectory + * @param datatypeMetadata + * @return List + * @throws IOException + */ + private List loadCaseLists(String caseListDirectory, MultiKeyMap datatypeMetadata) throws IOException { + List sampleLists = new ArrayList(); + for (String caseListFilename : caseListMetadataMap.keySet()) { + // if case list filename does not exist then continue + File caseListFile = new File(caseListDirectory, caseListFilename); + if (!caseListFile.exists()) { + continue; + } + + boolean autoGenerateCaseList = true; + // load properties from case list file + Properties properties = new Properties(); + properties.load(new FileInputStream(caseListFile)); + if (properties.containsKey("case_list_ids")) { + autoGenerateCaseList = false; + } + + // get the case id set either from the case list file or from what was imported + // for the case list datatypes + Set caseIdSet = new LinkedHashSet<>(); + if (!autoGenerateCaseList) { + caseIdSet = loadCaseIdSetFromCaseListFile(properties); + } + else { + // overwrite case list only applies to case list filename "cases_complete.txt" + // if RNA-SEQ data exists, we want the cases complete file to contain it, not micro-array gene expression (Agilent). 
+ boolean overwriteCaseList = caseListFilename.contains("complete"); + String[] datatypeList = caseListMetadataMap.get(caseListFilename).split("\\|"); + + Set workingCaseIdSet = new LinkedHashSet<>(); + for (String datatypeGroup : datatypeList) { + String[] datatypes = datatypeGroup.split("\\&"); + + if (!(boolean) datatypeMetadata.get(datatypes[0], "importData")) { + continue; + } + + // only overwrite working case id set for cases_complete.txt + if (overwriteCaseList) { + workingCaseIdSet = (Set) datatypeMetadata.get(datatypes[0], "caseList"); + } + else { + workingCaseIdSet.addAll((Set) datatypeMetadata.get(datatypes[0], "caseList")); + } + + // get intersection of datatypes if necessary + if (datatypes.length > 1 ) { + for (int i=1; i<=datatypes.length; i++) { + if (!(boolean) datatypeMetadata.get(datatypes[i], "importData")) { + continue; + } + workingCaseIdSet = Sets.intersection(workingCaseIdSet, (Set) datatypeMetadata.get(datatypes[i], "caseList")); + } + } + } + caseIdSet.addAll(workingCaseIdSet); + } + // get the remaining case list file properties if the case id set is not empty + if (caseIdSet.isEmpty()) { + continue; + } + SampleList sampleList = loadSampleListFromFile(properties, new ArrayList(caseIdSet)); + sampleLists.add(sampleList); + LOG.info("Retrieved case list: " + sampleList.getStableId()); + } + + return sampleLists; + } + + /** + * Load case id set directly from a case list file properties. 
+ * + * @param properties + * @return Set + * @throws IOException + */ + private Set loadCaseIdSetFromCaseListFile(Properties properties) throws IOException { + Set caseIdSet = new LinkedHashSet<>(); + + // get sample internal id for each case id in list + String[] caseIds = DataFileUtils.splitDataFields(properties.getProperty("case_list_ids")); + for (String caseId : caseIds) { + String sampleStableId = DataFileUtils.getSampleStableId(caseId); + Sample sample = sampleJdbcDaoImpl.getSampleByStudy(sampleStableId, cancerStudy.getCancerStudyId()); + if (sample == null) { + LOG.warn("Skipping unknown sample stable id: " + (sampleStableId.equals(caseId)?caseId:sampleStableId)); + continue; + } + caseIdSet.add(sample.getInternalId()); + } + + return caseIdSet; + } + + /** + * Load sample list from file given case list file properties and the set of case ids. + * + * @param properties + * @param caseIdSet + * @return SampleList + */ + private SampleList loadSampleListFromFile(Properties properties, List caseIdSet) { + String stableId = properties.getProperty("stable_id"); + String category = properties.getProperty("case_list_category", "other"); + String name = properties.getProperty("case_list_name"); + String description = properties.getProperty("case_list_description"); + + SampleList sampleList = new SampleList(); + sampleList.setStableId(stableId); + sampleList.setCategory(category); + sampleList.setName(name); + sampleList.setDescription(description); + sampleList.setCancerStudyId(cancerStudy.getCancerStudyId()); + sampleList.setSampleListList(caseIdSet); + + return sampleList; + } + + @Override + public void update(ExecutionContext executionContext) throws ItemStreamException {} + + @Override + public void close() throws ItemStreamException {} + + @Override + public SampleList read() throws Exception, UnexpectedInputException, ParseException, NonTransientResourceException { + if (!sampleListResults.isEmpty()) { + return sampleListResults.remove(0); + } + return 
null; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/ClinicalDataReader.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/ClinicalDataReader.java new file mode 100644 index 0000000..fe2e055 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/ClinicalDataReader.java @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+*/ + +package org.cbio.portal.pipelines.importer.config.reader; + +import org.mskcc.cbio.model.*; +import org.cbio.portal.pipelines.importer.util.DataFileUtils; +import org.cbio.portal.pipelines.importer.config.composite.CompositeClinicalData; + +import java.util.*; +import java.io.*; +import org.apache.commons.collections.map.MultiKeyMap; +import org.apache.commons.logging.*; + +import org.springframework.batch.item.*; +import org.springframework.batch.item.file.FlatFileItemReader; +import org.springframework.batch.item.file.mapping.*; +import org.springframework.batch.item.file.transform.*; +import org.springframework.core.io.FileSystemResource; + +/** + * + * @author ochoaa + */ +public class ClinicalDataReader implements ItemStreamReader { + + private CancerStudy cancerStudy; + private List compositeClinicalDataResults; + + private static final Log LOG = LogFactory.getLog(ClinicalDataReader.class); + + public static enum MissingAttributeValues { + NOT_APPLICABLE("Not Applicable"), + NOT_AVAILABLE("Not Available"), + PENDING("Pending"), + DISCREPANCY("Discrepancy"), + COMPLETED("Completed"), + NULL("null"), + MISSING(""), + NA("NA"), + N_A("N/A"), + UNKNOWN("unknown"); + + private String propertyName; + + MissingAttributeValues(String propertyName) { this.propertyName = propertyName; } + @Override + public String toString() { return propertyName; } + + static public boolean has(String value) { + if (value == null) return false; + if (value.trim().equals("")) return true; + try { + value = value.replaceAll("[\\[|\\]\\/]", ""); + value = value.replaceAll(" ", "_"); + return valueOf(value.toUpperCase()) != null; + } + catch (IllegalArgumentException ex) { + return false; + } + } + + static public String getNotAvailable() { + return "[" + NOT_AVAILABLE.toString() + "]"; + } + } + + @Override + public void open(ExecutionContext executionContext) throws ItemStreamException { + this.cancerStudy = (CancerStudy) executionContext.get("cancerStudy"); + List dataFileList = 
(List) executionContext.get("dataFileList"); + MultiKeyMap clinicalMetadata = (MultiKeyMap) executionContext.get("clinicalMetadata"); + + // for each file, read in the clinical data + List compositeClinicalDataList = new ArrayList(); + dataFileList.stream().forEach((dataFile) -> { + List compositeClinicalData = new ArrayList(); + String[] header = (String[]) clinicalMetadata.get(dataFile.getName(), "header"); + List clinicalAttributes = (List) clinicalMetadata.get(dataFile.getName(), "clinicalAttributes"); + try { + compositeClinicalData = loadClinicalData(dataFile, header, clinicalAttributes); + } catch (Exception ex) { + LOG.error("Error loading clinical data from: " + dataFile.getName()); + ex.printStackTrace(); + } + // report number of records loaded from clinical file + if (!compositeClinicalData.isEmpty()) { + LOG.info("Clinical records loaded from: " + dataFile.getName() + ": " + compositeClinicalData.size()); + compositeClinicalDataList.addAll(compositeClinicalData); + } + }); + this.compositeClinicalDataResults = compositeClinicalDataList; + } + + /** + * Load clinical data from data file. 
+ * + * @param dataFile + * @param header + * @param clinicalAttributes + * @return List + */ + private List loadClinicalData(File dataFile, String[] header, List clinicalAttributes) throws Exception { + // init tab-delim tokenizer with names of clinical attributes + DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer(DelimitedLineTokenizer.DELIMITER_TAB); + tokenizer.setNames(header); + + // init line mapper for clinical data file + DefaultLineMapper lineMapper = new DefaultLineMapper<>(); + lineMapper.setLineTokenizer(tokenizer); + lineMapper.setFieldSetMapper(clinicalFieldSetMapper(clinicalAttributes)); + + // set up clinical data file reader context + FlatFileItemReader clinicalDataReader = new FlatFileItemReader(); + clinicalDataReader.setResource(new FileSystemResource(dataFile)); + clinicalDataReader.setLineMapper(lineMapper); + clinicalDataReader.setLinesToSkip(1); + clinicalDataReader.open(new ExecutionContext()); + + // read through each record in clinical data file + List compositeClinicalData = new ArrayList(); + CompositeClinicalData record = clinicalDataReader.read(); + while (record != null) { + compositeClinicalData.add(record); + record = clinicalDataReader.read(); + } + clinicalDataReader.close(); + + return compositeClinicalData; + } + + /** + * Returns a field set mapper for the clinical data file. + * + * @param dataFilename + * @param hasSampleIdCol + * @return FieldSetMapper + */ + private FieldSetMapper clinicalFieldSetMapper(List clinicalAttributes) { + return (FieldSetMapper) (FieldSet fs) -> { + // get stable ids from record + String patientStableId = DataFileUtils.getPatientStableId(fs.readString("PATIENT_ID")); + String sampleStableId = fs.getProperties().containsKey("SAMPLE_ID")? 
+ DataFileUtils.getSampleStableId(fs.readString("SAMPLE_ID")):""; + if (DataFileUtils.isNullOrEmptyValue(patientStableId) && !DataFileUtils.isNullOrEmptyValue(sampleStableId)) { + patientStableId = sampleStableId; + } + + // create new patient + Patient patient = new Patient(); + patient.setStableId(patientStableId); + patient.setCancerStudyId(cancerStudy.getCancerStudyId()); + patient.setInternalId(-1); + + // create new sample + Sample sample = new Sample(); + sample.setStableId(sampleStableId); + sample.setPatientId(patient.getInternalId()); + sample.setTypeOfCancerId(cancerStudy.getTypeOfCancerId()); + sample.setInternalId(-1); + + // resolve sample type, default is PRIMARY_SOLID_TUMOR + // sample type will be corrected if sample already exists by ClinicalDataWriter + Sample.SampleType sampleType = Sample.SampleType.PRIMARY_SOLID_TUMOR; + if (sample.getInternalId() == -1 && fs.getProperties().containsKey("SAMPLE_TYPE")) { + String sampleTypeString = DataFileUtils.getSampleTypeString(sample.getStableId(), fs.readRawString("SAMPLE_TYPE")); + try { + sampleType = Sample.SampleType.valueOf(sampleTypeString); + } + catch (IllegalArgumentException ex) {} + } + sample.setSampleType(sampleType); + + // if attr value not "missing" then add to filtered clinical data map + Map filteredClinicalData = new HashMap<>(); + clinicalAttributes.stream().forEach((attr) -> { + String attrVal = fs.readString(attr.getAttrId()); + if (!(MissingAttributeValues.has(attrVal))) { + // truncate attribute value to prevent MysqlDataTruncation exceptions + if (attrVal.length() > DataFileUtils.DATA_TRUNCATION_THRESHOLD) { + attrVal = attrVal.substring(0, DataFileUtils.DATA_TRUNCATION_THRESHOLD); + } + filteredClinicalData.put(attr, attrVal); + } + }); + CompositeClinicalData composite = new CompositeClinicalData(); + composite.setPatient(patient); + composite.setSample(sample); + composite.setCompositeClinicalDataMap(filteredClinicalData); + + return composite; + }; + } + + @Override + public 
void update(ExecutionContext executionContext) throws ItemStreamException {} + + @Override + public void close() throws ItemStreamException {} + + @Override + public CompositeClinicalData read() throws Exception, UnexpectedInputException, ParseException, NonTransientResourceException { + if (!compositeClinicalDataResults.isEmpty()) { + return compositeClinicalDataResults.remove(0); + } + return null; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/CopyNumberSegmentDataReader.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/CopyNumberSegmentDataReader.java new file mode 100644 index 0000000..9453b90 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/CopyNumberSegmentDataReader.java @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.cbio.portal.pipelines.importer.config.reader; + +import org.mskcc.cbio.model.*; +import org.mskcc.cbio.persistence.jdbc.SampleJdbcDaoImpl; +import org.cbio.portal.pipelines.importer.model.CopyNumberSegmentRecord; +import org.cbio.portal.pipelines.importer.util.*; + +import java.io.*; +import java.util.*; +import org.apache.commons.logging.*; +import com.google.common.base.Strings; +import com.google.common.collect.Sets; +import org.apache.commons.collections.map.MultiKeyMap; + +import org.springframework.batch.item.*; +import org.springframework.batch.item.file.FlatFileItemReader; +import org.springframework.batch.item.file.mapping.*; +import org.springframework.batch.item.file.transform.*; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.cglib.beans.BeanMap; +import org.springframework.core.io.FileSystemResource; + +/** + * + * @author ochoaa + */ +public class CopyNumberSegmentDataReader implements ItemStreamReader { + + @Autowired + SampleJdbcDaoImpl sampleJdbcDaoImpl; + + private final Map copyNumberSegmentStagingDataMap = new CopyNumberSegmentRecord().getCopyNumberSegmentStagingDataMap(); + private final Set samplesSkipped = new HashSet<>(); + + private CancerStudy cancerStudy; + private List copyNumberSegmentDataResults; + + private static final Log LOG = LogFactory.getLog(CopyNumberSegmentDataReader.class); + + @Override + public void open(ExecutionContext executionContext) throws ItemStreamException { + this.cancerStudy = (CancerStudy) executionContext.get("cancerStudy"); + File dataFile = (File) 
executionContext.get("dataFile"); + MultiKeyMap copyNumberSegmentMetadata = (MultiKeyMap) executionContext.get("copyNumberSegmentMetadata"); + String[] header = (String[]) copyNumberSegmentMetadata.get(dataFile.getName(), "header"); + Integer numRecords = (Integer) copyNumberSegmentMetadata.get(dataFile.getName(), "numRecords"); + + // load copy number segment data from datafile + List copyNumberSegmentRecordList = new ArrayList(); + try { + LOG.info("Loading copy number segment data from: " + dataFile); + copyNumberSegmentRecordList = loadCopyNumberSegmentData(dataFile, header, numRecords); + } + catch (Exception ex) { + ex.printStackTrace(); + } + + // log whether data was loaded or not from copy number segment datafile + if (copyNumberSegmentRecordList.isEmpty()) { + LOG.error("Error loading copy number segment data from: " + dataFile.getName()); + } + else { + LOG.info("Loaded " + copyNumberSegmentRecordList.size() + "/" + numRecords + " from: " + dataFile.getName()); + } + // add total samples skipped to execution context for summary statistics after step executes + executionContext.put("samplesSkipped", samplesSkipped.size()); + + this.copyNumberSegmentDataResults = copyNumberSegmentRecordList; + } + + /** + * Loads copy number segment data from the datafile. 
+ * + * @param dataFile + * @return List + */ + private List loadCopyNumberSegmentData(File dataFile, String[] copyNumberSegmentHeader, Integer numRecords) throws Exception { + // init tab-delim tokenizer with the copy number segment file header + DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer(DelimitedLineTokenizer.DELIMITER_TAB); + tokenizer.setNames(copyNumberSegmentHeader); + + // init line mapper for copy number segment file + DefaultLineMapper lineMapper = new DefaultLineMapper(); + lineMapper.setLineTokenizer(tokenizer); + lineMapper.setFieldSetMapper(copyNumberSegmentFieldSetMapper()); + + // set up copy number segment file reader context + FlatFileItemReader copyNumberSegmentDataReader = new FlatFileItemReader(); + copyNumberSegmentDataReader.setResource(new FileSystemResource(dataFile)); + copyNumberSegmentDataReader.setLineMapper(lineMapper); + copyNumberSegmentDataReader.setLinesToSkip(1); + copyNumberSegmentDataReader.open(new ExecutionContext()); + + List copyNumberSegmentRecordList = new ArrayList(); + CopyNumberSegmentRecord record = copyNumberSegmentDataReader.read(); + while (record != null) { + CopyNumberSegmentRecord copyNumberSegmentRecord = screenCopyNumberSegmentRecord(record); + if (copyNumberSegmentRecord != null) { + copyNumberSegmentRecordList.add(copyNumberSegmentRecord); + } + record = copyNumberSegmentDataReader.read(); + } + copyNumberSegmentDataReader.close(); + + return copyNumberSegmentRecordList; + } + + /** + * Field set mapper for copy number segment records. 
+ * + * @return FieldSetMapper + */ + private FieldSetMapper copyNumberSegmentFieldSetMapper() { + return (FieldSetMapper) (FieldSet fs) -> { + BeanMap copyNumberSegmentRecordBeanMap = BeanMap.create(new CopyNumberSegmentRecord()); + Set fieldsetNames = new HashSet(Arrays.asList(fs.getNames())); + + // fill property values for bean using intersection of properties and expected copy number segment columns + Set intersection = Sets.intersection(copyNumberSegmentStagingDataMap.keySet(), fieldsetNames); + intersection.stream().forEach((column) -> { + copyNumberSegmentRecordBeanMap.put(copyNumberSegmentStagingDataMap.get(column), + Strings.isNullOrEmpty(fs.readRawString(column))?null:fs.readRawString(column)); + }); + CopyNumberSegmentRecord record = (CopyNumberSegmentRecord) copyNumberSegmentRecordBeanMap.getBean(); + + return record; + }; + } + + /** + * Performs basic data screening to determine whether copy number segment record is acceptable or not. + * Returns null if copy number segment record does not pass screening + * + * @param copyNumberSegmentRecord + * @return CopyNumberSegmentRecord + */ + private CopyNumberSegmentRecord screenCopyNumberSegmentRecord(CopyNumberSegmentRecord copyNumberSegmentRecord) { + // make sure that sample can be found in the database by stable id, cancer study id + String sampleStableId = DataFileUtils.getSampleStableId(copyNumberSegmentRecord.getId()); + Sample sample = sampleJdbcDaoImpl.getSampleByStudy(sampleStableId, cancerStudy.getCancerStudyId()); + if (sample == null) { + if (samplesSkipped.add(sampleStableId)) { + if (!DataFileUtils.isNormalSample(sampleStableId)) { + LOG.warn("Could not find sample in db: " + copyNumberSegmentRecord.getId()); + } + else { + LOG.warn("Skipping normal sample: " + sampleStableId); + } + } + return null; + } + // update record with sample id and cancer study id + copyNumberSegmentRecord.setCancerStudyId(cancerStudy.getCancerStudyId()); + 
copyNumberSegmentRecord.setSampleId(sample.getInternalId()); + + return copyNumberSegmentRecord; + } + + @Override + public void update(ExecutionContext executionContext) throws ItemStreamException {} + + @Override + public void close() throws ItemStreamException {} + + @Override + public CopyNumberSegmentRecord read() throws Exception, UnexpectedInputException, ParseException, NonTransientResourceException { + if (!copyNumberSegmentDataResults.isEmpty()) { + return copyNumberSegmentDataResults.remove(0); + } + return null; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/FusionDataReader.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/FusionDataReader.java new file mode 100644 index 0000000..1d599d2 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/FusionDataReader.java @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. 
+ * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.cbio.portal.pipelines.importer.config.reader; + +import org.mskcc.cbio.model.*; +import org.mskcc.cbio.persistence.jdbc.*; +import org.cbio.portal.pipelines.importer.util.*; +import org.cbio.portal.pipelines.importer.model.FusionRecord; + +import java.io.*; +import java.util.*; +import org.apache.commons.logging.*; +import com.google.common.base.Strings; +import com.google.common.collect.Sets; +import org.apache.commons.collections.map.MultiKeyMap; + +import org.springframework.batch.item.*; +import org.springframework.batch.item.file.FlatFileItemReader; +import org.springframework.batch.item.file.mapping.*; +import org.springframework.batch.item.file.transform.*; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.cglib.beans.BeanMap; +import org.springframework.core.io.FileSystemResource; + +/** + * + * @author ochoaa + */ +public class FusionDataReader implements ItemStreamReader { + + @Autowired + SampleJdbcDaoImpl sampleJdbcDaoImpl; + + private final Map fusionStagingDataMap = new FusionRecord().getFusionStagingData(); + private final Set samplesSkipped = new HashSet<>(); + + private GeneticProfile geneticProfile; + private List fusionRecordResults; + + private static final Log LOG = LogFactory.getLog(FusionDataReader.class); + + @Override + public void open(ExecutionContext executionContext) throws 
ItemStreamException { + this.geneticProfile = (GeneticProfile) executionContext.get("geneticProfile"); + List dataFileList = (List) executionContext.get("dataFileList"); + MultiKeyMap fusionFileMetadata = (MultiKeyMap) executionContext.get("mutationFileMetadata"); + + // load fusion records from each data file in list + List fusionRecordList = new ArrayList(); + dataFileList.stream().forEach((dataFile) -> { + String[] fusionHeader = (String[]) fusionFileMetadata.get(dataFile.getName(), "header"); + int numRecords = (int) fusionFileMetadata.get(dataFile.getName(), "numRecords"); + + // load fusion records from each datafile + List fusionRecords = new ArrayList(); + try { + LOG.info("Loading fusion data from: " + dataFile.getName()); + fusionRecords = loadFusionRecords(dataFile, fusionHeader); + } + catch (Exception ex) { + ex.printStackTrace(); + } + + if (fusionRecords.isEmpty()) { + LOG.error("Error loading fusion data from: " + dataFile.getName()); + } + else { + LOG.info("Loaded " + fusionRecords.size() + "/" + numRecords + " from: " + dataFile.getName()); + fusionRecordList.addAll(fusionRecords); + } + }); + // add counts to execution context for summary statistics after step executes + executionContext.put("samplesSkipped", samplesSkipped.size()); + executionContext.put("isMutationDatatype", false); + + this.fusionRecordResults = fusionRecordList; + } + + /** + * Loads fusion data from the datafile. 
+ * + * @param dataFile + * @return List + */ + private List loadFusionRecords(File dataFile, String[] fusionHeader) throws Exception { + // init tab-delim tokenizer with the fusion file header + DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer(DelimitedLineTokenizer.DELIMITER_TAB); + tokenizer.setNames(fusionHeader); + + // init line mapper for fusion file + DefaultLineMapper lineMapper = new DefaultLineMapper<>(); + lineMapper.setLineTokenizer(tokenizer); + lineMapper.setFieldSetMapper(fusionFieldSetMapper()); + + // set up fusion file reader context + FlatFileItemReader fusionDataReader = new FlatFileItemReader(); + fusionDataReader.setResource(new FileSystemResource(dataFile)); + fusionDataReader.setLineMapper(lineMapper); + fusionDataReader.setLinesToSkip(1); + fusionDataReader.open(new ExecutionContext()); + + List fusionRecordList = new ArrayList(); + FusionRecord record = fusionDataReader.read(); + while (record != null) { + FusionRecord screenedRecord = screenFusionRecord(record); + if (screenedRecord != null) { + fusionRecordList.add(screenedRecord); + } + record = fusionDataReader.read(); + } + fusionDataReader.close(); + + return fusionRecordList; + } + + /** + * Field set mapper for fusion records. 
+ * + * @return FieldSetMapper + */ + private FieldSetMapper fusionFieldSetMapper() { + return (FieldSetMapper) (FieldSet fs) -> { + BeanMap fusionRecordBeanMap = BeanMap.create(new FusionRecord()); + Set fieldsetNames = new HashSet(Arrays.asList(fs.getNames())); + + // fill property values for bean using intersection of properties and expected fusion columns + Set intersection = Sets.intersection(fusionStagingDataMap.keySet(), fieldsetNames); + intersection.stream().forEach((column) -> { + fusionRecordBeanMap.put(fusionStagingDataMap.get(column), + Strings.isNullOrEmpty(fs.readRawString(column))?null:fs.readRawString(column)); + }); + FusionRecord record = (FusionRecord) fusionRecordBeanMap.getBean(); + + return record; + }; + } + + /** + * Performs basic data screening to determine whether fusion record is acceptable or not. + * Returns null if fusion record does not pass screening + * + * @param fusionRecord + * @return FusionRecord + */ + private FusionRecord screenFusionRecord(FusionRecord fusionRecord) { + // make sure that sample can be found in the database by stable id, cancer study id + String sampleStableId = DataFileUtils.getSampleStableId(fusionRecord.getTumorSampleBarcode()); + Sample sample = sampleJdbcDaoImpl.getSampleByStudy(sampleStableId, geneticProfile.getCancerStudyId()); + if (sample == null) { + if (samplesSkipped.add(sampleStableId)) { + if (!DataFileUtils.isNormalSample(sampleStableId)) { + LOG.warn("Could not find sample in db: " + fusionRecord.getTumorSampleBarcode()); + } + else { + LOG.warn("Skipping normal sample: " + sampleStableId); + } + } + return null; + } + // update sample id for fusion record + fusionRecord.setSampleId(sample.getInternalId()); + fusionRecord.setGeneticProfileId(geneticProfile.getGeneticProfileId()); + + return fusionRecord; + } + + @Override + public void update(ExecutionContext executionContext) throws ItemStreamException {} + + @Override + public void close() throws ItemStreamException {} + + @Override + 
public FusionRecord read() throws Exception, UnexpectedInputException, ParseException, NonTransientResourceException { + if (!fusionRecordResults.isEmpty()) { + return fusionRecordResults.remove(0); + } + return null; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/GisticDataReader.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/GisticDataReader.java new file mode 100644 index 0000000..7642009 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/GisticDataReader.java @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.cbio.portal.pipelines.importer.config.reader; + +import org.mskcc.cbio.model.*; +import org.cbio.portal.pipelines.importer.model.GisticRecord; + +import java.io.*; +import java.util.*; +import org.apache.commons.logging.*; +import com.google.common.base.Strings; +import com.google.common.collect.Sets; +import org.apache.commons.collections.map.MultiKeyMap; + +import org.springframework.batch.item.*; +import org.springframework.batch.item.file.FlatFileItemReader; +import org.springframework.batch.item.file.mapping.*; +import org.springframework.batch.item.file.transform.*; +import org.springframework.cglib.beans.BeanMap; +import org.springframework.core.io.FileSystemResource; + +/** + * + * @author ochoaa + */ +public class GisticDataReader implements ItemStreamReader { + + private final Map gisticStagingDataMap = new GisticRecord().getGisticStagingDataMap(); + + private CancerStudy cancerStudy; + private List gisticRecordResults; + + private static final Log LOG = LogFactory.getLog(GisticDataReader.class); + + @Override + public void open(ExecutionContext executionContext) throws ItemStreamException { + this.cancerStudy = (CancerStudy) executionContext.get("cancerStudy"); + File dataFile = (File) executionContext.get("dataFile"); + MultiKeyMap gisticMetadata = (MultiKeyMap) executionContext.get("gisticMetadata"); + String[] header = (String[]) gisticMetadata.get(dataFile.getName(), "header"); + Integer numRecords = (Integer) gisticMetadata.get(dataFile.getName(), "numRecords"); + + // load gistic data from datafile + List gisticDataList = new ArrayList(); + try { + LOG.info("Loading gistic data from: " + dataFile.getName()); + gisticDataList = loadGisticData(dataFile, header); + } + catch (Exception ex) { + ex.printStackTrace(); + } + + // log whether data was loaded or not from gistic datafile + if 
(gisticDataList.isEmpty()) { + LOG.error("Error loading gistic data from: " + dataFile.getName()); + } + else { + LOG.info("Loaded " + gisticDataList.size() + "/" + numRecords + " from: " + dataFile.getName()); + } + + this.gisticRecordResults = gisticDataList; + } + + /** + * Loads gistic data from the datafile. + * + * @param dataFile + * @param gisticHeader + * @return List + */ + private List loadGisticData(File dataFile, String[] gisticHeader) throws Exception { + // init tab-delim tokenizer with the gistic file header + DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer(DelimitedLineTokenizer.DELIMITER_TAB); + tokenizer.setNames(gisticHeader); + + // init line mapper for gistic file + DefaultLineMapper lineMapper = new DefaultLineMapper(); + lineMapper.setLineTokenizer(tokenizer); + lineMapper.setFieldSetMapper(gisticFieldSetMapper()); + + // set up gistic file reader context + FlatFileItemReader gisticDataReader = new FlatFileItemReader(); + gisticDataReader.setResource(new FileSystemResource(dataFile)); + gisticDataReader.setLineMapper(lineMapper); + gisticDataReader.setLinesToSkip(1); + gisticDataReader.open(new ExecutionContext()); + + List gisticDataList = new ArrayList(); + GisticRecord record = gisticDataReader.read(); + while (record != null) { + // update gistic record with cancer study id + record.setCancerStudyId(cancerStudy.getCancerStudyId()); + gisticDataList.add(record); + + record = gisticDataReader.read(); + } + gisticDataReader.close(); + + return gisticDataList; + } + + /** + * Field set mapper for gistic records. 
+ * + * @return FieldSetMapper + */ + private FieldSetMapper gisticFieldSetMapper() { + return (FieldSetMapper) (FieldSet fs) -> { + BeanMap gisticRecordBeanMap = BeanMap.create(new GisticRecord()); + Set fieldsetNames = new HashSet(Arrays.asList(fs.getNames())); + + // fill property values for bean using intersection of properties and expected gistic columns + Set intersection = Sets.intersection(gisticStagingDataMap.keySet(), fieldsetNames); + intersection.stream().forEach((column) -> { + gisticRecordBeanMap.put(gisticStagingDataMap.get(column), + Strings.isNullOrEmpty(fs.readRawString(column))?null:fs.readRawString(column)); + }); + GisticRecord record = (GisticRecord) gisticRecordBeanMap.getBean(); + + return record; + }; + } + + @Override + public void update(ExecutionContext executionContext) throws ItemStreamException {} + + @Override + public void close() throws ItemStreamException {} + + @Override + public GisticRecord read() throws Exception, UnexpectedInputException, ParseException, NonTransientResourceException { + if (!gisticRecordResults.isEmpty()) { + return gisticRecordResults.remove(0); + } + return null; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/MutSigDataReader.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/MutSigDataReader.java new file mode 100644 index 0000000..5b8276d --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/MutSigDataReader.java @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. 
The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/
+
+package org.cbio.portal.pipelines.importer.config.reader;
+
+import org.mskcc.cbio.model.*;
+import org.cbio.portal.pipelines.importer.model.MutSigRecord;
+
+import java.io.*;
+import java.util.*;
+import org.apache.commons.logging.*;
+import com.google.common.base.Strings;
+import com.google.common.collect.Sets;
+import org.apache.commons.collections.map.MultiKeyMap;
+
+import org.springframework.batch.item.*;
+import org.springframework.batch.item.file.FlatFileItemReader;
+import org.springframework.batch.item.file.mapping.*;
+import org.springframework.batch.item.file.transform.*;
+import org.springframework.cglib.beans.BeanMap;
+import org.springframework.core.io.FileSystemResource;
+
+/**
+ * Item reader that parses the whole mutsig data file when the step opens and then
+ * returns the buffered records one per {@link #read()} call.
+ *
+ * @author ochoaa
+ */
+public class MutSigDataReader implements ItemStreamReader<MutSigRecord> {
+
+    private final Map<String, String> mutSigStagingDataMap = new MutSigRecord().getMutSigStagingDataMap();
+    private CancerStudy cancerStudy;
+    // queue rather than list so that read() can take the head in constant time
+    private Deque<MutSigRecord> mutSigDataResults;
+
+    private static final Log LOG = LogFactory.getLog(MutSigDataReader.class);
+
+    /**
+     * Reads the cancer study, data file and mutsig metadata from the execution context
+     * and buffers every record of the data file.
+     *
+     * A failure while reading is logged and leaves the buffer empty; it is not rethrown.
+     *
+     * @param executionContext step context holding "cancerStudy", "dataFile" and "mutSigMetadata"
+     */
+    @Override
+    public void open(ExecutionContext executionContext) throws ItemStreamException {
+        this.cancerStudy = (CancerStudy) executionContext.get("cancerStudy");
+        File dataFile = (File) executionContext.get("dataFile");
+        MultiKeyMap mutSigMetadata = (MultiKeyMap) executionContext.get("mutSigMetadata");
+        String[] header = (String[]) mutSigMetadata.get(dataFile.getName(), "header");
+        Integer numRecords = (Integer) mutSigMetadata.get(dataFile.getName(), "numRecords");
+
+        // load mutsig data from datafile
+        List<MutSigRecord> mutSigDataList = new ArrayList<>();
+        try {
+            LOG.info("Loading mutsig data from: " + dataFile);
+            mutSigDataList = loadMutSigData(dataFile, header);
+        }
+        catch (Exception ex) {
+            // keep the stack trace in the application log instead of stderr
+            LOG.error("Exception while reading mutsig data file: " + dataFile.getName(), ex);
+        }
+
+        // log whether data was loaded or not from mutsig datafile
+        if (mutSigDataList.isEmpty()) {
+            LOG.error("Error loading mutsig data from: " + dataFile.getName());
+        }
+        else {
+            LOG.info("Loaded " + mutSigDataList.size() + "/" + numRecords + " from: " + dataFile.getName());
+        }
+
+        this.mutSigDataResults = new ArrayDeque<>(mutSigDataList);
+    }
+
+    /**
+     * Loads mutsig data from the datafile.
+     *
+     * The file is read as tab-delimited text with its first line skipped, and every
+     * record is stamped with the id of the current cancer study.
+     *
+     * @param dataFile mutsig data file to read
+     * @param mutSigHeader column names used to label the fields of each line
+     * @return the records read from the file, in file order
+     * @throws Exception if the file cannot be opened or a line cannot be read or mapped
+     */
+    private List<MutSigRecord> loadMutSigData(File dataFile, String[] mutSigHeader) throws Exception {
+        // init tab-delim tokenizer with the mutsig file header
+        DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer(DelimitedLineTokenizer.DELIMITER_TAB);
+        tokenizer.setNames(mutSigHeader);
+
+        // init line mapper for mutsig file
+        DefaultLineMapper<MutSigRecord> lineMapper = new DefaultLineMapper<>();
+        lineMapper.setLineTokenizer(tokenizer);
+        lineMapper.setFieldSetMapper(mutSigFieldSetMapper());
+
+        // set up mutsig file reader context
+        FlatFileItemReader<MutSigRecord> mutSigDataReader = new FlatFileItemReader<>();
+        mutSigDataReader.setResource(new FileSystemResource(dataFile));
+        mutSigDataReader.setLineMapper(lineMapper);
+        mutSigDataReader.setLinesToSkip(1);
+
+        List<MutSigRecord> mutSigDataList = new ArrayList<>();
+        try {
+            mutSigDataReader.open(new ExecutionContext());
+            for (MutSigRecord record = mutSigDataReader.read(); record != null; record = mutSigDataReader.read()) {
+                // update mut sig record cancer study id
+                record.setCancerStudyId(cancerStudy.getCancerStudyId());
+                mutSigDataList.add(record);
+            }
+        }
+        finally {
+            // release the file handle even when a line fails to parse
+            mutSigDataReader.close();
+        }
+
+        return mutSigDataList;
+    }
+
+    /**
+     * Field set mapper for mutsig records.
+     *
+     * Only columns that are present both in the parsed line and in the staging data map
+     * are copied onto the record; empty values are stored as null.
+     *
+     * @return mapper that turns one tokenized line into a MutSigRecord
+     */
+    private FieldSetMapper<MutSigRecord> mutSigFieldSetMapper() {
+        return (FieldSet fs) -> {
+            BeanMap mutSigRecordBeanMap = BeanMap.create(new MutSigRecord());
+            Set<String> fieldsetNames = new HashSet<>(Arrays.asList(fs.getNames()));
+
+            // fill property values for bean using intersection of properties and expected mutsig columns
+            for (String column : Sets.intersection(mutSigStagingDataMap.keySet(), fieldsetNames)) {
+                String rawValue = fs.readRawString(column);
+                mutSigRecordBeanMap.put(mutSigStagingDataMap.get(column),
+                        Strings.isNullOrEmpty(rawValue) ? null : rawValue);
+            }
+
+            return (MutSigRecord) mutSigRecordBeanMap.getBean();
+        };
+    }
+
+    @Override
+    public void update(ExecutionContext executionContext) throws ItemStreamException {}
+
+    @Override
+    public void close() throws ItemStreamException {}
+
+    /**
+     * Returns the next buffered record.
+     *
+     * @return the next record, or null once every buffered record has been returned
+     */
+    @Override
+    public MutSigRecord read() throws Exception, UnexpectedInputException, ParseException, NonTransientResourceException {
+        // poll() yields null on an empty queue, which is the end-of-input signal
+        return mutSigDataResults.poll();
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/MutationDataReader.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/MutationDataReader.java
new file mode 100644
index 0000000..58a6804
--- /dev/null
+++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/MutationDataReader.java
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS
+ * FOR A PARTICULAR PURPOSE.
The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/
+
+package org.cbio.portal.pipelines.importer.config.reader;
+
+import org.mskcc.cbio.model.*;
+import org.mskcc.cbio.persistence.jdbc.*;
+import org.cbio.portal.pipelines.importer.util.*;
+import org.cbio.portal.pipelines.importer.model.MafRecord;
+
+import java.io.*;
+import java.util.*;
+import org.apache.commons.logging.*;
+import com.google.common.base.Strings;
+import com.google.common.collect.Sets;
+import org.apache.commons.collections.map.MultiKeyMap;
+
+import org.springframework.batch.item.*;
+import org.springframework.batch.item.file.FlatFileItemReader;
+import org.springframework.batch.item.file.mapping.*;
+import org.springframework.batch.item.file.transform.*;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.cglib.beans.BeanMap;
+import org.springframework.core.io.FileSystemResource;
+
+/**
+ * Item reader that parses every MAF data file when the step opens, drops records whose
+ * sample is not found for the study, and then returns the buffered records one per
+ * {@link #read()} call.
+ *
+ * @author ochoaa
+ */
+public class MutationDataReader implements ItemStreamReader<MafRecord> {
+
+    @Autowired
+    SampleJdbcDaoImpl sampleJdbcDaoImpl;
+
+    private final Map<String, String> mafStagingDataMap = new MafRecord().getMafStagingDataMap();
+    private final Set<String> samplesSkipped = new HashSet<>();
+
+    private GeneticProfile geneticProfile;
+    // queue rather than list so that read() can take the head in constant time
+    private Deque<MafRecord> mafRecordResults;
+
+    private static final Log LOG = LogFactory.getLog(MutationDataReader.class);
+
+    /**
+     * Reads the genetic profile, data file list and MAF metadata from the execution
+     * context, buffers the screened records of every data file, and publishes the
+     * "samplesSkipped" count and the "isMutationDatatype" flag back to the context.
+     *
+     * A failure while reading one file is logged and that file contributes no records;
+     * the remaining files are still processed.
+     *
+     * @param executionContext step context holding "geneticProfile", "dataFileList" and "mutationFileMetadata"
+     */
+    @Override
+    public void open(ExecutionContext executionContext) throws ItemStreamException {
+        this.geneticProfile = (GeneticProfile) executionContext.get("geneticProfile");
+        List<File> dataFileList = (List<File>) executionContext.get("dataFileList");
+        MultiKeyMap mafFileMetadata = (MultiKeyMap) executionContext.get("mutationFileMetadata");
+
+        List<MafRecord> mafRecordList = new ArrayList<>();
+        for (File dataFile : dataFileList) {
+            String[] mafHeader = (String[]) mafFileMetadata.get(dataFile.getName(), "header");
+            // boxed on purpose: a missing metadata entry must not fail on unboxing, it is only logged
+            Integer numRecords = (Integer) mafFileMetadata.get(dataFile.getName(), "numRecords");
+
+            // load MAF records from each datafile
+            List<MafRecord> mafRecords = new ArrayList<>();
+            try {
+                LOG.info("Loading mutation data from: " + dataFile.getName());
+                mafRecords = loadMafRecords(dataFile, mafHeader);
+            }
+            catch (Exception ex) {
+                // keep the stack trace in the application log instead of stderr
+                LOG.error("Exception while reading mutation data file: " + dataFile.getName(), ex);
+            }
+
+            if (mafRecords.isEmpty()) {
+                LOG.error("Error loading mutation data from: " + dataFile.getName());
+            }
+            else {
+                LOG.info("Loaded " + mafRecords.size() + "/" + numRecords + " from: " + dataFile.getName());
+                mafRecordList.addAll(mafRecords);
+            }
+        }
+        // add counts to execution context for summary statistics after step executes
+        executionContext.put("samplesSkipped", samplesSkipped.size());
+        executionContext.put("isMutationDatatype", true);
+
+        this.mafRecordResults = new ArrayDeque<>(mafRecordList);
+    }
+
+    /**
+     * Loads MAF data from the datafile.
+     *
+     * The file is read as tab-delimited text with its first line skipped; records that
+     * do not pass {@link #screenMafRecord(MafRecord)} are left out.
+     *
+     * @param dataFile MAF data file to read
+     * @param mafHeader column names used to label the fields of each line
+     * @return the screened records, in file order
+     * @throws Exception if the file cannot be opened or a line cannot be read or mapped
+     */
+    private List<MafRecord> loadMafRecords(File dataFile, String[] mafHeader) throws Exception {
+        // init tab-delim tokenizer with the maf file header
+        DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer(DelimitedLineTokenizer.DELIMITER_TAB);
+        tokenizer.setNames(mafHeader);
+
+        // init line mapper for maf file
+        DefaultLineMapper<MafRecord> lineMapper = new DefaultLineMapper<>();
+        lineMapper.setLineTokenizer(tokenizer);
+        lineMapper.setFieldSetMapper(mutationFieldSetMapper());
+
+        // set up maf file reader context
+        FlatFileItemReader<MafRecord> mafDataReader = new FlatFileItemReader<>();
+        mafDataReader.setResource(new FileSystemResource(dataFile));
+        mafDataReader.setLineMapper(lineMapper);
+        mafDataReader.setLinesToSkip(1);
+
+        List<MafRecord> mafRecordList = new ArrayList<>();
+        try {
+            mafDataReader.open(new ExecutionContext());
+            for (MafRecord record = mafDataReader.read(); record != null; record = mafDataReader.read()) {
+                MafRecord screenedRecord = screenMafRecord(record);
+                if (screenedRecord != null) {
+                    mafRecordList.add(screenedRecord);
+                }
+            }
+        }
+        finally {
+            // release the file handle even when a line fails to parse
+            mafDataReader.close();
+        }
+
+        return mafRecordList;
+    }
+
+    /**
+     * Field set mapper for MAF records.
+     *
+     * Only columns that are present both in the parsed line and in the staging data map
+     * are copied onto the record; empty values are stored as null.
+     *
+     * @return mapper that turns one tokenized line into a MafRecord
+     */
+    private FieldSetMapper<MafRecord> mutationFieldSetMapper() {
+        return (FieldSet fs) -> {
+            BeanMap mafRecordBeanMap = BeanMap.create(new MafRecord());
+            Set<String> fieldsetNames = new HashSet<>(Arrays.asList(fs.getNames()));
+
+            // fill property values for bean using intersection of properties and expected maf columns
+            for (String column : Sets.intersection(mafStagingDataMap.keySet(), fieldsetNames)) {
+                String rawValue = fs.readRawString(column);
+                mafRecordBeanMap.put(mafStagingDataMap.get(column),
+                        Strings.isNullOrEmpty(rawValue) ? null : rawValue);
+            }
+
+            return (MafRecord) mafRecordBeanMap.getBean();
+        };
+    }
+
+    /**
+     * Performs basic data screening to determine whether MAF record is acceptable or not.
+     * Returns null if MAF record does not pass screening
+     *
+     * @param mafRecord record as parsed from the file
+     * @return the same record updated with sample and genetic profile ids, or null when
+     *         its sample cannot be found for the study
+     */
+    private MafRecord screenMafRecord(MafRecord mafRecord) {
+        // make sure that sample can be found in the database by stable id, cancer study id
+        String sampleStableId = DataFileUtils.getSampleStableId(mafRecord.getTumorSampleBarcode());
+        Sample sample = sampleJdbcDaoImpl.getSampleByStudy(sampleStableId, geneticProfile.getCancerStudyId());
+        if (sample == null) {
+            // warn only the first time a given sample is skipped
+            if (samplesSkipped.add(sampleStableId)) {
+                if (DataFileUtils.isNormalSample(sampleStableId)) {
+                    LOG.warn("Skipping normal sample: " + sampleStableId);
+                }
+                else {
+                    LOG.warn("Could not find sample in db: " + mafRecord.getTumorSampleBarcode());
+                }
+            }
+            return null;
+        }
+        // update sample id for MAF record since all data screens have passed
+        mafRecord.setTumorSampleBarcode(sampleStableId);
+        mafRecord.setSampleId(sample.getInternalId());
+        mafRecord.setGeneticProfileId(geneticProfile.getGeneticProfileId());
+
+        return mafRecord;
+    }
+
+    @Override
+    public void update(ExecutionContext executionContext) throws ItemStreamException {}
+
+    @Override
+    public void close() throws ItemStreamException {}
+
+    /**
+     * Returns the next buffered record.
+     *
+     * @return the next record, or null once every buffered record has been returned
+     */
+    @Override
+    public MafRecord read() throws Exception, UnexpectedInputException, ParseException, NonTransientResourceException {
+        // poll() yields null on an empty queue, which is the end-of-input signal
+        return mafRecordResults.poll();
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/ProfileDataReader.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/ProfileDataReader.java
new file mode 100644
index 0000000..6c5441b
--- /dev/null
+++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/ProfileDataReader.java
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS
+ * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder
+ * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no
+ * obligations to provide maintenance, support, updates, enhancements or
+ * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be
+ * liable to any party for direct, indirect, special, incidental or
+ * consequential damages, including lost profits, arising out of the use of this
+ * software and its documentation, even if Memorial Sloan-Kettering Cancer
+ * Center has been advised of the possibility of such damage.
+ */
+
+/*
+ * This file is part of cBioPortal.
+ *
+ * cBioPortal is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package org.cbio.portal.pipelines.importer.config.reader;
+
+import org.mskcc.cbio.model.*;
+import org.cbio.portal.pipelines.importer.model.ProfileDataRecord;
+
+import java.io.*;
+import java.util.*;
+import com.google.common.base.Strings;
+import org.apache.commons.logging.*;
+import org.apache.commons.collections.map.MultiKeyMap;
+
+import org.springframework.batch.item.*;
+import org.springframework.batch.item.file.FlatFileItemReader;
+import org.springframework.batch.item.file.mapping.*;
+import org.springframework.batch.item.file.transform.*;
+import org.springframework.cglib.beans.BeanMap;
+import org.springframework.core.io.FileSystemResource;
+
+/**
+ * Item reader that parses every profile data file when the step opens and then returns
+ * the buffered records one per {@link #read()} call.
+ *
+ * @author ochoaa
+ */
+public class ProfileDataReader implements ItemStreamReader<ProfileDataRecord> {
+
+    private final Map<String, String> profileNonCaseIdsMap = new ProfileDataRecord().getNonCaseIdsMap();
+
+    private GeneticProfile geneticProfile;
+    private int samplesSkipped;
+    private boolean isRppaProfile;
+    private boolean isCnaData;
+    // queue rather than list so that read() can take the head in constant time
+    private Deque<ProfileDataRecord> compositeProfileDataResults;
+
+    private final Set<String> caseIdSet = new LinkedHashSet<>();
+
+    private static final Log LOG = LogFactory.getLog(ProfileDataReader.class);
+
+    /**
+     * Reads the genetic profile, profile flags, data file list and profile metadata from
+     * the execution context, buffers the records of every data file, and publishes
+     * "samplesSkipped" and "caseList" back to the context.
+     *
+     * A failure while reading one file is logged and that file contributes neither
+     * records nor case ids; the remaining files are still processed.
+     *
+     * @param executionContext step context holding "geneticProfile", "isRppaProfile",
+     *        "isCnaData", "dataFileList" and "profileMetadata"
+     */
+    @Override
+    public void open(ExecutionContext executionContext) throws ItemStreamException {
+        this.geneticProfile = (GeneticProfile) executionContext.get("geneticProfile");
+        this.isRppaProfile = (boolean) executionContext.get("isRppaProfile");
+        this.isCnaData = (boolean) executionContext.get("isCnaData");
+        List<File> dataFileList = (List<File>) executionContext.get("dataFileList");
+        MultiKeyMap profileMetadata = (MultiKeyMap) executionContext.get("profileMetadata");
+
+        List<ProfileDataRecord> compositeProfileDataList = new ArrayList<>();
+        for (File dataFile : dataFileList) {
+            // get profile metadata for current datafile
+            String[] header = (String[]) profileMetadata.get(dataFile.getName(), "header");
+            Set<String> nonCaseIds = (Set<String>) profileMetadata.get(dataFile.getName(), "nonCaseIds");
+            HashMap<String, String> caseIdsMap = (HashMap<String, String>) profileMetadata.get(dataFile.getName(), "caseIdsMap");
+            Set<String> normalCaseIds = (Set<String>) profileMetadata.get(dataFile.getName(), "normalCaseIds");
+
+            List<ProfileDataRecord> compositeProfileData = new ArrayList<>();
+            try {
+                LOG.info("Loading profile data from: " + dataFile.getName());
+                compositeProfileData = loadDataFileProfileData(dataFile, header, nonCaseIds, caseIdsMap);
+            }
+            catch (Exception ex) {
+                // keep the stack trace in the application log instead of stderr
+                LOG.error("Exception while reading profile data file: " + dataFile.getName(), ex);
+            }
+
+            if (compositeProfileData.isEmpty()) {
+                LOG.error("Error loading profile data from: " + dataFile.getName());
+            }
+            else {
+                // increment total sample count and number of normal samples skipped
+                this.samplesSkipped += normalCaseIds.size();
+
+                LOG.info("Loaded " + compositeProfileData.size() + " records from: " + dataFile.getName());
+                compositeProfileDataList.addAll(compositeProfileData);
+
+                // add sample stable ids to case id set
+                caseIdSet.addAll(caseIdsMap.values());
+            }
+        }
+        // add samples skipped and case list to execution context for listener
+        executionContext.put("samplesSkipped", samplesSkipped);
+        executionContext.put("caseList", caseIdSet);
+
+        this.compositeProfileDataResults = new ArrayDeque<>(compositeProfileDataList);
+    }
+
+    /**
+     * Loads profile data from the datafile.
+     *
+     * The file is read as tab-delimited text with its first line skipped; every record
+     * is stamped with the rppa flag, the cna flag and the genetic profile id.
+     *
+     * @param dataFile profile data file to read
+     * @param header column names used to label the fields of each line
+     * @param nonCaseIds names of the columns that do not hold per-case values
+     * @param caseIdsMap per-case column name mapped to the id stored for that case
+     * @return the records read from the file, in file order
+     * @throws Exception if the file cannot be opened or a line cannot be read or mapped
+     */
+    private List<ProfileDataRecord> loadDataFileProfileData(File dataFile, String[] header, Set<String> nonCaseIds, HashMap<String, String> caseIdsMap) throws Exception {
+        // init tab-delim tokenizer with the profile datafile header
+        DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer(DelimitedLineTokenizer.DELIMITER_TAB);
+        tokenizer.setNames(header);
+
+        // init line mapper for profile datafile
+        DefaultLineMapper<ProfileDataRecord> lineMapper = new DefaultLineMapper<>();
+        lineMapper.setLineTokenizer(tokenizer);
+        lineMapper.setFieldSetMapper(profileFieldSetMapper(nonCaseIds, caseIdsMap));
+
+        // set up profile datafile reader context
+        FlatFileItemReader<ProfileDataRecord> profileDataReader = new FlatFileItemReader<>();
+        profileDataReader.setResource(new FileSystemResource(dataFile));
+        profileDataReader.setLineMapper(lineMapper);
+        profileDataReader.setLinesToSkip(1);
+
+        List<ProfileDataRecord> compositeProfileDataList = new ArrayList<>();
+        try {
+            profileDataReader.open(new ExecutionContext());
+            for (ProfileDataRecord record = profileDataReader.read(); record != null; record = profileDataReader.read()) {
+                // update record with rppa profile, cna data status, and genetic profile id
+                record.setRppaProfile(isRppaProfile);
+                record.setCnaData(isCnaData);
+                record.setGeneticProfileId(geneticProfile.getGeneticProfileId());
+                compositeProfileDataList.add(record);
+            }
+        }
+        finally {
+            // release the file handle even when a line fails to parse
+            profileDataReader.close();
+        }
+
+        return compositeProfileDataList;
+    }
+
+    /**
+     * Field set mapper for profile data records.
+     *
+     * Per-case columns are collected into an ordered map keyed by the mapped case id;
+     * non-case columns known to the record model are copied onto the record, with
+     * empty values stored as null.
+     *
+     * @param nonCaseIds names of the columns that do not hold per-case values
+     * @param caseIdsMap per-case column name mapped to the id stored for that case
+     * @return mapper that turns one tokenized line into a ProfileDataRecord
+     */
+    private FieldSetMapper<ProfileDataRecord> profileFieldSetMapper(Set<String> nonCaseIds, Map<String, String> caseIdsMap) {
+        return (FieldSet fs) -> {
+
+            // generate linked hashmap of case data
+            Map<String, String> caseProfileRecordData = new LinkedHashMap<>();
+            for (Map.Entry<String, String> caseColumn : caseIdsMap.entrySet()) {
+                caseProfileRecordData.put(caseColumn.getValue(), fs.readRawString(caseColumn.getKey()));
+            }
+
+            // fill in values for non case id columns
+            BeanMap profileRecordBeanMap = BeanMap.create(new ProfileDataRecord());
+            for (String nonCaseIdCol : nonCaseIds) {
+                // Locale.ROOT keeps header matching independent of the JVM default locale
+                String modelKey = nonCaseIdCol.toUpperCase(Locale.ROOT);
+                // ignore fields that are not in the profile record model
+                if (!profileNonCaseIdsMap.containsKey(modelKey)) {
+                    continue;
+                }
+                String rawValue = fs.readRawString(nonCaseIdCol);
+                profileRecordBeanMap.put(profileNonCaseIdsMap.get(modelKey),
+                        Strings.isNullOrEmpty(rawValue) ? null : rawValue);
+            }
+            ProfileDataRecord record = (ProfileDataRecord) profileRecordBeanMap.getBean();
+            record.setCaseProfileDataMap(caseProfileRecordData);
+
+            return record;
+        };
+    }
+
+    @Override
+    public void update(ExecutionContext executionContext) throws ItemStreamException {}
+
+    @Override
+    public void close() throws ItemStreamException {}
+
+    /**
+     * Returns the next buffered record.
+     *
+     * @return the next record, or null once every buffered record has been returned
+     */
+    @Override
+    public ProfileDataRecord read() throws Exception, UnexpectedInputException, ParseException, NonTransientResourceException {
+        // poll() yields null on an empty queue, which is the end-of-input signal
+        return compositeProfileDataResults.poll();
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/StructuralVariantDataReader.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/StructuralVariantDataReader.java
new file mode 100644
index 0000000..9cdce01
--- /dev/null
+++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/StructuralVariantDataReader.java
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2016 Memorial
Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/
+
+package org.cbio.portal.pipelines.importer.config.reader;
+
+import org.mskcc.cbio.model.*;
+import org.mskcc.cbio.persistence.jdbc.*;
+import org.cbio.portal.pipelines.importer.util.*;
+import org.cbio.portal.pipelines.importer.model.StructuralVariantRecord;
+
+import java.io.*;
+import java.util.*;
+import org.apache.commons.logging.*;
+import com.google.common.base.Strings;
+import com.google.common.collect.Sets;
+import org.apache.commons.collections.map.MultiKeyMap;
+
+import org.springframework.batch.item.*;
+import org.springframework.batch.item.file.FlatFileItemReader;
+import org.springframework.batch.item.file.mapping.*;
+import org.springframework.batch.item.file.transform.*;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.cglib.beans.BeanMap;
+import org.springframework.core.io.FileSystemResource;
+
+/**
+ * Item reader that parses the whole structural variant data file when the step opens,
+ * drops records whose sample is not found for the study, and then returns the buffered
+ * records one per {@link #read()} call.
+ *
+ * @author ochoaa
+ */
+public class StructuralVariantDataReader implements ItemStreamReader<StructuralVariantRecord> {
+
+    @Autowired
+    SampleJdbcDaoImpl sampleJdbcDaoImpl;
+
+    private final Map<String, String> structuralVariantStagingDataMap = new StructuralVariantRecord().getStructuralVariantStagingDataMap();
+    private final Set<String> samplesSkipped = new HashSet<>();
+
+    private GeneticProfile geneticProfile;
+    // queue rather than list so that read() can take the head in constant time
+    private Deque<StructuralVariantRecord> structuralVariantRecordResults;
+
+    private static final Log LOG = LogFactory.getLog(StructuralVariantDataReader.class);
+
+    /**
+     * Reads the genetic profile, data file and structural variant metadata from the
+     * execution context, buffers the screened records of the data file, and publishes
+     * the "samplesSkipped" count back to the context.
+     *
+     * A failure while reading is logged and leaves the buffer empty; it is not rethrown.
+     *
+     * @param executionContext step context holding "geneticProfile", "dataFile" and "structuralVariantMetadata"
+     */
+    @Override
+    public void open(ExecutionContext executionContext) throws ItemStreamException {
+        this.geneticProfile = (GeneticProfile) executionContext.get("geneticProfile");
+        File dataFile = (File) executionContext.get("dataFile");
+        MultiKeyMap structuralVariantMetadata = (MultiKeyMap) executionContext.get("structuralVariantMetadata");
+        String[] header = (String[]) structuralVariantMetadata.get(dataFile.getName(), "header");
+        Integer numRecords = (Integer) structuralVariantMetadata.get(dataFile.getName(), "numRecords");
+
+        // load structural variant data from datafile
+        List<StructuralVariantRecord> structuralVariantDataList = new ArrayList<>();
+        try {
+            LOG.info("Loading structural variant data from: " + dataFile.getName());
+            structuralVariantDataList = loadStructuralVariantData(dataFile, header);
+        }
+        catch (Exception ex) {
+            // keep the stack trace in the application log instead of stderr
+            LOG.error("Exception while reading structural variant data file: " + dataFile.getName(), ex);
+        }
+
+        // log whether data was loaded or not from structural variant datafile
+        if (structuralVariantDataList.isEmpty()) {
+            LOG.error("Error loading structural variant data from: " + dataFile.getName());
+        }
+        else {
+            LOG.info("Loaded " + structuralVariantDataList.size() + "/" + numRecords + " records from: " + dataFile.getName());
+        }
+        // add counts to execution context for summary statistics after step executes
+        executionContext.put("samplesSkipped", samplesSkipped.size());
+
+        this.structuralVariantRecordResults = new ArrayDeque<>(structuralVariantDataList);
+    }
+
+    /**
+     * Loads structural variant data from the datafile.
+     *
+     * The file is read as tab-delimited text with its first line skipped; records that do
+     * not pass {@link #screenStructuralVariantRecord(StructuralVariantRecord)} are left out.
+     *
+     * @param dataFile structural variant data file to read
+     * @param structuralVariantHeader column names used to label the fields of each line
+     * @return the screened records, in file order
+     * @throws Exception if the file cannot be opened or a line cannot be read or mapped
+     */
+    private List<StructuralVariantRecord> loadStructuralVariantData(File dataFile, String[] structuralVariantHeader) throws Exception {
+        // init tab-delim tokenizer with the structural variant file header
+        DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer(DelimitedLineTokenizer.DELIMITER_TAB);
+        tokenizer.setNames(structuralVariantHeader);
+
+        // init line mapper for structural variant file
+        DefaultLineMapper<StructuralVariantRecord> lineMapper = new DefaultLineMapper<>();
+        lineMapper.setLineTokenizer(tokenizer);
+        lineMapper.setFieldSetMapper(structuralVariantFieldSetMapper());
+
+        // set up structural variant file reader context
+        FlatFileItemReader<StructuralVariantRecord> structuralVariantDataReader = new FlatFileItemReader<>();
+        structuralVariantDataReader.setResource(new FileSystemResource(dataFile));
+        structuralVariantDataReader.setLineMapper(lineMapper);
+        structuralVariantDataReader.setLinesToSkip(1);
+
+        List<StructuralVariantRecord> structuralVariantDataList = new ArrayList<>();
+        try {
+            structuralVariantDataReader.open(new ExecutionContext());
+            for (StructuralVariantRecord record = structuralVariantDataReader.read(); record != null;
+                    record = structuralVariantDataReader.read()) {
+                StructuralVariantRecord structuralVariant = screenStructuralVariantRecord(record);
+                if (structuralVariant != null) {
+                    structuralVariantDataList.add(structuralVariant);
+                }
+            }
+        }
+        finally {
+            // release the file handle even when a line fails to parse
+            structuralVariantDataReader.close();
+        }
+
+        return structuralVariantDataList;
+    }
+
+    /**
+     * Field set mapper for structural variant records.
+     *
+     * Only columns that are present both in the parsed line and in the staging data map
+     * are copied onto the record; empty values are stored as null.
+     *
+     * @return mapper that turns one tokenized line into a StructuralVariantRecord
+     */
+    private FieldSetMapper<StructuralVariantRecord> structuralVariantFieldSetMapper() {
+        return (FieldSet fs) -> {
+            BeanMap structuralVariantRecordBeanMap = BeanMap.create(new StructuralVariantRecord());
+            Set<String> fieldsetNames = new HashSet<>(Arrays.asList(fs.getNames()));
+
+            // fill property values for bean using intersection of properties and expected structural variant columns
+            for (String column : Sets.intersection(structuralVariantStagingDataMap.keySet(), fieldsetNames)) {
+                String rawValue = fs.readRawString(column);
+                structuralVariantRecordBeanMap.put(structuralVariantStagingDataMap.get(column),
+                        Strings.isNullOrEmpty(rawValue) ? null : rawValue);
+            }
+
+            return (StructuralVariantRecord) structuralVariantRecordBeanMap.getBean();
+        };
+    }
+
+    /**
+     * Performs basic data screening to determine whether structural variant record is acceptable or not.
+     * Returns null if structural variant record does not pass screening
+     *
+     * @param structuralVariantRecord record as parsed from the file
+     * @return the same record updated with sample and genetic profile ids, or null when
+     *         its sample cannot be found for the study
+     */
+    private StructuralVariantRecord screenStructuralVariantRecord(StructuralVariantRecord structuralVariantRecord) {
+        // make sure that sample can be found in the database by stable id, cancer study id
+        String sampleStableId = DataFileUtils.getSampleStableId(structuralVariantRecord.getSampleId());
+        Sample sample = sampleJdbcDaoImpl.getSampleByStudy(sampleStableId, geneticProfile.getCancerStudyId());
+        if (sample == null) {
+            // warn only the first time a given sample is skipped
+            if (samplesSkipped.add(sampleStableId)) {
+                if (DataFileUtils.isNormalSample(sampleStableId)) {
+                    LOG.warn("Skipping normal sample: " + sampleStableId);
+                }
+                else {
+                    LOG.warn("Could not find sample in db: " + structuralVariantRecord.getSampleId());
+                }
+            }
+            return null;
+        }
+        // update structural variant record with sample data and genetic profile id
+        structuralVariantRecord.setSampleId(sampleStableId);
+        structuralVariantRecord.setSampleInternalId(sample.getInternalId());
+        structuralVariantRecord.setGeneticProfileId(geneticProfile.getGeneticProfileId());
+
+        return structuralVariantRecord;
+    }
+
+    @Override
+    public void update(ExecutionContext executionContext) throws ItemStreamException {}
+
+    @Override
+    public void close() throws ItemStreamException {}
+
+    /**
+     * Returns the next buffered record.
+     *
+     * @return the next record, or null once every buffered record has been returned
+     */
+    @Override
+    public StructuralVariantRecord read() throws Exception, UnexpectedInputException, ParseException, NonTransientResourceException {
+        // poll() yields null on an empty queue, which is the end-of-input signal
+        return structuralVariantRecordResults.poll();
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/TimelineDataReader.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/TimelineDataReader.java
new file mode 100644
index 0000000..af9ce52
--- /dev/null
+++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/reader/TimelineDataReader.java
@@
-0,0 +1,215 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +package org.cbio.portal.pipelines.importer.config.reader; + +import org.mskcc.cbio.model.*; +import org.mskcc.cbio.persistence.jdbc.*; +import org.cbio.portal.pipelines.importer.util.*; +import org.cbio.portal.pipelines.importer.model.TimelineRecord; + +import java.io.*; +import java.util.*; +import org.apache.commons.logging.*; +import com.google.common.base.Strings; +import com.google.common.collect.Sets; +import org.apache.commons.collections.map.MultiKeyMap; + +import org.springframework.batch.item.*; +import org.springframework.batch.item.file.FlatFileItemReader; +import org.springframework.batch.item.file.mapping.*; +import org.springframework.batch.item.file.transform.*; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.cglib.beans.BeanMap; +import org.springframework.core.io.FileSystemResource; + +/** + * + * @author ochoaa + */ +public class TimelineDataReader implements ItemStreamReader { + + @Autowired + PatientJdbcDaoImpl patientJdbcDaoImpl; + + private final Map timelineStagingDataMap = new TimelineRecord().getTimelineStagingDataMap(); + private final Set patientsSkipped = new HashSet<>(); + + private CancerStudy cancerStudy; + private List timelineRecordResults; + + private static final Log LOG = LogFactory.getLog(TimelineDataReader.class); + + @Override + public void open(ExecutionContext executionContext) throws ItemStreamException { + this.cancerStudy = (CancerStudy) executionContext.get("cancerStudy"); + List dataFileList = (List) executionContext.get("dataFileList"); + MultiKeyMap timelineMetadata = (MultiKeyMap) executionContext.get("timelineMetadata"); + + // load timeline records from each datafile + List timelineRecordList = new ArrayList(); + dataFileList.stream().forEach((dataFile) -> { + String[] timelineHeader = (String[]) timelineMetadata.get(dataFile.getName(), "header"); + List timelineRecords = new ArrayList(); + try { + LOG.info("Loading timeline data from: " + dataFile.getName()); + 
timelineRecords = loadTimelineRecords(dataFile, timelineHeader); + } + catch (Exception ex) { + ex.printStackTrace(); + } + + if (timelineRecords.isEmpty()) { + LOG.error("Error loading timeline data from: " + dataFile.getName()); + } + else { + LOG.info("Timeline records loaded from: " + dataFile.getName() + ": " + timelineRecords.size()); + timelineRecordList.addAll(timelineRecords); + } + }); + // add total patients skipped to execution context for summary statistics after step executes + executionContext.put("patientsSkipped", patientsSkipped.size()); + + this.timelineRecordResults = timelineRecordList; + } + + /** + * Loads timeline data from the datafile. + * + * @param dataFile + * @return List + */ + private List loadTimelineRecords(File dataFile, String[] timelineHeader) throws Exception { + // init tab-delim tokenizer with the timeline file header + DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer(DelimitedLineTokenizer.DELIMITER_TAB); + tokenizer.setNames(timelineHeader); + + // init line mapper for timeline file + DefaultLineMapper lineMapper = new DefaultLineMapper<>(); + lineMapper.setLineTokenizer(tokenizer); + lineMapper.setFieldSetMapper(timelineFieldSetMapper()); + + // set up timeline file reader context + FlatFileItemReader timelineDataReader = new FlatFileItemReader(); + timelineDataReader.setResource(new FileSystemResource(dataFile)); + timelineDataReader.setLineMapper(lineMapper); + timelineDataReader.setLinesToSkip(1); + timelineDataReader.open(new ExecutionContext()); + + List timelineRecordList = new ArrayList(); + TimelineRecord record = timelineDataReader.read(); + while (record != null) { + TimelineRecord screenedRecord = screenTimelineRecord(record); + if (screenedRecord != null) { + timelineRecordList.add(record); + } + record = timelineDataReader.read(); + } + timelineDataReader.close(); + + return timelineRecordList; + } + + /** + * Field set mapper for timeline records. 
+ * + * @return FieldSetMapper + */ + private FieldSetMapper timelineFieldSetMapper() { + return (FieldSetMapper) (FieldSet fs) -> { + Set fieldsetNames = new HashSet(Arrays.asList(fs.getNames())); + Set intersection = Sets.intersection(timelineStagingDataMap.keySet(), fieldsetNames); + + // generate hashmap of clinical event data using the set difference + // between properties and expected timeline staging file columns + Set clinicalEventDataColumns = Sets.difference(fieldsetNames, intersection); + Map clinicalEventDataMap = new HashMap<>(); + clinicalEventDataColumns.stream().forEach((dataColumn) -> { + String value = fs.readString(dataColumn); + if (!DataFileUtils.isNullOrEmptyValue(value)) { + clinicalEventDataMap.put(dataColumn, value); + } + }); + + BeanMap timelineRecordBeanMap = BeanMap.create(new TimelineRecord()); + // fill property values for bean using the set intersection of properties + // and expected timeline staging file columns + intersection.stream().forEach((column) -> { + timelineRecordBeanMap.put(timelineStagingDataMap.get(column), + Strings.isNullOrEmpty(fs.readRawString(column))?null:fs.readRawString(column)); + }); + TimelineRecord record = (TimelineRecord) timelineRecordBeanMap.getBean(); + record.setClinicalEventDataMap(clinicalEventDataMap); + + return record; + }; + } + + /** + * Performs basic data screening to determine whether timeline record is acceptable or not. 
+ * Returns null if timeline record does not pass screening + * + * @param timelineRecord + * @return TimelineRecord + */ + private TimelineRecord screenTimelineRecord(TimelineRecord timelineRecord) { + // make sure that patient can be found in the database by stable id, cancer study id + String patientStableId = DataFileUtils.getPatientStableId(timelineRecord.getPatientId()); + Patient patient = patientJdbcDaoImpl.getPatient(patientStableId, cancerStudy.getCancerStudyId()); + if (patient == null) { + if (patientsSkipped.add(patientStableId)) { + LOG.warn("Could not find patient in db: " + timelineRecord.getPatientId()); + } + return null; + } + // update patient stable id and patient internal id for timeline record + timelineRecord.setPatientId(patientStableId); + timelineRecord.setPatientInternalId(patient.getInternalId()); + + return timelineRecord; + } + + @Override + public void update(ExecutionContext executionContext) throws ItemStreamException {} + + @Override + public void close() throws ItemStreamException {} + + @Override + public TimelineRecord read() throws Exception, UnexpectedInputException, ParseException, NonTransientResourceException { + if (!timelineRecordResults.isEmpty()) { + return timelineRecordResults.remove(0); + } + return null; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/CancerStudyStep.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/CancerStudyStep.java new file mode 100644 index 0000000..ad1aa01 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/CancerStudyStep.java @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. 
The software and documentation provided hereunder
+ * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no
+ * obligations to provide maintenance, support, updates, enhancements or
+ * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be
+ * liable to any party for direct, indirect, special, incidental or
+ * consequential damages, including lost profits, arising out of the use of this
+ * software and its documentation, even if Memorial Sloan-Kettering Cancer
+ * Center has been advised of the possibility of such damage.
+ */
+
+/*
+ * This file is part of cBioPortal.
+ *
+ * cBioPortal is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package org.cbio.portal.pipelines.importer.config.step;
+
+import org.cbio.portal.pipelines.importer.config.tasklet.*;
+import org.cbio.portal.pipelines.importer.config.listener.CancerStudyListener;
+
+import javax.annotation.Resource;
+import org.springframework.batch.core.*;
+import org.springframework.batch.core.configuration.annotation.*;
+import org.springframework.batch.core.job.builder.FlowBuilder;
+import org.springframework.batch.core.job.flow.*;
+import org.springframework.batch.core.step.tasklet.Tasklet;
+
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.context.annotation.*;
+
+/**
+ * Configuration for importing a cancer study.
+ * + * @author ochoaa + */ +@Configuration +@EnableBatchProcessing +public class CancerStudyStep { + + @Autowired + public JobBuilderFactory jobBuilderFactory; + + @Autowired + public StepBuilderFactory stepBuilderFactory; + + @Resource(name="importClinicalData") + public Step importClinicalData; + + @Resource(name="importTimelineData") + public Step importTimelineData; + + @Resource(name="importMutSigData") + public Step importMutSigData; + + @Resource(name="importCopyNumberSegData") + public Step importCopyNumberSegData; + + @Resource(name="importGisticGenesData") + public Step importGisticGenesData; + + @Resource(name="importProteinLevelData") + public Step importProteinLevelData; + + @Resource(name="importCnaData") + public Step importCnaData; + + @Resource(name="importGeneExpressionData") + public Step importGeneExpressionData; + + @Resource(name="importMethylationData") + public Step importMethylationData; + + @Resource(name="importMutationData") + public Step importMutationData; + + @Resource(name="importFusionData") + public Step importFusionData; + + @Resource(name="importStructuralVariantData") + public Step importStructuralVariantData; + + @Resource(name="importCaseLists") + public Step importCaseLists; + + /** + * Step for initiating cancer study import. + * + * @return Step + */ + @Bean + public Step importCancerStudy() { + return stepBuilderFactory.get("importCancerStudy") + .flow(importCancerStudyFlow()) + .build(); + } + + @Bean + public Step deleteCancerStudy() { + return stepBuilderFactory.get("deleteCancerStudy") + .tasklet(deleteCancerStudyTasklet()) + .build(); + } + + /** + * Flow for importing a cancer study. 
+ * + * @return Flow + */ + @Bean + public Flow importCancerStudyFlow() { + return new FlowBuilder("importCancerStudyFlow") + .start(importCancerStudyStep()) + .next(importCancerStudyDataDecider()) + .on("STOPPED").stop() + .from(importCancerStudyDataDecider()) + .on("CONTINUE") + .to(loadCancerStudyDatatypeMetadata()) + .next(importClinicalData) + .next(continueImportingStudyDataDecider()) + .on("STOPPED").stop() + .from(continueImportingStudyDataDecider()) + .on("CONTINUE") + .to(importTimelineData) + .next(importMutSigData) + .next(importCopyNumberSegData) + .next(importGisticGenesData) + .next(importProteinLevelData) + .next(importCnaData) + .next(importGeneExpressionData) + .next(importMethylationData) + .next(importMutationData) + .next(importFusionData) + .next(importStructuralVariantData) + .next(importCaseLists) + .build(); + } + + /** + * Import cancer study step and tasklet. + * + * @return Step + */ + @Bean + public Step importCancerStudyStep() { + return stepBuilderFactory.get("importCancerStudyStep") + .listener(cancerStudyListener()) + .tasklet(cancerStudyTasklet()) + .build(); + } + + /** + * Tasklet for loading cancer study meta data. + * + * @return Tasklet + */ + @Bean + @StepScope + public Tasklet cancerStudyTasklet() { + return new CancerStudyTasklet(); + } + + /** + * Listener for cancer study import. + * + * @return StepExecutionListener + */ + @Bean + public StepExecutionListener cancerStudyListener() { + return new CancerStudyListener(); + } + + /** + * Step for implementing datatype metadata tasklet. + * + * @return Step + */ + @Bean + public Step loadCancerStudyDatatypeMetadata() { + return stepBuilderFactory.get("loadCancerStudyDatatypeMetdata") + .tasklet(datatypeMetadataTasklet()) + .build(); + } + + @Bean + @StepScope + public Tasklet deleteCancerStudyTasklet() { + return new DeleteCancerStudyTasklet(); + } + + /** + * Tasklet for loading datatype metadata. 
+ * Searches cancer study path for meta files and determines whether importer + * should import each datatype depending on whether metafile and datafiles exist + * + * @return Tasklet + */ + @Bean + @StepScope + public Tasklet datatypeMetadataTasklet() { + return new DatatypeMetadataTasklet(); + } + + /** + * Decider to determine if cancer study was loaded properly. + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider importCancerStudyDataDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + if ((boolean) jobExecution.getExecutionContext().get("importCancerStudy")) { + return new FlowExecutionStatus("CONTINUE"); + } + else { + return FlowExecutionStatus.STOPPED; + } + }; + } + + /** + * Decider that checks rollback status for cancer study after the clinical data step runs. + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider continueImportingStudyDataDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + if ((boolean) jobExecution.getExecutionContext().get("rollbackCancerStudyState")) { + return FlowExecutionStatus.FAILED; + } + else { + return new FlowExecutionStatus("CONTINUE"); + } + }; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/CaseListStep.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/CaseListStep.java new file mode 100644 index 0000000..d37e5c3 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/CaseListStep.java @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. 
The software and documentation provided hereunder
+ * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no
+ * obligations to provide maintenance, support, updates, enhancements or
+ * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be
+ * liable to any party for direct, indirect, special, incidental or
+ * consequential damages, including lost profits, arising out of the use of this
+ * software and its documentation, even if Memorial Sloan-Kettering Cancer
+ * Center has been advised of the possibility of such damage.
+ */
+
+/*
+ * This file is part of cBioPortal.
+ *
+ * cBioPortal is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +package org.cbio.portal.pipelines.importer.config.step; + +import org.mskcc.cbio.model.SampleList; +import org.cbio.portal.pipelines.importer.config.listener.CaseListListener; +import org.cbio.portal.pipelines.importer.config.reader.CaseListReader; +import org.cbio.portal.pipelines.importer.config.writer.CaseListWriter; + +import org.springframework.batch.core.*; +import org.springframework.batch.core.configuration.annotation.*; +import org.springframework.batch.item.*; + +import org.springframework.beans.factory.annotation.*; +import org.springframework.context.annotation.*; + +/** + * + * @author ochoaa + */ +@Configuration +@EnableBatchProcessing +public class CaseListStep { + + @Value("${chunk.interval}") + private int chunkInterval; + + @Autowired + public StepBuilderFactory stepBuilderFactory; + + @Bean + public Step importCaseLists() { + return stepBuilderFactory.get("importCaseLists") + . chunk(chunkInterval) + .reader(caseListReader()) + .writer(caseListWriter()) + .listener(caseListListener()) + .build(); + } + + /*************************************************************************** + * Case list listener, reader, and writer. + **************************************************************************/ + + @Bean + public StepExecutionListener caseListListener() { + return new CaseListListener(); + } + + @Bean + @StepScope + public ItemStreamReader caseListReader() { + return new CaseListReader(); + } + + @Bean + @StepScope + public ItemStreamWriter caseListWriter() { + return new CaseListWriter(); + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/ClinicalDataStep.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/ClinicalDataStep.java new file mode 100644 index 0000000..a3e9b5c --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/ClinicalDataStep.java @@ -0,0 +1,490 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. 
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS
+ * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder
+ * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no
+ * obligations to provide maintenance, support, updates, enhancements or
+ * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be
+ * liable to any party for direct, indirect, special, incidental or
+ * consequential damages, including lost profits, arising out of the use of this
+ * software and its documentation, even if Memorial Sloan-Kettering Cancer
+ * Center has been advised of the possibility of such damage.
+ */
+
+/*
+ * This file is part of cBioPortal.
+ *
+ * cBioPortal is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +package org.cbio.portal.pipelines.importer.config.step; + +import org.cbio.portal.pipelines.importer.config.composite.CompositeClinicalData; +import org.cbio.portal.pipelines.importer.config.tasklet.ClinicalAttributeTasklet; +import org.cbio.portal.pipelines.importer.config.listener.ClinicalDataListener; +import org.cbio.portal.pipelines.importer.config.reader.ClinicalDataReader; +import org.cbio.portal.pipelines.importer.config.processor.ClinicalDataProcessor; +import org.cbio.portal.pipelines.importer.config.writer.ClinicalDataWriter; + +import java.util.*; +import org.apache.commons.collections.map.MultiKeyMap; +import org.apache.commons.logging.*; + +import org.springframework.batch.core.*; +import org.springframework.batch.core.configuration.annotation.*; +import org.springframework.batch.core.job.builder.FlowBuilder; +import org.springframework.batch.core.job.flow.*; +import org.springframework.batch.core.scope.context.ChunkContext; +import org.springframework.batch.core.step.tasklet.Tasklet; +import org.springframework.batch.item.*; +import org.springframework.batch.repeat.RepeatStatus; + +import org.springframework.beans.factory.annotation.*; +import org.springframework.context.annotation.*; + +/** + * + * @author ochoaa + */ +@Configuration +@EnableBatchProcessing +public class ClinicalDataStep { + + public static final String IMPORT_CLINICAL_DATA = "importClinicalData"; + + @Value("${chunk.interval}") + private int chunkInterval; + + @Autowired + public StepBuilderFactory stepBuilderFactory; + + private static final Log LOG = LogFactory.getLog(ClinicalDataStep.class); + + /** + * Step for importing clinical data implemented by cancer study work flow. + * + * @return Step + */ + @Bean + public Step importClinicalData() { + return stepBuilderFactory.get(IMPORT_CLINICAL_DATA) + .flow(clinicalStepFlow()) + .build(); + } + + /** + * Clinical data flow. 
+ * Execution of clinical data steps will begin with "clinicalStep" and will + * execute conditionally based on their respective deciders + * + * @return Flow + */ + @Bean + public Flow clinicalStepFlow() { + // execute clinical data steps + return new FlowBuilder("clinicalStepFlow") + .start(initClinicalImportStep()) + .next(clinicalStep()) + .build(); + } + + /** + * Step to initiate flow for clinical data steps. + * + * @return Step + */ + @Bean + public Step initClinicalImportStep() { + return stepBuilderFactory.get("initClinicalImportStep") + .tasklet((StepContribution sc, ChunkContext cc) -> { + return RepeatStatus.FINISHED; + }).build(); + } + + /*************************************************************************** + * Clinical data import steps and flows. + **************************************************************************/ + + /** + * Step for implementing clinical attribute tasklet. + * + * @return Step + */ + @Bean + public Step loadClinicalAttributes() { + return stepBuilderFactory.get("loadClinicalAttributes") + .tasklet(clinicalAttributeTasklet()) + .build(); + } + + /** + * Tasklet for loading clinical attributes from clinical data files. + * + * @return Tasklet + */ + @Bean + @StepScope + public Tasklet clinicalAttributeTasklet() { + return new ClinicalAttributeTasklet(); + } + + /** + * Flow for importing mixed clinical data. + * + * @return Flow + */ + @Bean + public Flow clinicalStep() { + return new FlowBuilder("clinicalStep") + .start(clinicalStepDecider()) + .on("RUN").to(loadClinicalAttributes()) + .next(clinicalStepBuilder("clinicalStep")) + .next(clinicalPatientStep()) + .from(clinicalStepDecider()) + .on("SKIP").to(clinicalPatientStep()) + .build(); + } + + /** + * Flow for importing patient clinical data. 
+ * + * @return Flow + */ + @Bean + public Flow clinicalPatientStep() { + return new FlowBuilder("clinicalPatientStep") + .start(clinicalPatientStepDecider()) + .on("RUN").to(loadClinicalAttributes()) + .next(clinicalStepBuilder("clinicalPatientStep")) + .next(clinicalSampleStep()) + .from(clinicalPatientStepDecider()) + .on("SKIP").to(clinicalSampleStep()) + .build(); + } + + /** + * Flow for importing sample clinical data. + * + * @return Flow + */ + @Bean + public Flow clinicalSampleStep() { + return new FlowBuilder("clinicalSampleStep") + .start(clinicalSampleStepDecider()) + .on("RUN").to(loadClinicalAttributes()) + .next(clinicalStepBuilder("clinicalSampleStep")) + .next(bcrClinicalStep()) + .from(clinicalSampleStepDecider()) + .on("SKIP").to(bcrClinicalStep()) + .build(); + } + + /** + * Flow for importing BCR mixed clinical data. + * + * @return Flow + */ + @Bean + public Flow bcrClinicalStep() { + return new FlowBuilder("bcrClinicalStep") + .start(bcrClinicalStepDecider()) + .on("RUN").to(loadClinicalAttributes()) + .next(clinicalStepBuilder("bcrClinicalStep")) + .next(bcrClinicalPatientStep()) + .from(bcrClinicalStepDecider()) + .on("SKIP").to(bcrClinicalPatientStep()) + .build(); + } + + /** + * Flow for importing BCR patient clinical data. + * + * @return Flow + */ + @Bean + public Flow bcrClinicalPatientStep() { + return new FlowBuilder("bcrClinicalPatientStep") + .start(bcrClinicalPatientStepDecider()) + .on("RUN").to(loadClinicalAttributes()) + .next(clinicalStepBuilder("bcrClinicalPatientStep")) + .next(bcrClinicalSampleStep()) + .from(bcrClinicalPatientStepDecider()) + .on("SKIP").to(bcrClinicalSampleStep()) + .build(); + } + + /** + * Flow for importing BCR sample clinical data. 
+ * + * @return Flow + */ + @Bean + public Flow bcrClinicalSampleStep() { + return new FlowBuilder("bcrClinicalSampleStep") + .start(bcrClinicalSampleStepDecider()) + .on("RUN").to(loadClinicalAttributes()) + .next(clinicalStepBuilder("bcrClinicalSampleStep")) + .next(clinicalSuppStep()) + .from(bcrClinicalSampleStepDecider()) + .on("SKIP").to(clinicalSuppStep()) + .build(); + } + + /** + * Flow for importing supplemental clinical data. + * + * @return Flow + */ + @Bean + public Flow clinicalSuppStep() { + return new FlowBuilder("clinicalSuppStep") + .start(clinicalSuppStepDecider()) + .on("RUN").to(loadClinicalAttributes()) + .next(clinicalStepBuilder("clinicalSuppStep")) + .from(clinicalSuppStepDecider()) + .on("SKIP").end() + .build(); + } + + /** + * Universal step builder for clinical data. + * + * @param stepName + * @return Step + */ + public Step clinicalStepBuilder(String stepName) { + return stepBuilderFactory.get(stepName) + . chunk(chunkInterval) + .reader(clinicalDataReader()) + .processor(clinicalDataProcessor()) + .writer(clinicalDataWriter()) + .listener(clinicalDataListener()) + .build(); + } + + /*************************************************************************** + * Clinical data listener, reader, processor, and writer. + **************************************************************************/ + + @Bean + public StepExecutionListener clinicalDataListener() { + return new ClinicalDataListener(); + } + + @Bean + @StepScope + public ItemStreamReader clinicalDataReader() { + return new ClinicalDataReader(); + } + + @Bean + public ClinicalDataProcessor clinicalDataProcessor() { + return new ClinicalDataProcessor(); + } + + @Bean + @StepScope + public ItemStreamWriter clinicalDataWriter() { + return new ClinicalDataWriter(); + } + + /*************************************************************************** + * Deciders for clinical data steps. 
+ **************************************************************************/ + + /** + * Clinical step decider. + * Sets flow execution status to SKIP if meta file and data files do not exist + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider clinicalStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "clinical"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Clinical patient step decider. + * Sets flow execution status to SKIP if meta file and data files do not exist + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider clinicalPatientStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "clinical-patient"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Clinical sample step decider. 
+ * Sets flow execution status to SKIP if meta file and data files do not exist + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider clinicalSampleStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "clinical-sample"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * BCR clinical step decider. + * Sets flow execution status to SKIP if meta file and data files do not exist + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider bcrClinicalStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "bcr-clinical"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * BCR clinical patient step decider. 
+ * Sets flow execution status to SKIP if meta file and data files do not exist + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider bcrClinicalPatientStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "bcr-clinical-patient"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * BCR clinical sample step decider. + * Sets flow execution status to SKIP if meta file and data files do not exist + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider bcrClinicalSampleStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "bcr-clinical-sample"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Clinical supplemental step decider. 
+ * Sets flow execution status to SKIP if meta file and data files do not exist + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider clinicalSuppStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "clinical-supp"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/CnaDataStep.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/CnaDataStep.java new file mode 100644 index 0000000..7f1fec8 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/CnaDataStep.java @@ -0,0 +1,472 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. 
In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +package org.cbio.portal.pipelines.importer.config.step; + +import org.cbio.portal.pipelines.importer.model.ProfileDataRecord; +import org.cbio.portal.pipelines.importer.config.listener.ProfileDataListener; +import org.cbio.portal.pipelines.importer.config.tasklet.GeneticProfileTasklet; +import org.cbio.portal.pipelines.importer.config.tasklet.ProfileMetadataTasklet; +import org.cbio.portal.pipelines.importer.config.reader.ProfileDataReader; +import org.cbio.portal.pipelines.importer.config.processor.ProfileDataProcessor; +import org.cbio.portal.pipelines.importer.config.writer.ProfileDataWriter; + +import java.util.*; +import org.apache.commons.collections.map.MultiKeyMap; +import org.apache.commons.logging.*; + +import org.springframework.batch.core.*; +import org.springframework.batch.core.configuration.annotation.*; +import org.springframework.batch.core.job.builder.FlowBuilder; +import org.springframework.batch.core.job.flow.*; +import org.springframework.batch.core.scope.context.ChunkContext; +import org.springframework.batch.core.step.tasklet.Tasklet; +import org.springframework.batch.item.*; +import org.springframework.batch.repeat.RepeatStatus; + +import org.springframework.beans.factory.annotation.*; +import org.springframework.context.annotation.*; + +/** + * + * @author ochoaa + */ +@Configuration +@EnableBatchProcessing +public class CnaDataStep { + + public static final String IMPORT_CNA_DATA = "importCnaData"; + + @Value("${chunk.interval}") + private int chunkInterval; + + @Autowired + public StepBuilderFactory stepBuilderFactory; + + private static final Log LOG = LogFactory.getLog(CnaDataStep.class); + + @Bean + public Step importCnaData() { + return stepBuilderFactory.get(IMPORT_CNA_DATA) + .flow(cnaStepFlow()) + .build(); + } + + /** + * CNA data flow. 
+ * Execution of CNA data steps will begin with "cnaStep" and will + * execute conditionally based on their respective deciders + * + * @return Flow + */ + @Bean + public Flow cnaStepFlow() { + // execute CNA data steps sequentially starting with cnaStep + return new FlowBuilder("cnaStepFlow") + .start(initCnaImportStep()) + .next(cnaStep()) + .build(); + } + + /** + * Step to initiate flow for CNA data steps. + * + * @return Step + */ + @Bean + public Step initCnaImportStep() { + return stepBuilderFactory.get("initCnaImportStep") + .tasklet((StepContribution sc, ChunkContext cc) -> { + return RepeatStatus.FINISHED; + }).build(); + } + + /*************************************************************************** + * CNA data import steps and flows. + **************************************************************************/ + + /** + * Step for implementing genetic profile tasklet. + * + * @return Step + */ + @Bean + public Step loadCnaGeneticProfile() { + return stepBuilderFactory.get("loadCnaGeneticProfile") + .tasklet(cnaGeneticProfileTasklet()) + .build(); + } + + /** + * Tasklet for loading and importing genetic profile from meta file. + * + * @return Tasklet + */ + @Bean + @StepScope + public Tasklet cnaGeneticProfileTasklet() { + return new GeneticProfileTasklet(); + } + + /** + * Step for implementing profile metadata tasklet. + * + * @return Step + */ + @Bean + public Step loadCnaMetadata() { + return stepBuilderFactory.get("loadCnaMetadata") + .tasklet(loadCnaMetadataTasklet()) + .build(); + } + + /** + * Tasklet for loading profile metadata from profile datafiles. + * + * @return Tasklet + */ + @Bean + @StepScope + public Tasklet loadCnaMetadataTasklet() { + return new ProfileMetadataTasklet(); + } + + /** + * Flow for importing CNA data. 
+ * + * @return Flow + */ + @Bean + public Flow cnaStep() { + return new FlowBuilder("cnaStep") + .start(cnaStepDecider()) + .on("RUN").to(loadCnaGeneticProfile()) + .next(loadCnaMetadata()) + .next(cnaStepBuilder("cnaStep")) + .next(cnaFoundationStep()) + .from(cnaStepDecider()) + .on("SKIP").to(cnaFoundationStep()) + .build(); + } + + /** + * Flow for importing Foundation CNA data. + * + * @return Flow + */ + @Bean + public Flow cnaFoundationStep() { + return new FlowBuilder("cnaFoundationStep") + .start(cnaFoundationStepDecider()) + .on("RUN").to(loadCnaGeneticProfile()) + .next(loadCnaMetadata()) + .next(cnaStepBuilder("cnaFoundationStep")) + .next(cnaRaeStep()) + .from(cnaFoundationStepDecider()) + .on("SKIP").to(cnaRaeStep()) + .build(); + } + + /** + * Flow for importing RAE CNA data. + * + * @return Flow + */ + @Bean + public Flow cnaRaeStep() { + return new FlowBuilder("cnaRaeStep") + .start(cnaRaeStepDecider()) + .on("RUN").to(loadCnaGeneticProfile()) + .next(loadCnaMetadata()) + .next(cnaStepBuilder("cnaRaeStep")) + .next(cnaConsensusStep()) + .from(cnaRaeStepDecider()) + .on("SKIP").to(cnaConsensusStep()) + .build(); + } + + /** + * Flow for importing consensus CNA data. + * + * @return Flow + */ + @Bean + public Flow cnaConsensusStep() { + return new FlowBuilder("cnaConsensusStep") + .start(cnaConsensusStepDecider()) + .on("RUN").to(loadCnaGeneticProfile()) + .next(loadCnaMetadata()) + .next(cnaStepBuilder("cnaConsensusStep")) + .next(cnaLinearStep()) + .from(cnaConsensusStepDecider()) + .on("SKIP").to(cnaLinearStep()) + .build(); + } + + /** + * Flow for importing linear CNA data. 
+ * + * @return Flow + */ + @Bean + public Flow cnaLinearStep() { + return new FlowBuilder("cnaLinearStep") + .start(cnaLinearStepDecider()) + .on("RUN").to(loadCnaGeneticProfile()) + .next(loadCnaMetadata()) + .next(cnaStepBuilder("cnaLinearStep")) + .next(cnaLog2Step()) + .from(cnaLinearStepDecider()) + .on("SKIP").to(cnaLog2Step()) + .build(); + } + + /** + * Flow for importing log2 CNA data. + * + * @return Flow + */ + @Bean + public Flow cnaLog2Step() { + return new FlowBuilder("cnaLog2Step") + .start(cnaLog2StepDecider()) + .on("RUN").to(loadCnaGeneticProfile()) + .next(loadCnaMetadata()) + .next(cnaStepBuilder("cnaLog2Step")) + .from(cnaLog2StepDecider()) + .on("SKIP").end() + .build(); + } + + /** + * Universal step builder for CNA data. + * + * @param stepName + * @return Step + */ + public Step cnaStepBuilder(String stepName) { + return stepBuilderFactory.get(stepName) + . chunk(chunkInterval) + .reader(cnaDataReader()) + .processor(cnaDataProcessor()) + .writer(cnaDataWriter()) + .listener(cnaDataListener()) + .build(); + } + + /*************************************************************************** + * CNA data listener, reader, processor, and writer. + **************************************************************************/ + + @Bean + public StepExecutionListener cnaDataListener() { + return new ProfileDataListener(); + } + + @Bean + @StepScope + public ItemStreamReader cnaDataReader() { + return new ProfileDataReader(); + } + + @Bean + public ProfileDataProcessor cnaDataProcessor() { + return new ProfileDataProcessor(); + } + + @Bean + @StepScope + public ItemStreamWriter cnaDataWriter() { + return new ProfileDataWriter(); + } + + /*************************************************************************** + * Deciders for CNA data steps. + **************************************************************************/ + + /** + * CNA step decider. 
+     * Sets flow execution status to SKIP if meta file and data files do not exist
+     *
+     * <p>The decision is read from the "importData" flag stored for datatype
+     * "cna-gistic" in the "datatypeMetadata" map of the job execution context.
+     * On "RUN" the datatype is also stored there as "currentDatatype".
+     *
+     * @return JobExecutionDecider
+     */
+    @Bean
+    public JobExecutionDecider cnaStepDecider() {
+        return (JobExecution jobExecution, StepExecution stepExecution) -> {
+            MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata");
+            String datatype = "cna-gistic";
+
+            // wildcard instead of a raw List: the entries are only handed to the logger
+            List<?> logMessages = (List<?>) datatypeMetadata.get(datatype, "logMessages");
+            LOG.info(logMessages.get(0));
+            if ((boolean) datatypeMetadata.get(datatype, "importData")) {
+                LOG.info(logMessages.get(1));
+                // publish the datatype (presumably read by the steps that follow)
+                jobExecution.getExecutionContext().put("currentDatatype", datatype);
+                return new FlowExecutionStatus("RUN");
+            }
+            else {
+                LOG.warn(logMessages.get(1));
+                return new FlowExecutionStatus("SKIP");
+            }
+        };
+    }
+
+    /**
+     * CNA Foundation step decider.
+     * Sets flow execution status to SKIP if meta file and data files do not exist
+     *
+     * <p>The decision is read from the "importData" flag stored for datatype
+     * "cna-foundation" in the "datatypeMetadata" map of the job execution context.
+     * On "RUN" the datatype is also stored there as "currentDatatype".
+     *
+     * @return JobExecutionDecider
+     */
+    @Bean
+    public JobExecutionDecider cnaFoundationStepDecider() {
+        return (JobExecution jobExecution, StepExecution stepExecution) -> {
+            MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata");
+            String datatype = "cna-foundation";
+
+            // wildcard instead of a raw List: the entries are only handed to the logger
+            List<?> logMessages = (List<?>) datatypeMetadata.get(datatype, "logMessages");
+            LOG.info(logMessages.get(0));
+            if ((boolean) datatypeMetadata.get(datatype, "importData")) {
+                LOG.info(logMessages.get(1));
+                // publish the datatype (presumably read by the steps that follow)
+                jobExecution.getExecutionContext().put("currentDatatype", datatype);
+                return new FlowExecutionStatus("RUN");
+            }
+            else {
+                LOG.warn(logMessages.get(1));
+                return new FlowExecutionStatus("SKIP");
+            }
+        };
+    }
+
+    /**
+     * CNA RAE step decider.
+     * Sets flow execution status to SKIP if meta file and data files do not exist
+     *
+     * <p>The decision is read from the "importData" flag stored for datatype
+     * "cna-rae" in the "datatypeMetadata" map of the job execution context.
+     * On "RUN" the datatype is also stored there as "currentDatatype".
+     *
+     * @return JobExecutionDecider
+     */
+    @Bean
+    public JobExecutionDecider cnaRaeStepDecider() {
+        return (JobExecution jobExecution, StepExecution stepExecution) -> {
+            MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata");
+            String datatype = "cna-rae";
+
+            // wildcard instead of a raw List: the entries are only handed to the logger
+            List<?> logMessages = (List<?>) datatypeMetadata.get(datatype, "logMessages");
+            LOG.info(logMessages.get(0));
+            if ((boolean) datatypeMetadata.get(datatype, "importData")) {
+                LOG.info(logMessages.get(1));
+                // publish the datatype (presumably read by the steps that follow)
+                jobExecution.getExecutionContext().put("currentDatatype", datatype);
+                return new FlowExecutionStatus("RUN");
+            }
+            else {
+                LOG.warn(logMessages.get(1));
+                return new FlowExecutionStatus("SKIP");
+            }
+        };
+    }
+
+    /**
+     * CNA consensus step decider.
+     * Sets flow execution status to SKIP if meta file and data files do not exist
+     *
+     * <p>The decision is read from the "importData" flag stored for datatype
+     * "cna-consensus" in the "datatypeMetadata" map of the job execution context.
+     * On "RUN" the datatype is also stored there as "currentDatatype".
+     *
+     * @return JobExecutionDecider
+     */
+    @Bean
+    public JobExecutionDecider cnaConsensusStepDecider() {
+        return (JobExecution jobExecution, StepExecution stepExecution) -> {
+            MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata");
+            String datatype = "cna-consensus";
+
+            // wildcard instead of a raw List: the entries are only handed to the logger
+            List<?> logMessages = (List<?>) datatypeMetadata.get(datatype, "logMessages");
+            LOG.info(logMessages.get(0));
+            if ((boolean) datatypeMetadata.get(datatype, "importData")) {
+                LOG.info(logMessages.get(1));
+                // publish the datatype (presumably read by the steps that follow)
+                jobExecution.getExecutionContext().put("currentDatatype", datatype);
+                return new FlowExecutionStatus("RUN");
+            }
+            else {
+                LOG.warn(logMessages.get(1));
+                return new FlowExecutionStatus("SKIP");
+            }
+        };
+    }
+
+    /**
+     * CNA linear step decider.
+     * Sets flow execution status to SKIP if meta file and data files do not exist
+     *
+     * <p>The decision is read from the "importData" flag stored for datatype
+     * "linear-cna-gistic" in the "datatypeMetadata" map of the job execution
+     * context. On "RUN" the datatype is also stored there as "currentDatatype".
+     *
+     * @return JobExecutionDecider
+     */
+    @Bean
+    public JobExecutionDecider cnaLinearStepDecider() {
+        return (JobExecution jobExecution, StepExecution stepExecution) -> {
+            MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata");
+            String datatype = "linear-cna-gistic";
+
+            // wildcard instead of a raw List: the entries are only handed to the logger
+            List<?> logMessages = (List<?>) datatypeMetadata.get(datatype, "logMessages");
+            LOG.info(logMessages.get(0));
+            if ((boolean) datatypeMetadata.get(datatype, "importData")) {
+                LOG.info(logMessages.get(1));
+                // publish the datatype (presumably read by the steps that follow)
+                jobExecution.getExecutionContext().put("currentDatatype", datatype);
+                return new FlowExecutionStatus("RUN");
+            }
+            else {
+                LOG.warn(logMessages.get(1));
+                return new FlowExecutionStatus("SKIP");
+            }
+        };
+    }
+
+    /**
+     * CNA log2 step decider.
+     * Sets flow execution status to SKIP if meta file and data files do not exist
+     *
+     * <p>The decision is read from the "importData" flag stored for datatype
+     * "log2-cna" in the "datatypeMetadata" map of the job execution context.
+     * On "RUN" the datatype is also stored there as "currentDatatype".
+     *
+     * @return JobExecutionDecider
+     */
+    @Bean
+    public JobExecutionDecider cnaLog2StepDecider() {
+        return (JobExecution jobExecution, StepExecution stepExecution) -> {
+            MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata");
+            String datatype = "log2-cna";
+
+            // wildcard instead of a raw List: the entries are only handed to the logger
+            List<?> logMessages = (List<?>) datatypeMetadata.get(datatype, "logMessages");
+            LOG.info(logMessages.get(0));
+            if ((boolean) datatypeMetadata.get(datatype, "importData")) {
+                LOG.info(logMessages.get(1));
+                // publish the datatype (presumably read by the steps that follow)
+                jobExecution.getExecutionContext().put("currentDatatype", datatype);
+                return new FlowExecutionStatus("RUN");
+            }
+            else {
+                LOG.warn(logMessages.get(1));
+                return new FlowExecutionStatus("SKIP");
+            }
+        };
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/CopyNumberSegmentDataStep.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/CopyNumberSegmentDataStep.java
new file mode 100644
index 0000000..2429f3f
--- /dev/null
+++
b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/CopyNumberSegmentDataStep.java @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +package org.cbio.portal.pipelines.importer.config.step; + +import org.mskcc.cbio.model.CopyNumberSegment; +import org.cbio.portal.pipelines.importer.model.CopyNumberSegmentRecord; +import org.cbio.portal.pipelines.importer.config.tasklet.CopyNumberSegmentMetadataTasklet; +import org.cbio.portal.pipelines.importer.config.listener.CopyNumberSegmentDataListener; +import org.cbio.portal.pipelines.importer.config.reader.CopyNumberSegmentDataReader; +import org.cbio.portal.pipelines.importer.config.processor.CopyNumberSegmentDataProcessor; +import org.cbio.portal.pipelines.importer.config.writer.CopyNumberSegmentDataWriter; + +import java.util.*; +import org.apache.commons.collections.map.MultiKeyMap; +import org.apache.commons.logging.*; + +import org.springframework.batch.core.*; +import org.springframework.batch.core.configuration.annotation.*; +import org.springframework.batch.core.job.builder.FlowBuilder; +import org.springframework.batch.core.job.flow.*; +import org.springframework.batch.core.scope.context.ChunkContext; +import org.springframework.batch.core.step.tasklet.Tasklet; +import org.springframework.batch.item.*; +import org.springframework.batch.repeat.RepeatStatus; + +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.context.annotation.*; + +/** + * + * @author ochoaa + */ +@Configuration +@EnableBatchProcessing +public class CopyNumberSegmentDataStep { + + public static final String IMPORT_COPY_NUMBER_SEG_DATA = "importCopyNumberSegData"; + + @Value("${chunk.interval}") + private int chunkInterval; + + @Autowired + public StepBuilderFactory stepBuilderFactory; + + private static final Log LOG = LogFactory.getLog(CopyNumberSegmentDataStep.class); + + @Bean + public Step importCopyNumberSegData() { + return stepBuilderFactory.get(IMPORT_COPY_NUMBER_SEG_DATA) + .flow(copyNumberSegStepFlow()) + .build(); + } + + /** + * Copy number segment data flow. 
+ * Execution of copy number segment data steps will begin with "copyNumberSegmentHg19DataStep" and will + * execute conditionally based on their respective deciders + * + * @return Flow + */ + @Bean + public Flow copyNumberSegStepFlow() { + return new FlowBuilder("copyNumberSegStepFlow") + .start(initCopyNumberSegmentImportStep()) + .next(copyNumberSegmentHg19DataStep()) + .build(); + } + + /** + * Step to initiate flow for copy number segment data steps. + * + * @return Step + */ + @Bean + public Step initCopyNumberSegmentImportStep() { + return stepBuilderFactory.get("initCopyNumberSegmentImportStep") + .tasklet((StepContribution sc, ChunkContext cc) -> { + return RepeatStatus.FINISHED; + }).build(); + } + + /*************************************************************************** + * Copy number segment data import steps and flows. + **************************************************************************/ + + /** + * Step for implementing copy number segment metadata tasklet. + * + * @return Step + */ + @Bean + public Step loadCopyNumberSegmentMetadata() { + return stepBuilderFactory.get("loadCopyNumberSegmentMetadata") + .tasklet(copyNumberSegmentMetadataTasklet()) + .build(); + } + + /** + * Tasklet for loading copy number segment metadata. + * + * @return Tasklet + */ + @Bean + @StepScope + public Tasklet copyNumberSegmentMetadataTasklet() { + return new CopyNumberSegmentMetadataTasklet(); + } + + /** + * Copy number segment data import flow for reference genome id hg19. 
+ * + * @return Flow + */ + @Bean + public Flow copyNumberSegmentHg19DataStep() { + return new FlowBuilder("copyNumberSegmentHg19DataStep") + .start(copyNumberSegmentHg19StepDecider()) + .on("RUN").to(loadCopyNumberSegmentMetadata()) + .next(copyNumberSegmentStepBuilder("copyNumberSegmentHg19DataStep")) + .next(copyNumberSegmentHg18DataStep()) + .from(copyNumberSegmentHg19StepDecider()) + .on("SKIP").to(copyNumberSegmentHg18DataStep()) + .build(); + } + + /** + * Copy number segment data import flow for reference genome id hg18. + * + * @return Flow + */ + @Bean + public Flow copyNumberSegmentHg18DataStep() { + return new FlowBuilder("copyNumberSegmentHg18DataStep") + .start(copyNumberSegmentHg18StepDecider()) + .on("RUN").to(loadCopyNumberSegmentMetadata()) + .next(copyNumberSegmentStepBuilder("copyNumberSegmentHg18DataStep")) + .from(copyNumberSegmentHg18StepDecider()) + .on("SKIP").end() + .build(); + } + + /** + * Universal step builder for copy number segment data. + * + * @param stepName + * @return Step + */ + public Step copyNumberSegmentStepBuilder(String stepName) { + return stepBuilderFactory.get(stepName) + . chunk(chunkInterval) + .reader(copyNumberSegmentDataReader()) + .processor(copyNumberSegmentDataProcessor()) + .writer(copyNumberSegmentDataWriter()) + .listener(copyNumberSegmentDataListener()) + .build(); + } + + /*************************************************************************** + * Copy number segment data listener, reader, processor, and writer. 
+ **************************************************************************/ + + @Bean + public StepExecutionListener copyNumberSegmentDataListener() { + return new CopyNumberSegmentDataListener(); + } + + @Bean + @StepScope + public ItemStreamReader copyNumberSegmentDataReader() { + return new CopyNumberSegmentDataReader(); + } + + @Bean + public CopyNumberSegmentDataProcessor copyNumberSegmentDataProcessor() { + return new CopyNumberSegmentDataProcessor(); + } + + @Bean + @StepScope + public ItemStreamWriter copyNumberSegmentDataWriter() { + return new CopyNumberSegmentDataWriter(); + } + + /*************************************************************************** + * Deciders for copy number segment data step. + **************************************************************************/ + + /** + * Copy number segment step decider for reference genome id hg19. + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider copyNumberSegmentHg19StepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "cna-hg19-seg"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Copy number segment step decider for reference genome id hg18. 
+     *
+     * <p>Returns "RUN" when the "importData" flag stored for datatype
+     * "cna-hg18-seg" in the "datatypeMetadata" map of the job execution context
+     * is true, and "SKIP" otherwise. On "RUN" the datatype is also stored in
+     * the context as "currentDatatype".
+     *
+     * @return JobExecutionDecider
+     */
+    @Bean
+    public JobExecutionDecider copyNumberSegmentHg18StepDecider() {
+        return (JobExecution jobExecution, StepExecution stepExecution) -> {
+            MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata");
+            String datatype = "cna-hg18-seg";
+
+            // wildcard instead of a raw List: the entries are only handed to the logger
+            List<?> logMessages = (List<?>) datatypeMetadata.get(datatype, "logMessages");
+            LOG.info(logMessages.get(0));
+            if ((boolean) datatypeMetadata.get(datatype, "importData")) {
+                LOG.info(logMessages.get(1));
+                // publish the datatype (presumably read by the steps that follow)
+                jobExecution.getExecutionContext().put("currentDatatype", datatype);
+                return new FlowExecutionStatus("RUN");
+            }
+            else {
+                LOG.warn(logMessages.get(1));
+                return new FlowExecutionStatus("SKIP");
+            }
+        };
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/FusionDataStep.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/FusionDataStep.java
new file mode 100644
index 0000000..ee7cb31
--- /dev/null
+++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/FusionDataStep.java
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS
+ * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder
+ * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no
+ * obligations to provide maintenance, support, updates, enhancements or
+ * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be
+ * liable to any party for direct, indirect, special, incidental or
+ * consequential damages, including lost profits, arising out of the use of this
+ * software and its documentation, even if Memorial Sloan-Kettering Cancer
+ * Center has been advised of the possibility of such damage.
+ */
+
+/*
+ * This file is part of cBioPortal.
+ * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +package org.cbio.portal.pipelines.importer.config.step; + +import org.cbio.portal.pipelines.importer.model.FusionRecord; +import org.cbio.portal.pipelines.importer.config.composite.CompositeMutationData; +import org.cbio.portal.pipelines.importer.config.tasklet.*; +import org.cbio.portal.pipelines.importer.config.listener.MutationDataListener; +import org.cbio.portal.pipelines.importer.config.reader.FusionDataReader; +import org.cbio.portal.pipelines.importer.config.processor.FusionDataProcessor; +import org.cbio.portal.pipelines.importer.config.writer.MutationDataWriter; + +import java.util.*; +import org.apache.commons.collections.map.MultiKeyMap; +import org.apache.commons.logging.*; + +import org.springframework.batch.core.*; +import org.springframework.batch.core.configuration.annotation.*; +import org.springframework.batch.core.job.builder.FlowBuilder; +import org.springframework.batch.core.job.flow.*; +import org.springframework.batch.core.scope.context.ChunkContext; +import org.springframework.batch.core.step.tasklet.Tasklet; +import org.springframework.batch.item.*; +import org.springframework.batch.repeat.RepeatStatus; + +import org.springframework.beans.factory.annotation.*; +import org.springframework.context.annotation.*; + +/** + * + * @author ochoaa + */ +@Configuration +@EnableBatchProcessing +public class FusionDataStep { + + public static
final String IMPORT_FUSION_DATA = "importFusionData"; + + @Value("${chunk.interval}") + private int chunkInterval; + + @Autowired + public StepBuilderFactory stepBuilderFactory; + + private static final Log LOG = LogFactory.getLog(FusionDataStep.class); + + @Bean + public Step importFusionData() { + return stepBuilderFactory.get(IMPORT_FUSION_DATA) + .flow(fusionStepFlow()) + .build(); + } + + /** + * Fusion data import flow. + * + * @return Flow + */ + @Bean + public Flow fusionStepFlow() { + // execute fusion data import flow + return new FlowBuilder("importFusionStepFlow") + .start(initFusionImportStep()) + .next(fusionStepDecider()) + .on("RUN").to(loadFusionGeneticProfile()) + .next(loadFusionMetadata()) + .next(fusionStep()) + .from(fusionStepDecider()) + .on("SKIP").end() + .build(); + } + + /** + * Step to initiate flow for fusion data steps. + * + * @return Step + */ + @Bean + public Step initFusionImportStep() { + return stepBuilderFactory.get("initFusionImportStep") + .tasklet((StepContribution sc, ChunkContext cc) -> { + return RepeatStatus.FINISHED; + }).build(); + } + + /************************************************************************** + * Fusion data import steps and flows. + **************************************************************************/ + + /** + * Step for implementing genetic profile tasklet. + * + * @return Step + */ + @Bean + public Step loadFusionGeneticProfile() { + return stepBuilderFactory.get("loadFusionGeneticProfile") + .tasklet(fusionGeneticProfileTasklet()) + .build(); + } + + /** + * Tasklet for loading and importing genetic profile from meta file. + * + * @return Tasklet + */ + @Bean + @StepScope + public Tasklet fusionGeneticProfileTasklet() { + return new GeneticProfileTasklet(); + } + + /** + * Step for implementing mutation metadata tasklet. 
+ * + * @return Step + */ + @Bean + public Step loadFusionMetadata() { + return stepBuilderFactory.get("loadFusionMetadata") + .tasklet(loadFusionMetadataTasklet()) + .build(); + } + + /** + * Tasklet for loading metadata from fusion files. + * + * @return Tasklet + */ + @Bean + @StepScope + public Tasklet loadFusionMetadataTasklet() { + return new MutationMetadataTasklet(); + } + + /** + * Step for importing fusion data. + * + * @return Step + */ + public Step fusionStep() { + return stepBuilderFactory.get("fusionStep") + . chunk(chunkInterval) + .reader(fusionDataReader()) + .processor(fusionDataProcessor()) + .writer(fusionDataWriter()) + .listener(fusionDataListener()) + .build(); + } + + /*************************************************************************** + * Fusion data listener, reader, processor, and writer. + **************************************************************************/ + + @Bean + public StepExecutionListener fusionDataListener() { + return new MutationDataListener(); + } + + @Bean + @StepScope + public ItemStreamReader fusionDataReader() { + return new FusionDataReader(); + } + + @Bean + public FusionDataProcessor fusionDataProcessor() { + return new FusionDataProcessor(); + } + + @Bean + @StepScope + public ItemStreamWriter fusionDataWriter() { + return new MutationDataWriter(); + } + + /******************************************************************* + * Decider for fusion data step. + *******************************************************************/ + + /** + * Fusion step decider. 
+     * Sets flow execution status to SKIP if meta file and data files do not exist
+     *
+     * <p>The decision is read from the "importData" flag stored for datatype
+     * "fusion" in the "datatypeMetadata" map of the job execution context.
+     * On "RUN" the datatype is also stored there as "currentDatatype".
+     *
+     * @return JobExecutionDecider
+     */
+    @Bean
+    public JobExecutionDecider fusionStepDecider() {
+        return (JobExecution jobExecution, StepExecution stepExecution) -> {
+            MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata");
+            String datatype = "fusion";
+
+            // wildcard instead of a raw List: the entries are only handed to the logger
+            List<?> logMessages = (List<?>) datatypeMetadata.get(datatype, "logMessages");
+            LOG.info(logMessages.get(0));
+            if ((boolean) datatypeMetadata.get(datatype, "importData")) {
+                LOG.info(logMessages.get(1));
+                // publish the datatype (presumably read by the steps that follow)
+                jobExecution.getExecutionContext().put("currentDatatype", datatype);
+                return new FlowExecutionStatus("RUN");
+            }
+            else {
+                LOG.warn(logMessages.get(1));
+                return new FlowExecutionStatus("SKIP");
+            }
+        };
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/GeneExpressionDataStep.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/GeneExpressionDataStep.java
new file mode 100644
index 0000000..bc70ca1
--- /dev/null
+++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/GeneExpressionDataStep.java
@@ -0,0 +1,999 @@
+/*
+ * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS
+ * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder
+ * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no
+ * obligations to provide maintenance, support, updates, enhancements or
+ * modifications.
In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +package org.cbio.portal.pipelines.importer.config.step; + +import org.cbio.portal.pipelines.importer.model.ProfileDataRecord; +import org.cbio.portal.pipelines.importer.config.listener.ProfileDataListener; +import org.cbio.portal.pipelines.importer.config.tasklet.GeneticProfileTasklet; +import org.cbio.portal.pipelines.importer.config.tasklet.ProfileMetadataTasklet; +import org.cbio.portal.pipelines.importer.config.reader.ProfileDataReader; +import org.cbio.portal.pipelines.importer.config.processor.ProfileDataProcessor; +import org.cbio.portal.pipelines.importer.config.writer.ProfileDataWriter; + +import java.util.*; +import org.apache.commons.collections.map.MultiKeyMap; +import org.apache.commons.logging.*; + +import org.springframework.batch.core.*; +import org.springframework.batch.core.configuration.annotation.*; +import org.springframework.batch.core.job.builder.FlowBuilder; +import org.springframework.batch.core.job.flow.*; +import org.springframework.batch.core.scope.context.ChunkContext; +import org.springframework.batch.core.step.tasklet.Tasklet; +import org.springframework.batch.item.*; +import org.springframework.batch.repeat.RepeatStatus; + +import org.springframework.beans.factory.annotation.*; +import org.springframework.context.annotation.*; + +/** + * + * @author ochoaa + */ +@Configuration +@EnableBatchProcessing +public class GeneExpressionDataStep { + + public static final String IMPORT_GENE_EXPRESSION_DATA = "importGeneExpressionData"; + + @Value("${chunk.interval}") + private int chunkInterval; + + @Autowired + public StepBuilderFactory stepBuilderFactory; + + private static final Log LOG = LogFactory.getLog(GeneExpressionDataStep.class); + + @Bean + public Step importGeneExpressionData() { + return stepBuilderFactory.get(IMPORT_GENE_EXPRESSION_DATA) + .flow(geneExpressionStepFlow()) + .build(); + } + + /** + * Gene expression data flow. 
+ * Execution of gene expression data steps will begin with "geneExpressionAffymetrixStep" and will + * execute conditionally based on their respective deciders + * + * @return Flow + */ + @Bean + public Flow geneExpressionStepFlow() { + /* run the no-op init step, then enter the geneExpressionAffymetrixStep flow, which chains to the remaining flows */ + return new FlowBuilder<Flow>("geneExpressionStepFlow") + .start(initGeneExpressionImportStep()) + .next(geneExpressionAffymetrixStep()) + .build(); + } + + /** + * Step to initiate flow for gene expression data steps; its tasklet does no work and immediately returns RepeatStatus.FINISHED. + * + * @return Step + */ + @Bean + public Step initGeneExpressionImportStep() { + return stepBuilderFactory.get("initGeneExpressionImportStep") + .tasklet((StepContribution sc, ChunkContext cc) -> { + return RepeatStatus.FINISHED; + }).build(); + } + + /*************************************************************************** + * Gene expression data import steps and flows. + **************************************************************************/ + + /** + * Step named "loadGeneExpressionGeneticProfile" that runs the genetic profile tasklet. + * + * @return Step + */ + @Bean + public Step loadGeneExpressionGeneticProfile() { + return stepBuilderFactory.get("loadGeneExpressionGeneticProfile") + .tasklet(geneExpressionGeneticProfileTasklet()) + .build(); + } + + /** + * Step-scoped tasklet (a GeneticProfileTasklet) for loading and importing genetic profile from meta file. + * + * @return Tasklet + */ + @Bean + @StepScope + public Tasklet geneExpressionGeneticProfileTasklet() { + return new GeneticProfileTasklet(); + } + + /** + * Step named "loadGeneExpressionMetadata" that runs the profile metadata tasklet. + * + * @return Step + */ + @Bean + public Step loadGeneExpressionMetadata() { + return stepBuilderFactory.get("loadGeneExpressionMetadata") + .tasklet(loadGeneExpressionMetadataTasklet()) + .build(); + } + + /** + * Tasklet for loading profile metadata from profile datafiles.
+ * + * @return Tasklet + */ + @Bean + @StepScope + public Tasklet loadGeneExpressionMetadataTasklet() { + return new ProfileMetadataTasklet(); + } + + /** + * Flow for importing gene expression Affymetrix data. On RUN: genetic profile step, metadata step and data step, then geneExpressionAffymetrixZscoresStep(); on SKIP: directly geneExpressionAffymetrixZscoresStep(). + * + * @return Flow + */ + @Bean + public Flow geneExpressionAffymetrixStep() { + return new FlowBuilder<Flow>("geneExpressionAffymetrixStep") + .start(geneExpressionAffymetrixStepDecider()) + .on("RUN").to(loadGeneExpressionGeneticProfile()) + .next(loadGeneExpressionMetadata()) + .next(geneExpressionStepBuilder("geneExpressionAffymetrixStep")) + .next(geneExpressionAffymetrixZscoresStep()) + .from(geneExpressionAffymetrixStepDecider()) + .on("SKIP").to(geneExpressionAffymetrixZscoresStep()) + .build(); + } + + /** + * Flow for importing gene expression Affymetrix z-scores data. On RUN: genetic profile step, metadata step and data step, then geneExpressionMergedStep(); on SKIP: directly geneExpressionMergedStep(). + * + * @return Flow + */ + @Bean + public Flow geneExpressionAffymetrixZscoresStep() { + return new FlowBuilder<Flow>("geneExpressionAffymetrixZscoresStep") + .start(geneExpressionAffymetrixZscoresStepDecider()) + .on("RUN").to(loadGeneExpressionGeneticProfile()) + .next(loadGeneExpressionMetadata()) + .next(geneExpressionStepBuilder("geneExpressionAffymetrixZscoresStep")) + .next(geneExpressionMergedStep()) + .from(geneExpressionAffymetrixZscoresStepDecider()) + .on("SKIP").to(geneExpressionMergedStep()) + .build(); + } + + /** + * Flow for importing gene expression merged data. On RUN: genetic profile step, metadata step and data step, then geneExpressionMergedZscoresStep(); on SKIP: directly geneExpressionMergedZscoresStep(). + * + * @return Flow + */ + @Bean + public Flow geneExpressionMergedStep() { + return new FlowBuilder<Flow>("geneExpressionMergedStep") + .start(geneExpressionMergedStepDecider()) + .on("RUN").to(loadGeneExpressionGeneticProfile()) + .next(loadGeneExpressionMetadata()) + .next(geneExpressionStepBuilder("geneExpressionMergedStep")) + .next(geneExpressionMergedZscoresStep()) + .from(geneExpressionMergedStepDecider()) + .on("SKIP").to(geneExpressionMergedZscoresStep()) + .build(); + } + + /** + * Flow for importing gene expression merged z-scores data.
+ * + * @return Flow + */ + @Bean + public Flow geneExpressionMergedZscoresStep() { /* RUN: genetic profile, metadata and data steps, then geneExpressionRnaSeqStep(); SKIP: directly geneExpressionRnaSeqStep() */ + return new FlowBuilder<Flow>("geneExpressionMergedZscoresStep") + .start(geneExpressionMergedZscoresStepDecider()) + .on("RUN").to(loadGeneExpressionGeneticProfile()) + .next(loadGeneExpressionMetadata()) + .next(geneExpressionStepBuilder("geneExpressionMergedZscoresStep")) + .next(geneExpressionRnaSeqStep()) + .from(geneExpressionMergedZscoresStepDecider()) + .on("SKIP").to(geneExpressionRnaSeqStep()) + .build(); + } + + /** + * Flow for importing gene expression RNAseq data. On RUN: genetic profile step, metadata step and data step, then geneExpressionRnaSeqZscoresStep(); on SKIP: directly geneExpressionRnaSeqZscoresStep(). + * + * @return Flow + */ + @Bean + public Flow geneExpressionRnaSeqStep() { + return new FlowBuilder<Flow>("geneExpressionRnaSeqStep") + .start(geneExpressionRnaSeqStepDecider()) + .on("RUN").to(loadGeneExpressionGeneticProfile()) + .next(loadGeneExpressionMetadata()) + .next(geneExpressionStepBuilder("geneExpressionRnaSeqStep")) + .next(geneExpressionRnaSeqZscoresStep()) + .from(geneExpressionRnaSeqStepDecider()) + .on("SKIP").to(geneExpressionRnaSeqZscoresStep()) + .build(); + } + + /** + * Flow for importing gene expression RNAseq z-scores data. On RUN: genetic profile step, metadata step and data step, then geneExpressionAgilentStep(); on SKIP: directly geneExpressionAgilentStep(). + * + * @return Flow + */ + @Bean + public Flow geneExpressionRnaSeqZscoresStep() { + return new FlowBuilder<Flow>("geneExpressionRnaSeqZscoresStep") + .start(geneExpressionRnaSeqZscoresStepDecider()) + .on("RUN").to(loadGeneExpressionGeneticProfile()) + .next(loadGeneExpressionMetadata()) + .next(geneExpressionStepBuilder("geneExpressionRnaSeqZscoresStep")) + .next(geneExpressionAgilentStep()) + .from(geneExpressionRnaSeqZscoresStepDecider()) + .on("SKIP").to(geneExpressionAgilentStep()) + .build(); + } + + /** + * Flow for importing gene expression Agilent data.
+ * + * @return Flow + */ + @Bean + public Flow geneExpressionAgilentStep() { /* RUN: genetic profile, metadata and data steps, then geneExpressionAgilentZscoresStep(); SKIP: directly geneExpressionAgilentZscoresStep() */ + return new FlowBuilder<Flow>("geneExpressionAgilentStep") + .start(geneExpressionAgilentStepDecider()) + .on("RUN").to(loadGeneExpressionGeneticProfile()) + .next(loadGeneExpressionMetadata()) + .next(geneExpressionStepBuilder("geneExpressionAgilentStep")) + .next(geneExpressionAgilentZscoresStep()) + .from(geneExpressionAgilentStepDecider()) + .on("SKIP").to(geneExpressionAgilentZscoresStep()) + .build(); + } + + /** + * Flow for importing gene expression Agilent z-scores data. On RUN: genetic profile step, metadata step and data step, then geneExpressionRnaSeqV2Step(); on SKIP: directly geneExpressionRnaSeqV2Step(). + * + * @return Flow + */ + @Bean + public Flow geneExpressionAgilentZscoresStep() { + return new FlowBuilder<Flow>("geneExpressionAgilentZscoresStep") + .start(geneExpressionAgilentZscoresStepDecider()) + .on("RUN").to(loadGeneExpressionGeneticProfile()) + .next(loadGeneExpressionMetadata()) + .next(geneExpressionStepBuilder("geneExpressionAgilentZscoresStep")) + .next(geneExpressionRnaSeqV2Step()) + .from(geneExpressionAgilentZscoresStepDecider()) + .on("SKIP").to(geneExpressionRnaSeqV2Step()) + .build(); + } + + /** + * Flow for importing gene expression RNAseq-V2 data. On RUN: genetic profile step, metadata step and data step, then geneExpressionRnaSeqV2ZscoresStep(); on SKIP: directly geneExpressionRnaSeqV2ZscoresStep(). + * + * @return Flow + */ + @Bean + public Flow geneExpressionRnaSeqV2Step() { + return new FlowBuilder<Flow>("geneExpressionRnaSeqV2Step") + .start(geneExpressionRnaSeqV2StepDecider()) + .on("RUN").to(loadGeneExpressionGeneticProfile()) + .next(loadGeneExpressionMetadata()) + .next(geneExpressionStepBuilder("geneExpressionRnaSeqV2Step")) + .next(geneExpressionRnaSeqV2ZscoresStep()) + .from(geneExpressionRnaSeqV2StepDecider()) + .on("SKIP").to(geneExpressionRnaSeqV2ZscoresStep()) + .build(); + } + + /** + * Flow for importing gene expression RNAseq-V2 z-scores data.
+ * + * @return Flow + */ + @Bean + public Flow geneExpressionRnaSeqV2ZscoresStep() { /* RUN: genetic profile, metadata and data steps, then geneExpressionMiRnaStep(); SKIP: directly geneExpressionMiRnaStep() */ + return new FlowBuilder<Flow>("geneExpressionRnaSeqV2ZscoresStep") + .start(geneExpressionRnaSeqV2ZscoresStepDecider()) + .on("RUN").to(loadGeneExpressionGeneticProfile()) + .next(loadGeneExpressionMetadata()) + .next(geneExpressionStepBuilder("geneExpressionRnaSeqV2ZscoresStep")) + .next(geneExpressionMiRnaStep()) + .from(geneExpressionRnaSeqV2ZscoresStepDecider()) + .on("SKIP").to(geneExpressionMiRnaStep()) + .build(); + } + + /** + * Flow for importing gene expression miRNA data. On RUN: genetic profile step, metadata step and data step, then geneExpressionMiRnaMedianZscoresStep(); on SKIP: directly geneExpressionMiRnaMedianZscoresStep(). + * + * @return Flow + */ + @Bean + public Flow geneExpressionMiRnaStep() { + return new FlowBuilder<Flow>("geneExpressionMiRnaStep") + .start(geneExpressionMiRnaStepDecider()) + .on("RUN").to(loadGeneExpressionGeneticProfile()) + .next(loadGeneExpressionMetadata()) + .next(geneExpressionStepBuilder("geneExpressionMiRnaStep")) + .next(geneExpressionMiRnaMedianZscoresStep()) + .from(geneExpressionMiRnaStepDecider()) + .on("SKIP").to(geneExpressionMiRnaMedianZscoresStep()) + .build(); + } + + /** + * Flow for importing gene expression miRNA median z-scores data. On RUN: genetic profile step, metadata step and data step, then geneExpressionMiRnaMergedMedianZscoresStep(); on SKIP: directly geneExpressionMiRnaMergedMedianZscoresStep(). + * + * @return Flow + */ + @Bean + public Flow geneExpressionMiRnaMedianZscoresStep() { + return new FlowBuilder<Flow>("geneExpressionMiRnaMedianZscoresStep") + .start(geneExpressionMiRnaMedianZscoresStepDecider()) + .on("RUN").to(loadGeneExpressionGeneticProfile()) + .next(loadGeneExpressionMetadata()) + .next(geneExpressionStepBuilder("geneExpressionMiRnaMedianZscoresStep")) + .next(geneExpressionMiRnaMergedMedianZscoresStep()) + .from(geneExpressionMiRnaMedianZscoresStepDecider()) + .on("SKIP").to(geneExpressionMiRnaMergedMedianZscoresStep()) + .build(); + } + + /** + * Flow for importing gene expression miRNA merged median z-scores data.
+ * + * @return Flow + */ + @Bean + public Flow geneExpressionMiRnaMergedMedianZscoresStep() { /* RUN: genetic profile, metadata and data steps, then geneExpressionMRnaOutliersStep(); SKIP: directly geneExpressionMRnaOutliersStep() */ + return new FlowBuilder<Flow>("geneExpressionMiRnaMergedMedianZscoresStep") + .start(geneExpressionMiRnaMergedMedianZscoresStepDecider()) + .on("RUN").to(loadGeneExpressionGeneticProfile()) + .next(loadGeneExpressionMetadata()) + .next(geneExpressionStepBuilder("geneExpressionMiRnaMergedMedianZscoresStep")) + .next(geneExpressionMRnaOutliersStep()) + .from(geneExpressionMiRnaMergedMedianZscoresStepDecider()) + .on("SKIP").to(geneExpressionMRnaOutliersStep()) + .build(); + } + + /** + * Flow for importing gene expression mRNA outliers data. On RUN: genetic profile step, metadata step and data step, then geneExpressionCaptureStep(); on SKIP: directly geneExpressionCaptureStep(). + * + * @return Flow + */ + @Bean + public Flow geneExpressionMRnaOutliersStep() { + return new FlowBuilder<Flow>("geneExpressionMRnaOutliersStep") + .start(geneExpressionMRnaOutliersStepDecider()) + .on("RUN").to(loadGeneExpressionGeneticProfile()) + .next(loadGeneExpressionMetadata()) + .next(geneExpressionStepBuilder("geneExpressionMRnaOutliersStep")) + .next(geneExpressionCaptureStep()) + .from(geneExpressionMRnaOutliersStepDecider()) + .on("SKIP").to(geneExpressionCaptureStep()) + .build(); + } + + /** + * Flow for importing gene expression capture data. On RUN: genetic profile step, metadata step and data step, then geneExpressionCaptureZscoresStep(); on SKIP: directly geneExpressionCaptureZscoresStep(). + * + * @return Flow + */ + @Bean + public Flow geneExpressionCaptureStep() { + return new FlowBuilder<Flow>("geneExpressionCaptureStep") + .start(geneExpressionCaptureStepDecider()) + .on("RUN").to(loadGeneExpressionGeneticProfile()) + .next(loadGeneExpressionMetadata()) + .next(geneExpressionStepBuilder("geneExpressionCaptureStep")) + .next(geneExpressionCaptureZscoresStep()) + .from(geneExpressionCaptureStepDecider()) + .on("SKIP").to(geneExpressionCaptureZscoresStep()) + .build(); + } + + /** + * Flow for importing gene expression capture z-scores data.
+ * + * @return Flow + */ + @Bean + public Flow geneExpressionCaptureZscoresStep() { /* RUN: genetic profile, metadata and data steps, then geneExpressionOtherZscoresStep(); SKIP: directly geneExpressionOtherZscoresStep() */ + return new FlowBuilder<Flow>("geneExpressionCaptureZscoresStep") + .start(geneExpressionCaptureZscoresStepDecider()) + .on("RUN").to(loadGeneExpressionGeneticProfile()) + .next(loadGeneExpressionMetadata()) + .next(geneExpressionStepBuilder("geneExpressionCaptureZscoresStep")) + .next(geneExpressionOtherZscoresStep()) + .from(geneExpressionCaptureZscoresStepDecider()) + .on("SKIP").to(geneExpressionOtherZscoresStep()) + .build(); + } + + /** + * Flow for importing gene expression other z-scores data. On RUN: genetic profile step, metadata step and data step, then geneExpressionMRnaSeqFpkmStep(); on SKIP: directly geneExpressionMRnaSeqFpkmStep(). + * + * @return Flow + */ + @Bean + public Flow geneExpressionOtherZscoresStep() { + return new FlowBuilder<Flow>("geneExpressionOtherZscoresStep") + .start(geneExpressionOtherZscoresStepDecider()) + .on("RUN").to(loadGeneExpressionGeneticProfile()) + .next(loadGeneExpressionMetadata()) + .next(geneExpressionStepBuilder("geneExpressionOtherZscoresStep")) + .next(geneExpressionMRnaSeqFpkmStep()) + .from(geneExpressionOtherZscoresStepDecider()) + .on("SKIP").to(geneExpressionMRnaSeqFpkmStep()) + .build(); + } + + /** + * Flow for importing gene expression mRNAseq FPKM data, the last flow in the chain. On RUN: genetic profile step, metadata step and data step; on SKIP: the flow ends. + * + * @return Flow + */ + @Bean + public Flow geneExpressionMRnaSeqFpkmStep() { + return new FlowBuilder<Flow>("geneExpressionMRnaSeqFpkmStep") + .start(geneExpressionMRnaSeqFpkmStepDecider()) + .on("RUN").to(loadGeneExpressionGeneticProfile()) + .next(loadGeneExpressionMetadata()) + .next(geneExpressionStepBuilder("geneExpressionMRnaSeqFpkmStep")) + .from(geneExpressionMRnaSeqFpkmStepDecider()) + .on("SKIP").end() + .build(); + } + + /** + * Universal step builder for gene expression data: builds a chunk-oriented step (chunk size from "chunk.interval") wired to the shared reader, processor, writer and listener beans. + * + * @param stepName name given to the built step + * @return Step + */ + public Step geneExpressionStepBuilder(String stepName) { + return stepBuilderFactory.get(stepName) + .
chunk(chunkInterval) + .reader(geneExpressionDataReader()) + .processor(geneExpressionDataProcessor()) + .writer(geneExpressionDataWriter()) + .listener(geneExpressionDataListener()) + .build(); + } + + /*************************************************************************** + * Gene Expression data listener, reader, processor, and writer beans used by every step built by geneExpressionStepBuilder (reader and writer are step-scoped). + **************************************************************************/ + + @Bean + public StepExecutionListener geneExpressionDataListener() { + return new ProfileDataListener(); + } + + @Bean + @StepScope + public ItemStreamReader geneExpressionDataReader() { + return new ProfileDataReader(); + } + + @Bean + public ProfileDataProcessor geneExpressionDataProcessor() { + return new ProfileDataProcessor(); + } + + @Bean + @StepScope + public ItemStreamWriter geneExpressionDataWriter() { + return new ProfileDataWriter(); + } + + /*************************************************************************** + * Deciders for gene expression data steps. + **************************************************************************/ + + /** + * Gene expression Affymetrix step decider.
+ * Sets flow execution status to SKIP if meta file and data files do not exist + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider geneExpressionAffymetrixStepDecider() { /* RUN when "importData" is true for this datatype (which is then stored as "currentDatatype"); otherwise SKIP */ + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "affymetrix-gene-expression"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Gene expression Affymetrix z-scores step decider for datatype "affymetrix-gene-expression-zscores". + * Returns RUN when the datatype's "importData" flag in "datatypeMetadata" is true (storing the datatype as "currentDatatype" in the job execution context), otherwise SKIP - presumably when the meta file and data files do not exist (confirm where the flag is set). + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider geneExpressionAffymetrixZscoresStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "affymetrix-gene-expression-zscores"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Gene expression merged step decider.
+ * Sets flow execution status to SKIP if meta file and data files do not exist + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider geneExpressionMergedStepDecider() { /* RUN when "importData" is true for this datatype (which is then stored as "currentDatatype"); otherwise SKIP */ + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "gene-expression-merged"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Gene expression merged z-scores step decider for datatype "gene-expression-merged-zscores". + * Returns RUN when the datatype's "importData" flag in "datatypeMetadata" is true (storing the datatype as "currentDatatype" in the job execution context), otherwise SKIP - presumably when the meta file and data files do not exist (confirm where the flag is set). + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider geneExpressionMergedZscoresStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "gene-expression-merged-zscores"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Gene expression RNAseq step decider.
+ * Sets flow execution status to SKIP if meta file and data files do not exist + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider geneExpressionRnaSeqStepDecider() { /* RUN when "importData" is true for this datatype (which is then stored as "currentDatatype"); otherwise SKIP */ + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "rnaseq-gene-expression"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Gene expression RNAseq z-scores step decider for datatype "rnaseq-gene-expression-zscores". + * Returns RUN when the datatype's "importData" flag in "datatypeMetadata" is true (storing the datatype as "currentDatatype" in the job execution context), otherwise SKIP - presumably when the meta file and data files do not exist (confirm where the flag is set). + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider geneExpressionRnaSeqZscoresStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "rnaseq-gene-expression-zscores"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Gene expression Agilent step decider.
+ * Sets flow execution status to SKIP if meta file and data files do not exist + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider geneExpressionAgilentStepDecider() { /* RUN when "importData" is true for this datatype (which is then stored as "currentDatatype"); otherwise SKIP */ + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "agilent-gene-expression"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Gene expression Agilent z-scores step decider for datatype "agilent-gene-expression-zscores". + * Returns RUN when the datatype's "importData" flag in "datatypeMetadata" is true (storing the datatype as "currentDatatype" in the job execution context), otherwise SKIP - presumably when the meta file and data files do not exist (confirm where the flag is set). + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider geneExpressionAgilentZscoresStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "agilent-gene-expression-zscores"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Gene expression RNAseq-V2 step decider.
+ * Sets flow execution status to SKIP if meta file and data files do not exist + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider geneExpressionRnaSeqV2StepDecider() { /* RUN when "importData" is true for this datatype (which is then stored as "currentDatatype"); otherwise SKIP */ + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "rnaseq-v2-gene-expression"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Gene expression RNAseq-V2 z-scores step decider for datatype "rnaseq-v2-gene-expression-zscores". + * Returns RUN when the datatype's "importData" flag in "datatypeMetadata" is true (storing the datatype as "currentDatatype" in the job execution context), otherwise SKIP - presumably when the meta file and data files do not exist (confirm where the flag is set). + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider geneExpressionRnaSeqV2ZscoresStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "rnaseq-v2-gene-expression-zscores"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Gene expression miRNA step decider.
+ * Sets flow execution status to SKIP if meta file and data files do not exist + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider geneExpressionMiRnaStepDecider() { /* RUN when "importData" is true for this datatype (which is then stored as "currentDatatype"); otherwise SKIP */ + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "mirna-expression"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Gene expression miRNA median z-scores step decider for datatype "mirna-median-zscores". + * Returns RUN when the datatype's "importData" flag in "datatypeMetadata" is true (storing the datatype as "currentDatatype" in the job execution context), otherwise SKIP - presumably when the meta file and data files do not exist (confirm where the flag is set). + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider geneExpressionMiRnaMedianZscoresStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "mirna-median-zscores"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Gene expression miRNA merged median z-scores step decider.
+ * Sets flow execution status to SKIP if meta file and data files do not exist + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider geneExpressionMiRnaMergedMedianZscoresStepDecider() { /* RUN when "importData" is true for this datatype (which is then stored as "currentDatatype"); otherwise SKIP */ + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "mirna-merged-median-zscores"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Gene expression mRNA outliers step decider for datatype "mrna-outliers". + * Returns RUN when the datatype's "importData" flag in "datatypeMetadata" is true (storing the datatype as "currentDatatype" in the job execution context), otherwise SKIP - presumably when the meta file and data files do not exist (confirm where the flag is set). + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider geneExpressionMRnaOutliersStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "mrna-outliers"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Gene expression capture step decider.
+ * Sets flow execution status to SKIP if meta file and data files do not exist + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider geneExpressionCaptureStepDecider() { /* RUN when "importData" is true for this datatype (which is then stored as "currentDatatype"); otherwise SKIP */ + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "capture-gene-expression"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Gene expression capture z-scores step decider for datatype "capture-gene-expression-zscores". + * Returns RUN when the datatype's "importData" flag in "datatypeMetadata" is true (storing the datatype as "currentDatatype" in the job execution context), otherwise SKIP - presumably when the meta file and data files do not exist (confirm where the flag is set). + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider geneExpressionCaptureZscoresStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "capture-gene-expression-zscores"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Gene expression other z-scores step decider.
+ * Sets flow execution status to SKIP if meta file and data files do not exist + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider geneExpressionOtherZscoresStepDecider() { /* RUN when "importData" is true for this datatype (which is then stored as "currentDatatype"); otherwise SKIP */ + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "other-gene-expression-zscores"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Gene expression mRNAseq FPKM step decider for datatype "mrna-seq-fpkm". + * Returns RUN when the datatype's "importData" flag in "datatypeMetadata" is true (storing the datatype as "currentDatatype" in the job execution context), otherwise SKIP - presumably when the meta file and data files do not exist (confirm where the flag is set). + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider geneExpressionMRnaSeqFpkmStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "mrna-seq-fpkm"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/GisticGenesDataStep.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/GisticGenesDataStep.java new file mode 100644 index 0000000..3737a3d ---
/dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/GisticGenesDataStep.java @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +package org.cbio.portal.pipelines.importer.config.step; + +import org.mskcc.cbio.model.Gistic; +import org.cbio.portal.pipelines.importer.model.GisticRecord; +import org.cbio.portal.pipelines.importer.config.tasklet.GisticMetadataTasklet; +import org.cbio.portal.pipelines.importer.config.listener.GisticDataListener; +import org.cbio.portal.pipelines.importer.config.reader.GisticDataReader; +import org.cbio.portal.pipelines.importer.config.processor.GisticDataProcessor; +import org.cbio.portal.pipelines.importer.config.writer.GisticDataWriter; + +import java.util.*; +import org.apache.commons.collections.map.MultiKeyMap; +import org.apache.commons.logging.*; + +import org.springframework.batch.core.*; +import org.springframework.batch.core.configuration.annotation.*; +import org.springframework.batch.core.job.builder.FlowBuilder; +import org.springframework.batch.core.job.flow.*; +import org.springframework.batch.core.scope.context.ChunkContext; +import org.springframework.batch.core.step.tasklet.Tasklet; +import org.springframework.batch.item.*; +import org.springframework.batch.repeat.RepeatStatus; + +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.context.annotation.*; + +/** + * Spring Batch configuration for the "importGisticGenesData" step, which is built from the flow returned by gisticGenesStepFlow(); the chunk size is read from the "chunk.interval" property. + * @author ochoaa + */ +@Configuration +@EnableBatchProcessing +public class GisticGenesDataStep { + + public static final String IMPORT_GISTIC_GENES_DATA = "importGisticGenesData"; + + @Value("${chunk.interval}") + private int chunkInterval; + + @Autowired + public StepBuilderFactory stepBuilderFactory; + + private static final Log LOG = LogFactory.getLog(GisticGenesDataStep.class); + + @Bean + public Step importGisticGenesData() { + return stepBuilderFactory.get(IMPORT_GISTIC_GENES_DATA) + .flow(gisticGenesStepFlow()) + .build(); + } + + /** + * Gistic data flow.
+ * Execution of gistic data steps will begin with "gisticGenesAmpStep" and will + * execute conditionally based on their respective deciders + * + * @return Flow + */ + @Bean + public Flow gisticGenesStepFlow() { + return new FlowBuilder("gisticGenesStepFlow") + .start(initGisticGenesImportStep()) + .next(gisticGenesAmpStep()) + .build(); + } + + /** + * Step to initiate flow for gistic data steps. + * + * @return Step + */ + @Bean + public Step initGisticGenesImportStep() { + return stepBuilderFactory.get("initGisticGenesImportStep") + .tasklet((StepContribution sc, ChunkContext cc) -> { + return RepeatStatus.FINISHED; + }).build(); + } + + /*************************************************************************** + * Gistic data import steps and flows. + **************************************************************************/ + + /** + * Step for implementing gistic metadata tasklet. + * + * @return Step + */ + @Bean + public Step loadGisticMetadata() { + return stepBuilderFactory.get("loadGisticMetadata") + .tasklet(gisticMetadataTasklet()) + .build(); + } + + /** + * Tasklet for loading gistic metadata. + * + * @return Tasklet + */ + @Bean + @StepScope + public Tasklet gisticMetadataTasklet() { + return new GisticMetadataTasklet(); + } + + /** + * Gistic amp data import flow. + * + * @return Flow + */ + @Bean + public Flow gisticGenesAmpStep() { + return new FlowBuilder("gisticGenesAmpStep") + .start(gisticGenesAmpStepDecider()) + .on("RUN").to(loadGisticMetadata()) + .next(gisticStepBuilder("gisticGenesAmpStep")) + .next(gisticGenesDelStep()) + .from(gisticGenesAmpStepDecider()) + .on("SKIP").to(gisticGenesDelStep()) + .build(); + } + + /** + * Gistic del data import flow. 
+ * + * @return Flow + */ + @Bean + public Flow gisticGenesDelStep() { + return new FlowBuilder("gisticGenesDelStep") + .start(gisticGenesDelStepDecider()) + .on("RUN").to(loadGisticMetadata()) + .next(gisticStepBuilder("gisticGenesDelStep")) + .from(gisticGenesDelStepDecider()) + .on("SKIP").end() + .build(); + } + + /** + * Universal step builder for gistic data. + * + * @param stepName + * @return Step + */ + public Step gisticStepBuilder(String stepName) { + return stepBuilderFactory.get(stepName) + . chunk(chunkInterval) + .reader(gisticDataReader()) + .processor(gisticDataProcessor()) + .writer(gisticDataWriter()) + .listener(gisticDataListener()) + .build(); + } + + /*************************************************************************** + * Gistic data listener, reader, processor, and writer. + **************************************************************************/ + + @Bean + public StepExecutionListener gisticDataListener() { + return new GisticDataListener(); + } + + @Bean + @StepScope + public ItemStreamReader gisticDataReader() { + return new GisticDataReader(); + } + + @Bean + public GisticDataProcessor gisticDataProcessor() { + return new GisticDataProcessor(); + } + + @Bean + @StepScope + public ItemStreamWriter gisticDataWriter() { + return new GisticDataWriter(); + } + + /*************************************************************************** + * Deciders for gistic data step. + **************************************************************************/ + + /** + * Gistic amp step decider. 
+ * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider gisticGenesAmpStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "gistic-genes-amp"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Gistic del step decider. + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider gisticGenesDelStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "gistic-genes-del"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/MethylationDataStep.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/MethylationDataStep.java new file mode 100644 index 0000000..876c2ca --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/MethylationDataStep.java @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. 
+ * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package org.cbio.portal.pipelines.importer.config.step;
+
+import org.cbio.portal.pipelines.importer.config.tasklet.*;
+import org.cbio.portal.pipelines.importer.model.ProfileDataRecord;
+import org.cbio.portal.pipelines.importer.config.listener.ProfileDataListener;
+import org.cbio.portal.pipelines.importer.config.reader.ProfileDataReader;
+import org.cbio.portal.pipelines.importer.config.processor.ProfileDataProcessor;
+import org.cbio.portal.pipelines.importer.config.writer.ProfileDataWriter;
+
+import java.util.*;
+import org.apache.commons.collections.map.MultiKeyMap;
+import org.apache.commons.logging.*;
+
+import org.springframework.batch.core.*;
+import org.springframework.batch.core.configuration.annotation.*;
+import org.springframework.batch.core.job.builder.FlowBuilder;
+import org.springframework.batch.core.job.flow.*;
+import org.springframework.batch.core.scope.context.ChunkContext;
+import org.springframework.batch.core.step.tasklet.Tasklet;
+import org.springframework.batch.item.*;
+import org.springframework.batch.repeat.RepeatStatus;
+
+import org.springframework.beans.factory.annotation.*;
+import org.springframework.context.annotation.*;
+
+/**
+ * Spring Batch configuration for the "importMethylationData" step (methylation-hm27, then methylation-hm450).
+ * @author ochoaa
+ */
+@Configuration
+@EnableBatchProcessing
+public class MethylationDataStep {
+
+    public static final String IMPORT_METHYLATION_DATA = "importMethylationData";
+
+    @Value("${chunk.interval}")
+    private int chunkInterval;
+
+    @Autowired
+    public StepBuilderFactory stepBuilderFactory;
+
+    private static final Log LOG = LogFactory.getLog(MethylationDataStep.class);
+
+    @Bean
+    public Step importMethylationData() {
+        return stepBuilderFactory.get(IMPORT_METHYLATION_DATA)
+                .flow(methylationStepFlow())
+                .build();
+    }
+
+    /**
+     * Methylation data flow.
+     * Execution of methylation data steps will begin with "methylationHm27Step" and will
+     * execute conditionally based on their respective deciders
+     * (the flow itself starts with the no-op step "initMethylationImportStep").
+     * @return Flow
+     */
+    @Bean
+    public Flow methylationStepFlow() {
+        // execute methylation data steps sequentially starting with methylationHm27Step
+        return new FlowBuilder<Flow>("methylationStepFlow")
+                .start(initMethylationImportStep())
+                .next(methylationHm27Step())
+                .build();
+    }
+
+    /**
+     * Step to initiate flow for methylation data steps.
+     * Its tasklet does no work and finishes immediately.
+     * @return Step
+     */
+    @Bean
+    public Step initMethylationImportStep() {
+        return stepBuilderFactory.get("initMethylationImportStep")
+                .tasklet((StepContribution sc, ChunkContext cc) -> {
+                    return RepeatStatus.FINISHED;
+                }).build();
+    }
+
+    /***************************************************************************
+     * Methylation data import steps and flows.
+     **************************************************************************/
+
+    /**
+     * Step for implementing genetic profile tasklet.
+     *
+     * @return Step
+     */
+    @Bean
+    public Step loadMethylationGeneticProfile() {
+        return stepBuilderFactory.get("loadMethylationGeneticProfile")
+                .tasklet(methylationGeneticProfileTasklet())
+                .build();
+    }
+
+    /**
+     * Tasklet for loading and importing genetic profile from meta file.
+     *
+     * @return Tasklet
+     */
+    @Bean
+    @StepScope
+    public Tasklet methylationGeneticProfileTasklet() {
+        return new GeneticProfileTasklet();
+    }
+
+    /**
+     * Step for implementing profile metadata tasklet.
+     *
+     * @return Step
+     */
+    @Bean
+    public Step loadMethylationMetadata() {
+        return stepBuilderFactory.get("loadMethylationMetadata")
+                .tasklet(loadMethylationMetadataTasklet())
+                .build();
+    }
+
+    /**
+     * Tasklet for loading profile metadata from profile datafiles.
+     *
+     * @return Tasklet
+     */
+    @Bean
+    @StepScope
+    public Tasklet loadMethylationMetadataTasklet() {
+        return new ProfileMetadataTasklet();
+    }
+
+    /**
+     * Flow for importing methylation-hm27 data.
+     * RUN: genetic profile step, metadata step, chunk import step, then the hm450 flow. SKIP: straight to the hm450 flow.
+     * @return Flow
+     */
+    @Bean
+    public Flow methylationHm27Step() {
+        return new FlowBuilder<Flow>("methylationHm27Step")
+                .start(methylationHm27StepDecider())
+                    .on("RUN").to(loadMethylationGeneticProfile())
+                    .next(loadMethylationMetadata())
+                    .next(methylationStepBuilder("methylationHm27Step"))
+                    .next(methylationHm450Step())
+                .from(methylationHm27StepDecider())
+                    .on("SKIP").to(methylationHm450Step())
+                .build();
+    }
+
+    /**
+     * Flow for importing methylation-hm450 data.
+     * RUN: genetic profile step, metadata step, then the chunk import step. SKIP: the flow ends.
+     * @return Flow
+     */
+    @Bean
+    public Flow methylationHm450Step() {
+        return new FlowBuilder<Flow>("methylationHm450Step")
+                .start(methylationHm450StepDecider())
+                    .on("RUN").to(loadMethylationGeneticProfile())
+                    .next(loadMethylationMetadata())
+                    .next(methylationStepBuilder("methylationHm450Step"))
+                .from(methylationHm450StepDecider())
+                    .on("SKIP").end()
+                .build();
+    }
+
+    /**
+     * Universal step builder for methylation data.
+     * NOTE(review): ProfileDataRecord is assumed for both item types (only model import here) - confirm against ProfileDataProcessor.
+     * @param stepName name given to the chunk-oriented step
+     * @return Step
+     */
+    public Step methylationStepBuilder(String stepName) {
+        return stepBuilderFactory.get(stepName)
+                .<ProfileDataRecord, ProfileDataRecord> chunk(chunkInterval)
+                .reader(methylationDataReader())
+                .processor(methylationDataProcessor())
+                .writer(methylationDataWriter())
+                .listener(methylationDataListener())
+                .build();
+    }
+
+    /***************************************************************************
+     * Methylation data listener, reader, processor, and writer.
+     **************************************************************************/
+
+    @Bean
+    public StepExecutionListener methylationDataListener() {
+        return new ProfileDataListener();
+    }
+
+    @Bean
+    @StepScope
+    public ItemStreamReader<ProfileDataRecord> methylationDataReader() {
+        return new ProfileDataReader();
+    }
+
+    @Bean
+    public ProfileDataProcessor methylationDataProcessor() {
+        return new ProfileDataProcessor();
+    }
+
+    @Bean
+    @StepScope
+    public ItemStreamWriter<ProfileDataRecord> methylationDataWriter() {
+        return new ProfileDataWriter();
+    }
+
+    /***************************************************************************
+     * Deciders for methylation data steps.
+     **************************************************************************/
+
+    /**
+     * Methylation-hm27 step decider.
+     * Sets flow execution status to SKIP if meta file and data files do not exist
+     * (as reported by the "importData" flag); on RUN the datatype is stored under "currentDatatype".
+     * @return JobExecutionDecider
+     */
+    @Bean
+    public JobExecutionDecider methylationHm27StepDecider() {
+        return (JobExecution jobExecution, StepExecution stepExecution) -> {
+            MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata");
+            String datatype = "methylation-hm27";
+
+            List<?> logMessages = (List<?>) datatypeMetadata.get(datatype, "logMessages");
+            LOG.info(logMessages.get(0));
+            if ((boolean) datatypeMetadata.get(datatype, "importData")) {
+                LOG.info(logMessages.get(1));
+                jobExecution.getExecutionContext().put("currentDatatype", datatype);
+                return new FlowExecutionStatus("RUN");
+            }
+            else {
+                LOG.warn(logMessages.get(1));
+                return new FlowExecutionStatus("SKIP");
+            }
+        };
+    }
+
+    /**
+     * Methylation-hm450 step decider.
+     * Sets flow execution status to SKIP if meta file and data files do not exist
+     * (as reported by the "importData" flag); on RUN the datatype is stored under "currentDatatype".
+     * @return JobExecutionDecider
+     */
+    @Bean
+    public JobExecutionDecider methylationHm450StepDecider() {
+        return (JobExecution jobExecution, StepExecution stepExecution) -> {
+            MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata");
+            String datatype = "methylation-hm450";
+
+            List<?> logMessages = (List<?>) datatypeMetadata.get(datatype, "logMessages");
+            LOG.info(logMessages.get(0));
+            if ((boolean) datatypeMetadata.get(datatype, "importData")) {
+                LOG.info(logMessages.get(1));
+                jobExecution.getExecutionContext().put("currentDatatype", datatype);
+                return new FlowExecutionStatus("RUN");
+            }
+            else {
+                LOG.warn(logMessages.get(1));
+                return new FlowExecutionStatus("SKIP");
+            }
+        };
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/MutSigDataStep.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/MutSigDataStep.java
new file mode 100644
index 0000000..cd77f89
--- /dev/null
+++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/MutSigDataStep.java
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS
+ * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder
+ * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no
+ * obligations to provide maintenance, support, updates, enhancements or
+ * modifications.
In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package org.cbio.portal.pipelines.importer.config.step;
+
+import org.mskcc.cbio.model.MutSig;
+import org.cbio.portal.pipelines.importer.model.MutSigRecord;
+import org.cbio.portal.pipelines.importer.config.tasklet.MutSigMetadataTasklet;
+import org.cbio.portal.pipelines.importer.config.listener.MutSigDataListener;
+import org.cbio.portal.pipelines.importer.config.reader.MutSigDataReader;
+import org.cbio.portal.pipelines.importer.config.processor.MutSigDataProcessor;
+import org.cbio.portal.pipelines.importer.config.writer.MutSigDataWriter;
+
+import java.util.*;
+import org.apache.commons.collections.map.MultiKeyMap;
+import org.apache.commons.logging.*;
+
+import org.springframework.batch.core.*;
+import org.springframework.batch.core.configuration.annotation.*;
+import org.springframework.batch.core.job.builder.FlowBuilder;
+import org.springframework.batch.core.job.flow.*;
+import org.springframework.batch.core.step.tasklet.Tasklet;
+import org.springframework.batch.item.*;
+
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.context.annotation.*;
+
+/**
+ * Spring Batch configuration for the "importMutSigData" step (datatype "mutation-significance-v2").
+ * @author ochoaa
+ */
+@Configuration
+@EnableBatchProcessing
+public class MutSigDataStep {
+
+    public static final String IMPORT_MUT_SIG_DATA = "importMutSigData";
+
+    @Value("${chunk.interval}")
+    private int chunkInterval;
+
+    @Autowired
+    public StepBuilderFactory stepBuilderFactory;
+
+    private static final Log LOG = LogFactory.getLog(MutSigDataStep.class);
+
+    @Bean
+    public Step importMutSigData() {
+        return stepBuilderFactory.get(IMPORT_MUT_SIG_DATA)
+                .flow(mutSigStepFlow())
+                .build();
+    }
+
+    /***************************************************************************
+     * Mutsig data import steps and flows.
+     **************************************************************************/
+
+    /**
+     * Mutsig data flow.
+     * Execution of mutsig data import is dependent on whether the mutsig metafile and
+     * datafile exist or not.
+     * If not then mutsig data import is skipped
+     * (RUN: metadata step, then the chunk import step; SKIP: the flow ends).
+     * @return Flow
+     */
+    @Bean
+    public Flow mutSigStepFlow() {
+        return new FlowBuilder<Flow>("mutSigStepFlow")
+                .start(mutSigStepDecider())
+                    .on("RUN").to(loadMutSigMetadata())
+                    .next(mutSigStep())
+                .from(mutSigStepDecider())
+                    .on("SKIP").end()
+                .build();
+    }
+
+    /**
+     * Step for implementing mutsig metadata tasklet.
+     *
+     * @return Step
+     */
+    @Bean
+    public Step loadMutSigMetadata() {
+        return stepBuilderFactory.get("loadMutSigMetadata")
+                .tasklet(mutSigMetadataTasklet())
+                .build();
+    }
+
+    /**
+     * Tasklet for loading mutsig metadata.
+     *
+     * @return Tasklet
+     */
+    @Bean
+    @StepScope
+    public Tasklet mutSigMetadataTasklet() {
+        return new MutSigMetadataTasklet();
+    }
+
+    /**
+     * Mutsig data import step.
+     * NOTE(review): item types (MutSigRecord in, MutSig out) are taken from this file's imports - confirm against MutSigDataProcessor.
+     * @return Step
+     */
+    @Bean
+    public Step mutSigStep() {
+        return stepBuilderFactory.get("mutSigStep")
+                .<MutSigRecord, MutSig> chunk(chunkInterval)
+                .reader(mutSigDataReader())
+                .processor(mutSigDataProcessor())
+                .writer(mutSigDataWriter())
+                .listener(mutSigDataListener())
+                .build();
+    }
+
+    /***************************************************************************
+     * Mutsig data listener, reader, processor, and writer.
+     **************************************************************************/
+
+    @Bean
+    public StepExecutionListener mutSigDataListener() {
+        return new MutSigDataListener();
+    }
+
+    @Bean
+    @StepScope
+    public ItemStreamReader<MutSigRecord> mutSigDataReader() {
+        return new MutSigDataReader();
+    }
+
+    @Bean
+    public MutSigDataProcessor mutSigDataProcessor() {
+        return new MutSigDataProcessor();
+    }
+
+    @Bean
+    @StepScope
+    public ItemStreamWriter<MutSig> mutSigDataWriter() {
+        return new MutSigDataWriter();
+    }
+
+    /***************************************************************************
+     * Deciders for mutsig data step.
+     **************************************************************************/
+
+    /**
+     * Mutsig step decider.
+     * Returns RUN and stores the datatype under "currentDatatype" when "importData" is true; otherwise SKIP.
+     * @return JobExecutionDecider
+     */
+    @Bean
+    public JobExecutionDecider mutSigStepDecider() {
+        return (JobExecution jobExecution, StepExecution stepExecution) -> {
+            MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata");
+            String datatype = "mutation-significance-v2";
+
+            List<?> logMessages = (List<?>) datatypeMetadata.get(datatype, "logMessages");
+            LOG.info(logMessages.get(0));
+            if ((boolean) datatypeMetadata.get(datatype, "importData")) {
+                LOG.info(logMessages.get(1));
+                jobExecution.getExecutionContext().put("currentDatatype", datatype);
+                return new FlowExecutionStatus("RUN");
+            }
+            else {
+                LOG.warn(logMessages.get(1));
+                return new FlowExecutionStatus("SKIP");
+            }
+        };
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/MutationDataStep.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/MutationDataStep.java
new file mode 100644
index 0000000..51cc7c5
--- /dev/null
+++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/MutationDataStep.java
@@ -0,0 +1,340 @@
+/*
+ * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS
+ * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder
+ * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no
+ * obligations to provide maintenance, support, updates, enhancements or
+ * modifications.
In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package org.cbio.portal.pipelines.importer.config.step;
+
+import org.cbio.portal.pipelines.importer.model.MafRecord;
+import org.cbio.portal.pipelines.importer.config.composite.CompositeMutationData;
+import org.cbio.portal.pipelines.importer.config.tasklet.*;
+import org.cbio.portal.pipelines.importer.config.listener.MutationDataListener;
+import org.cbio.portal.pipelines.importer.config.reader.MutationDataReader;
+import org.cbio.portal.pipelines.importer.config.processor.MutationDataProcessor;
+import org.cbio.portal.pipelines.importer.config.writer.MutationDataWriter;
+
+import java.util.*;
+import org.apache.commons.collections.map.MultiKeyMap;
+import org.apache.commons.logging.*;
+
+import org.springframework.batch.core.*;
+import org.springframework.batch.core.configuration.annotation.*;
+import org.springframework.batch.core.job.builder.FlowBuilder;
+import org.springframework.batch.core.job.flow.*;
+import org.springframework.batch.core.scope.context.ChunkContext;
+import org.springframework.batch.core.step.tasklet.Tasklet;
+import org.springframework.batch.item.*;
+import org.springframework.batch.repeat.RepeatStatus;
+
+import org.springframework.beans.factory.annotation.*;
+import org.springframework.context.annotation.*;
+
+/**
+ * Spring Batch configuration for the "importMutationData" step (mutation, mutation-germline, mutation-manual).
+ * @author ochoaa
+ */
+@Configuration
+@EnableBatchProcessing
+public class MutationDataStep {
+
+    public static final String IMPORT_MUTATION_DATA = "importMutationData";
+
+    @Value("${chunk.interval}")
+    private int chunkInterval;
+
+    @Autowired
+    public StepBuilderFactory stepBuilderFactory;
+
+    private static final Log LOG = LogFactory.getLog(MutationDataStep.class);
+
+    @Bean
+    public Step importMutationData() {
+        return stepBuilderFactory.get(IMPORT_MUTATION_DATA)
+                .flow(mutationStepFlow())
+                .build();
+    }
+
+    /**
+     * Mutation data flow.
+     * Execution of mutation data steps will begin with "mutationStep" and will
+     * execute conditionally based on their respective deciders
+     * (the flow itself starts with the no-op step "initMutationImportStep").
+     * @return Flow
+     */
+    @Bean
+    public Flow mutationStepFlow() {
+        // execute mutation data steps sequentially starting with mutationStep
+        return new FlowBuilder<Flow>("mutationStepFlow")
+                .start(initMutationImportStep())
+                .next(mutationStep())
+                .build();
+    }
+
+    /**
+     * Step to initiate flow for mutation data steps.
+     * Its tasklet does no work and finishes immediately.
+     * @return Step
+     */
+    @Bean
+    public Step initMutationImportStep() {
+        return stepBuilderFactory.get("initMutationImportStep")
+                .tasklet((StepContribution sc, ChunkContext cc) -> {
+                    return RepeatStatus.FINISHED;
+                }).build();
+    }
+
+    /**************************************************************************
+     * Mutation data import steps and flows.
+     **************************************************************************/
+
+    /**
+     * Step for implementing genetic profile tasklet.
+     *
+     * @return Step
+     */
+    @Bean
+    public Step loadMutationGeneticProfile() {
+        return stepBuilderFactory.get("loadMutationGeneticProfile")
+                .tasklet(mutationGeneticProfileTasklet())
+                .build();
+    }
+
+    /**
+     * Tasklet for loading and importing genetic profile from meta file.
+     *
+     * @return Tasklet
+     */
+    @Bean
+    @StepScope
+    public Tasklet mutationGeneticProfileTasklet() {
+        return new GeneticProfileTasklet();
+    }
+
+    /**
+     * Step for implementing mutation metadata tasklet.
+     *
+     * @return Step
+     */
+    @Bean
+    public Step loadMutationMetadata() {
+        return stepBuilderFactory.get("loadMutationMetadata")
+                .tasklet(loadMutationMetadataTasklet())
+                .build();
+    }
+
+    /**
+     * Tasklet for loading metadata from MAF files.
+     *
+     * @return Tasklet
+     */
+    @Bean
+    @StepScope
+    public Tasklet loadMutationMetadataTasklet() {
+        return new MutationMetadataTasklet();
+    }
+
+    /**
+     * Flow for importing mutation data.
+     * RUN: genetic profile step, metadata step, chunk import step, then the germline flow. SKIP: straight to the germline flow.
+     * @return Flow
+     */
+    @Bean
+    public Flow mutationStep() {
+        return new FlowBuilder<Flow>("mutationStep")
+                .start(mutationStepDecider())
+                    .on("RUN").to(loadMutationGeneticProfile())
+                    .next(loadMutationMetadata())
+                    .next(mutationStepBuilder("mutationStep"))
+                    .next(mutationGermlineStep())
+                .from(mutationStepDecider())
+                    .on("SKIP").to(mutationGermlineStep())
+                .build();
+    }
+
+    /**
+     * Flow for importing germline mutation data.
+     * RUN: genetic profile step, metadata step, chunk import step, then the manual flow. SKIP: straight to the manual flow.
+     * @return Flow
+     */
+    @Bean
+    public Flow mutationGermlineStep() {
+        return new FlowBuilder<Flow>("mutationGermlineStep")
+                .start(mutationGermlineStepDecider())
+                    .on("RUN").to(loadMutationGeneticProfile())
+                    .next(loadMutationMetadata())
+                    .next(mutationStepBuilder("mutationGermlineStep"))
+                    .next(mutationManualStep())
+                .from(mutationGermlineStepDecider())
+                    .on("SKIP").to(mutationManualStep())
+                .build();
+    }
+
+    /**
+     * Flow for importing manually curated mutation data.
+     * RUN: genetic profile step, metadata step, then the chunk import step. SKIP: the flow ends.
+     * @return Flow
+     */
+    @Bean
+    public Flow mutationManualStep() {
+        return new FlowBuilder<Flow>("mutationManualStep")
+                .start(mutationManualStepDecider())
+                    .on("RUN").to(loadMutationGeneticProfile())
+                    .next(loadMutationMetadata())
+                    .next(mutationStepBuilder("mutationManualStep"))
+                .from(mutationManualStepDecider())
+                    .on("SKIP").end()
+                .build();
+    }
+
+    /**
+     * Universal step builder for mutation data.
+     * NOTE(review): item types (MafRecord in, CompositeMutationData out) are taken from this file's imports - confirm against MutationDataProcessor.
+     * @param stepName name given to the chunk-oriented step
+     * @return Step
+     */
+    public Step mutationStepBuilder(String stepName) {
+        return stepBuilderFactory.get(stepName)
+                .<MafRecord, CompositeMutationData> chunk(chunkInterval)
+                .reader(mutationDataReader())
+                .processor(mutationDataProcessor())
+                .writer(mutationDataWriter())
+                .listener(mutationDataListener())
+                .build();
+    }
+
+    /***************************************************************************
+     * Mutation data listener, reader, processor, and writer.
+     **************************************************************************/
+
+    @Bean
+    public StepExecutionListener mutationDataListener() {
+        return new MutationDataListener();
+    }
+
+    @Bean
+    @StepScope
+    public ItemStreamReader<MafRecord> mutationDataReader() {
+        return new MutationDataReader();
+    }
+
+    @Bean
+    public MutationDataProcessor mutationDataProcessor() {
+        return new MutationDataProcessor();
+    }
+
+    @Bean
+    @StepScope
+    public ItemStreamWriter<CompositeMutationData> mutationDataWriter() {
+        return new MutationDataWriter();
+    }
+
+    /*******************************************************************
+     * Deciders for mutation data steps.
+     *******************************************************************/
+
+    /**
+     * Mutation step decider.
+     * Sets flow execution status to SKIP if meta file and data files do not exist
+     * (as reported by the "importData" flag); on RUN the datatype is stored under "currentDatatype".
+     * @return JobExecutionDecider
+     */
+    @Bean
+    public JobExecutionDecider mutationStepDecider() {
+        return (JobExecution jobExecution, StepExecution stepExecution) -> {
+            MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata");
+            String datatype = "mutation";
+
+            List<?> logMessages = (List<?>) datatypeMetadata.get(datatype, "logMessages");
+            LOG.info(logMessages.get(0));
+            if ((boolean) datatypeMetadata.get(datatype, "importData")) {
+                LOG.info(logMessages.get(1));
+                jobExecution.getExecutionContext().put("currentDatatype", datatype);
+                return new FlowExecutionStatus("RUN");
+            }
+            else {
+                LOG.warn(logMessages.get(1));
+                return new FlowExecutionStatus("SKIP");
+            }
+        };
+    }
+
+    /**
+     * Mutation Germline step decider.
+     * Sets flow execution status to SKIP if meta file and data files do not exist
+     * (as reported by the "importData" flag); on RUN the datatype is stored under "currentDatatype".
+     * @return JobExecutionDecider
+     */
+    @Bean
+    public JobExecutionDecider mutationGermlineStepDecider() {
+        return (JobExecution jobExecution, StepExecution stepExecution) -> {
+            MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata");
+            String datatype = "mutation-germline";
+
+            List<?> logMessages = (List<?>) datatypeMetadata.get(datatype, "logMessages");
+            LOG.info(logMessages.get(0));
+            if ((boolean) datatypeMetadata.get(datatype, "importData")) {
+                LOG.info(logMessages.get(1));
+                jobExecution.getExecutionContext().put("currentDatatype", datatype);
+                return new FlowExecutionStatus("RUN");
+            }
+            else {
+                LOG.warn(logMessages.get(1));
+                return new FlowExecutionStatus("SKIP");
+            }
+        };
+    }
+
+    /**
+     * Mutation Manual step decider.
+     * Sets flow execution status to SKIP if meta file and data files do not exist
+     * (as reported by the "importData" flag); on RUN the datatype is stored under "currentDatatype".
+     * @return JobExecutionDecider
+     */
+    @Bean
+    public JobExecutionDecider mutationManualStepDecider() {
+        return (JobExecution jobExecution, StepExecution stepExecution) -> {
+            MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata");
+            String datatype = "mutation-manual";
+
+            List<?> logMessages = (List<?>) datatypeMetadata.get(datatype, "logMessages");
+            LOG.info(logMessages.get(0));
+            if ((boolean) datatypeMetadata.get(datatype, "importData")) {
+                LOG.info(logMessages.get(1));
+                jobExecution.getExecutionContext().put("currentDatatype", datatype);
+                return new FlowExecutionStatus("RUN");
+            }
+            else {
+                LOG.warn(logMessages.get(1));
+                return new FlowExecutionStatus("SKIP");
+            }
+        };
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/ProteinLevelDataStep.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/ProteinLevelDataStep.java
new file mode 100644
index 0000000..0b16af0
--- /dev/null
+++
b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/ProteinLevelDataStep.java @@ -0,0 +1,339 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +package org.cbio.portal.pipelines.importer.config.step; + +import org.cbio.portal.pipelines.importer.config.tasklet.*; +import org.cbio.portal.pipelines.importer.model.ProfileDataRecord; +import org.cbio.portal.pipelines.importer.config.listener.ProfileDataListener; +import org.cbio.portal.pipelines.importer.config.reader.ProfileDataReader; +import org.cbio.portal.pipelines.importer.config.processor.ProfileDataProcessor; +import org.cbio.portal.pipelines.importer.config.writer.ProfileDataWriter; + +import java.util.*; +import org.apache.commons.collections.map.MultiKeyMap; +import org.apache.commons.logging.*; + +import org.springframework.batch.core.*; +import org.springframework.batch.core.configuration.annotation.*; +import org.springframework.batch.core.job.builder.FlowBuilder; +import org.springframework.batch.core.job.flow.*; +import org.springframework.batch.core.scope.context.ChunkContext; +import org.springframework.batch.core.step.tasklet.Tasklet; +import org.springframework.batch.item.*; +import org.springframework.batch.repeat.RepeatStatus; + +import org.springframework.beans.factory.annotation.*; +import org.springframework.context.annotation.*; + +/** + * + * @author ochoaa + */ +@Configuration +@EnableBatchProcessing +public class ProteinLevelDataStep { + + public static final String IMPORT_PROTEIN_LEVEL_DATA = "importProteinLevelData"; + + @Value("${chunk.interval}") + private int chunkInterval; + + @Autowired + public StepBuilderFactory stepBuilderFactory; + + private static final Log LOG = LogFactory.getLog(ProteinLevelDataStep.class); + + @Bean + public Step importProteinLevelData() { + return stepBuilderFactory.get(IMPORT_PROTEIN_LEVEL_DATA) + .flow(proteinLevelStepFlow()) + .build(); + } + + /** + * Protein-level data flow. 
+ * Execution of protein-level data steps will begin with "rppaStep" and will + * execute conditionally based on their respective deciders + * + * @return Flow + */ + @Bean + public Flow proteinLevelStepFlow() { + // execute protein-level data steps sequentially starting with rppaStep + return new FlowBuilder("proteinLevelStepFlow") + .start(initProteinLevelImportStep()) + .next(rppaStep()) + .build(); + } + + /** + * Step to initiate flow for protein-level data steps. + * + * @return Step + */ + @Bean + public Step initProteinLevelImportStep() { + return stepBuilderFactory.get("initProteinLevelImportStep") + .tasklet((StepContribution sc, ChunkContext cc) -> { + return RepeatStatus.FINISHED; + }).build(); + } + + /*************************************************************************** + * Protein-level data import steps and flows. + **************************************************************************/ + + /** + * Step for implementing genetic profile tasklet. + * + * @return Step + */ + @Bean + public Step loadProteinLevelGeneticProfile() { + return stepBuilderFactory.get("loadProteinLevelGeneticProfile") + .tasklet(proteinLevelGeneticProfileTasklet()) + .build(); + } + + /** + * Tasklet for loading and importing genetic profile from meta file. + * + * @return Tasklet + */ + @Bean + @StepScope + public Tasklet proteinLevelGeneticProfileTasklet() { + return new GeneticProfileTasklet(); + } + + /** + * Step for implementing profile metadata tasklet. + * + * @return Step + */ + @Bean + public Step loadProteinLevelMetadata() { + return stepBuilderFactory.get("loadProteinLevelMetadata") + .tasklet(loadProteinLevelMetadataTasklet()) + .build(); + } + + /** + * Tasklet for loading profile metadata from profile datafiles. + * + * @return Tasklet + */ + @Bean + @StepScope + public Tasklet loadProteinLevelMetadataTasklet() { + return new ProfileMetadataTasklet(); + } + + /** + * Flow for importing RPPA data. 
+ * + * @return Flow + */ + @Bean + public Flow rppaStep() { + return new FlowBuilder("rppaStep") + .start(rppaStepDecider()) + .on("RUN").to(loadProteinLevelGeneticProfile()) + .next(loadProteinLevelMetadata()) + .next(proteinLevelStepBuilder("rppaStep")) + .next(rppaZscoresStep()) + .from(rppaStepDecider()) + .on("SKIP").to(rppaZscoresStep()) + .build(); + } + + /** + * Flow for importing RPPA Z-scores data. + * + * @return Flow + */ + @Bean + public Flow rppaZscoresStep() { + return new FlowBuilder("rppaZscoresStep") + .start(rppaZscoresStepDecider()) + .on("RUN").to(loadProteinLevelGeneticProfile()) + .next(loadProteinLevelMetadata()) + .next(proteinLevelStepBuilder("rppaZscoresStep")) + .next(proteinQuantificationStep()) + .from(rppaZscoresStepDecider()) + .on("SKIP").to(proteinQuantificationStep()) + .build(); + } + + /** + * Flow for importing protein quantification data. + * + * @return Flow + */ + @Bean + public Flow proteinQuantificationStep() { + return new FlowBuilder("proteinQuantificationStep") + .start(proteinQuantificationStepDecider()) + .on("RUN").to(loadProteinLevelGeneticProfile()) + .next(loadProteinLevelMetadata()) + .next(proteinLevelStepBuilder("proteinQuantificationStep")) + .from(proteinQuantificationStepDecider()) + .on("SKIP").end() + .build(); + } + + /** + * Universal step builder for protein-level data. + * + * @param stepName + * @return Step + */ + public Step proteinLevelStepBuilder(String stepName) { + return stepBuilderFactory.get(stepName) + . chunk(chunkInterval) + .reader(proteinLevelDataReader()) + .processor(proteinLevelDataProcessor()) + .writer(proteinLevelDataWriter()) + .listener(proteinLevelDataListener()) + .build(); + } + + /*************************************************************************** + * Protein-level data listener, reader, processor, and writer. 
+ **************************************************************************/ + + @Bean + public StepExecutionListener proteinLevelDataListener() { + return new ProfileDataListener(); + } + + @Bean + @StepScope + public ItemStreamReader proteinLevelDataReader() { + return new ProfileDataReader(); + } + + @Bean + public ProfileDataProcessor proteinLevelDataProcessor() { + return new ProfileDataProcessor(); + } + + @Bean + @StepScope + public ItemStreamWriter proteinLevelDataWriter() { + return new ProfileDataWriter(); + } + + /*************************************************************************** + * Deciders for protein-level data steps. + **************************************************************************/ + + /** + * RPPA step decider. + * Sets flow execution status to SKIP if meta file and data files do not exist + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider rppaStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "rppa"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * RPPA Z-scores step decider. 
+ * Sets flow execution status to SKIP if meta file and data files do not exist + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider rppaZscoresStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "rppa-zscores"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + + /** + * Protein quantification step decider. + * Sets flow execution status to SKIP if meta file and data files do not exist + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider proteinQuantificationStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "protein-quantification"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/StructuralVariantDataStep.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/StructuralVariantDataStep.java new file mode 100644 index 0000000..8d4f722 --- /dev/null +++ 
b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/StructuralVariantDataStep.java @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +package org.cbio.portal.pipelines.importer.config.step; + +import org.mskcc.cbio.model.StructuralVariant; +import org.cbio.portal.pipelines.importer.model.StructuralVariantRecord; +import org.cbio.portal.pipelines.importer.config.tasklet.*; +import org.cbio.portal.pipelines.importer.config.listener.StructuralVariantDataListener; +import org.cbio.portal.pipelines.importer.config.reader.StructuralVariantDataReader; +import org.cbio.portal.pipelines.importer.config.processor.StructuralVariantDataProcessor; +import org.cbio.portal.pipelines.importer.config.writer.StructuralVariantDataWriter; + +import java.util.*; +import org.apache.commons.collections.map.MultiKeyMap; +import org.apache.commons.logging.*; + +import org.springframework.batch.core.*; +import org.springframework.batch.core.configuration.annotation.*; +import org.springframework.batch.core.job.builder.FlowBuilder; +import org.springframework.batch.core.job.flow.*; +import org.springframework.batch.core.step.tasklet.Tasklet; +import org.springframework.batch.item.*; + +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.context.annotation.*; + +/** + * + * @author ochoaa + */ +@Configuration +@EnableBatchProcessing +public class StructuralVariantDataStep { + + public static final String IMPORT_STRUCTURAL_VARIANT_DATA = "importStructuralVariantDataStep"; + + @Value("${chunk.interval}") + private int chunkInterval; + + @Autowired + public StepBuilderFactory stepBuilderFactory; + + private static final Log LOG = LogFactory.getLog(StructuralVariantDataStep.class); + + @Bean + public Step importStructuralVariantData() { + return stepBuilderFactory.get(IMPORT_STRUCTURAL_VARIANT_DATA) + .flow(structuralVariantStepFlow()) + .build(); + } + + /************************************************************************** + * Mutation data import steps and flows. 
+ **************************************************************************/ + + /** + * Structural variant data flow. + * Execution of structural variant data import is dependent on whether the + * structural variant metafile and datafile exist or not. + * If not then the structural variant data import is skipped + * + * @return Flow + */ + @Bean + public Flow structuralVariantStepFlow() { + return new FlowBuilder("structuralVariantStepFlow") + .start(structuralVariantStepDecider()) + .on("RUN").to(loadStructuralVariantGeneticProfile()) + .next(loadStructuralVariantMetadata()) + .next(structuralVariantStep()) + .from(structuralVariantStepDecider()) + .on("SKIP").end() + .build(); + } + + /** + * Step for implementing genetic profile tasklet. + * + * @return Step + */ + @Bean + public Step loadStructuralVariantGeneticProfile() { + return stepBuilderFactory.get("loadStructuralVariantGeneticProfile") + .tasklet(structuralVariantGeneticProfileTasklet()) + .build(); + } + + /** + * Tasklet for loading and importing genetic profile from meta file. + * + * @return Tasklet + */ + @Bean + @StepScope + public Tasklet structuralVariantGeneticProfileTasklet() { + return new GeneticProfileTasklet(); + } + + /** + * Step for implementing structural variant metadata tasklet. + * + * @return Step + */ + @Bean + public Step loadStructuralVariantMetadata() { + return stepBuilderFactory.get("loadStructuralVariantMetadata") + .tasklet(structuralVariantMetadataTasklet()) + .build(); + } + + /** + * Tasklet for loading structural variant metadata. + * + * @return Tasklet + */ + @Bean + @StepScope + public Tasklet structuralVariantMetadataTasklet() { + return new StructuralVariantMetadataTasklet(); + } + + /** + * Structural variant data import step. + * + * @return Step + */ + @Bean + public Step structuralVariantStep() { + return stepBuilderFactory.get("structuralVariantStep") + . 
chunk(chunkInterval) + .reader(structuralVariantDataReader()) + .processor(structuralVariantDataProcessor()) + .writer(structuralVariantDataWriter()) + .listener(structuralVariantDataListener()) + .build(); + } + + /*************************************************************************** + * Structural variant data listener, reader, processor, and writer. + **************************************************************************/ + + @Bean + public StepExecutionListener structuralVariantDataListener() { + return new StructuralVariantDataListener(); + } + + @Bean + @StepScope + public ItemStreamReader structuralVariantDataReader() { + return new StructuralVariantDataReader(); + } + + @Bean + public StructuralVariantDataProcessor structuralVariantDataProcessor() { + return new StructuralVariantDataProcessor(); + } + + @Bean + @StepScope + public ItemStreamWriter structuralVariantDataWriter() { + return new StructuralVariantDataWriter(); + } + + /*************************************************************************** + * Decider for structural variant data step. + **************************************************************************/ + + /** + * Structural variant step decider. 
+ * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider structuralVariantStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "structural-variant"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/TimelineDataStep.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/TimelineDataStep.java new file mode 100644 index 0000000..4f7ac39 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/step/TimelineDataStep.java @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. 
+ */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.cbio.portal.pipelines.importer.config.step; + +import org.mskcc.cbio.model.ClinicalEvent; +import org.cbio.portal.pipelines.importer.model.TimelineRecord; +import org.cbio.portal.pipelines.importer.config.tasklet.TimelineMetadataTasklet; +import org.cbio.portal.pipelines.importer.config.listener.TimelineDataListener; +import org.cbio.portal.pipelines.importer.config.reader.TimelineDataReader; +import org.cbio.portal.pipelines.importer.config.processor.TimelineDataProcessor; +import org.cbio.portal.pipelines.importer.config.writer.TimelineDataWriter; + +import java.util.*; +import org.apache.commons.collections.map.MultiKeyMap; +import org.apache.commons.logging.*; + +import org.springframework.batch.core.*; +import org.springframework.batch.core.configuration.annotation.*; +import org.springframework.batch.core.job.builder.FlowBuilder; +import org.springframework.batch.core.job.flow.*; +import org.springframework.batch.core.step.tasklet.Tasklet; +import org.springframework.batch.item.*; + +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.context.annotation.*; + +/** + * + * @author ochoaa + */ +@Configuration +@EnableBatchProcessing +public class TimelineDataStep { + + public static final String 
IMPORT_TIMELINE_DATA = "importTimelineData"; + + @Value("${chunk.interval}") + private int chunkInterval; + + @Autowired + public StepBuilderFactory stepBuilderFactory; + + private static final Log LOG = LogFactory.getLog(TimelineDataStep.class); + + @Bean + public Step importTimelineData() { + return stepBuilderFactory.get(IMPORT_TIMELINE_DATA) + .flow(timelineStepFlow()) + .build(); + } + + /*************************************************************************** + * Timeline data import steps and flows. + **************************************************************************/ + + /** + * Timeline data flow. + * Execution of timeline data import is dependent on whether the timeline metafile and + * datafiles exist or not. + * If not then timeline data import is skipped + * + * @return Flow + */ + @Bean + public Flow timelineStepFlow() { + return new FlowBuilder("timelineStepFlow") + .start(timelineStepDecider()) + .on("RUN").to(loadTimelineMetadata()) + .next(timelineStep()) + .from(timelineStepDecider()) + .on("SKIP").end() + .build(); + } + + /** + * Step for implementing timeline metadata tasklet. + * + * @return Step + */ + @Bean + public Step loadTimelineMetadata() { + return stepBuilderFactory.get("loadTimelineMetadata") + .tasklet(timelineMetadataTasklet()) + .build(); + } + + /** + * Tasklet for loading timeline metadata. + * + * @return Tasklet + */ + @Bean + @StepScope + public Tasklet timelineMetadataTasklet() { + return new TimelineMetadataTasklet(); + } + + /** + * Timeline data import step. + * + * @return Step + */ + @Bean Step timelineStep() { + return stepBuilderFactory.get("timelineStep") + . chunk(chunkInterval) + .reader(timelineDataReader()) + .processor(timelineDataProcessor()) + .writer(timelineDataWriter()) + .listener(timelineDataListener()) + .build(); + } + + /*************************************************************************** + * Timeline data listener, reader, processor, and writer. 
+ **************************************************************************/ + + @Bean + public StepExecutionListener timelineDataListener() { + return new TimelineDataListener(); + } + + @Bean + @StepScope + public ItemStreamReader timelineDataReader() { + return new TimelineDataReader(); + } + + @Bean + public TimelineDataProcessor timelineDataProcessor() { + return new TimelineDataProcessor(); + } + + @Bean + @StepScope + public ItemStreamWriter timelineDataWriter() { + return new TimelineDataWriter(); + } + + /*************************************************************************** + * Deciders for timeline data step. + **************************************************************************/ + + /** + * Timeline step decider. + * + * @return JobExecutionDecider + */ + @Bean + public JobExecutionDecider timelineStepDecider() { + return (JobExecution jobExecution, StepExecution stepExecution) -> { + MultiKeyMap datatypeMetadata = (MultiKeyMap) jobExecution.getExecutionContext().get("datatypeMetadata"); + String datatype = "time-line-data"; + + List logMessages = (List) datatypeMetadata.get(datatype, "logMessages"); + LOG.info(logMessages.get(0)); + if ((boolean) datatypeMetadata.get(datatype, "importData")) { + LOG.info(logMessages.get(1)); + jobExecution.getExecutionContext().put("currentDatatype", datatype); + return new FlowExecutionStatus("RUN"); + } + else { + LOG.warn(logMessages.get(1)); + return new FlowExecutionStatus("SKIP"); + } + }; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/CancerStudyTasklet.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/CancerStudyTasklet.java new file mode 100644 index 0000000..0c08b82 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/CancerStudyTasklet.java @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. 
 + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +package org.cbio.portal.pipelines.importer.config.tasklet; + +import org.mskcc.cbio.model.CancerStudy; +import org.mskcc.cbio.persistence.jdbc.CancerStudyJdbcDaoImpl; + +import java.io.*; +import java.util.*; +import org.apache.commons.logging.*; + +import org.springframework.batch.core.StepContribution; +import org.springframework.batch.core.configuration.annotation.JobScope; +import org.springframework.batch.core.scope.context.ChunkContext; +import org.springframework.batch.core.step.tasklet.Tasklet; +import org.springframework.batch.repeat.RepeatStatus; +import org.springframework.beans.factory.annotation.*; + +/** + * Tasklet to import cancer study from meta_study.txt + * @author ochoaa + */ +@JobScope +public class CancerStudyTasklet implements Tasklet { + + @Value("#{jobParameters[stagingDirectory]}") + private String stagingDirectory; + + @Autowired + CancerStudyJdbcDaoImpl cancerStudyJdbcDaoImpl; + + private static final Log LOG = LogFactory.getLog(CancerStudyTasklet.class); + + @Override + public RepeatStatus execute(StepContribution stepContribution, ChunkContext chunkContext) throws Exception { + File metaFile = new File(stagingDirectory, "meta_study.txt"); + + // try to load cancer study meta data from study path + CancerStudy cancerStudy = loadCancerStudy(metaFile); + if (cancerStudy == null) { + chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("importCancerStudy", false); + return RepeatStatus.FINISHED; + } + + // check for existing study with matching cancer study identifier and delete if exists + String cancerStudyIdentifier = cancerStudy.getCancerStudyIdentifier(); + CancerStudy existingStudy = cancerStudyJdbcDaoImpl.getCancerStudy(cancerStudy.getCancerStudyIdentifier()); + if (existingStudy != null) { + LOG.warn("Cancer study found with matching cancer study id: " + existingStudy.getCancerStudyIdentifier()); + 
chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("existingStudy", existingStudy); + cancerStudyJdbcDaoImpl.deleteCancerStudy(existingStudy.getCancerStudyId()); + } + // import new cancer study + LOG.info("Importing cancer study: " + cancerStudyIdentifier); + CancerStudy newCancerStudy = cancerStudyJdbcDaoImpl.addCancerStudy(cancerStudy); + + // add default rollback state, original cancer study identifier, cancer study, and import cancer study status to execution context + chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("rollbackCancerStudyState", false); + chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("cancerStudy", newCancerStudy); + chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("importCancerStudy", true); + return RepeatStatus.FINISHED; + } + + /** + * Load cancer study from meta_study.txt. + * Returns null if meta file doesn't exist or error loading meta data + * + * @param metaFilename + * @return CancerStudy + */ + private CancerStudy loadCancerStudy(File metaFile) throws IOException { + CancerStudy newCancerStudy = null; + + if (!metaFile.exists()) { + LOG.error("Could not find meta_study.txt in study path: " + stagingDirectory); + } + else { + Properties properties = new Properties(); + properties.load(new FileInputStream(metaFile)); + + try { + newCancerStudy = loadMetaStudyProperties(properties); + } + catch (NullPointerException ex) { + LOG.error("Error loading: " + metaFile.getCanonicalPath()); + } + } + + return newCancerStudy; + } + + /** + * Load cancer study from meta_study.txt properties. 
+ * + * @param properties + * @return CancerStudy + */ + private CancerStudy loadMetaStudyProperties(Properties properties) { + CancerStudy cancerStudy = new CancerStudy(); + cancerStudy.setCancerStudyId(-1); + cancerStudy.setCancerStudyIdentifier(properties.getProperty("cancer_study_identifier")); + cancerStudy.setTypeOfCancerId(properties.getProperty("type_of_cancer")); + cancerStudy.setName(properties.getProperty("name")); + cancerStudy.setDescription(properties.getProperty("description")); + cancerStudy.setImportDate(new Date()); + + String shortName = cancerStudy.getName(); + boolean publicStudy = false; + String pmid = ""; + String citation = ""; + String groups = ""; + int status = 0; + try { + shortName = properties.getProperty("short_name"); + publicStudy = properties.getProperty("public_study").equalsIgnoreCase("true"); + + pmid = properties.getProperty("pmid"); + citation = properties.getProperty("citation"); + groups = properties.getProperty("groups"); + status = properties.getProperty("status").equals("1")?1:0; + } + catch (NullPointerException ex) {} + + cancerStudy.setShortName(shortName); + cancerStudy.setPublicStudy(publicStudy); + cancerStudy.setPmid(pmid); + cancerStudy.setCitation(citation); + cancerStudy.setGroups(groups); + cancerStudy.setStatus(status); + + return cancerStudy; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/ClinicalAttributeTasklet.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/ClinicalAttributeTasklet.java new file mode 100644 index 0000000..826fa80 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/ClinicalAttributeTasklet.java @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. 
The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package org.cbio.portal.pipelines.importer.config.tasklet;
+
+import org.mskcc.cbio.model.*;
+import org.mskcc.cbio.persistence.jdbc.ClinicalAttributeJdbcDaoImpl;
+import org.cbio.portal.pipelines.importer.util.DataFileUtils;
+
+import java.io.*;
+import java.util.*;
+import org.apache.commons.collections.map.MultiKeyMap;
+import org.apache.commons.logging.*;
+
+import org.springframework.batch.core.StepContribution;
+import org.springframework.batch.core.scope.context.ChunkContext;
+import org.springframework.batch.core.step.tasklet.Tasklet;
+import org.springframework.batch.repeat.RepeatStatus;
+import org.springframework.beans.factory.annotation.*;
+
+/**
+ * Tasklet that loads the clinical attribute metadata of every clinical data file
+ * registered for the current datatype and publishes the result to the job
+ * execution context.
+ *
+ * @author ochoaa
+ */
+public class ClinicalAttributeTasklet implements Tasklet {
+
+    @Autowired
+    ClinicalAttributeJdbcDaoImpl clinicalAttributeJdbcDaoImpl;
+
+    // NOTE(review): the generic type argument is not present in this text; the
+    // elements are String literals, so Set of String is presumably intended - confirm
+    private final Set IDENTIFYING_ATTRIBUTES = new HashSet<>(Arrays.asList(new String[]{"PATIENT_ID", "SAMPLE_ID"}));
+
+    private CancerStudy cancerStudy;
+    // published below under "newClinicalAttributes"; where it is updated is not
+    // visible in this part of the class
+    private int newClinicalAttributes;
+
+    private static final Log LOG = LogFactory.getLog(ClinicalAttributeTasklet.class);
+
+    /**
+     * Reads "cancerStudy", "datatypeMetadata" and "currentDatatype" from the job
+     * execution context, loads the clinical attributes of each data file and puts
+     * "newClinicalAttributes", "attributeType", "clinicalMetadata" and
+     * "dataFileList" into the job execution context.
+     *
+     * @param stepContribution not used by this tasklet
+     * @param chunkContext gives access to the step and job execution contexts
+     * @return always {@link RepeatStatus#FINISHED}
+     * @throws Exception if loading a data file fails
+     */
+    @Override
+    public RepeatStatus execute(StepContribution stepContribution, ChunkContext chunkContext) throws Exception {
+        this.cancerStudy = (CancerStudy) chunkContext.getStepContext().getJobExecutionContext().get("cancerStudy");
+
+        // get datatype and metafile properties from datatype metadata in execution context
+        MultiKeyMap datatypeMetadata = (MultiKeyMap) chunkContext.getStepContext().getJobExecutionContext().get("datatypeMetadata");
+        String datatype = (String) chunkContext.getStepContext().getJobExecutionContext().get("currentDatatype");
+        Properties properties = (Properties) datatypeMetadata.get(datatype, "properties");
+
+        // resolve attribute type from metafile properties - default is SAMPLE
+        // NOTE(review): throws NullPointerException when the metafile has no "datatype" property
+        String attributeType = "SAMPLE";
+        if (properties.getProperty("datatype").equals("PATIENT_ATTRIBUTES")) {
+            attributeType = "PATIENT";
+        }
+
+        // load clinical attributes and data file metadata
+        MultiKeyMap clinicalMetadata = new MultiKeyMap();
+        // NOTE(review): the loop below iterates File elements, so the generic type
+        // argument of this list appears to be missing from this text - confirm
+        List dataFileList = (List) datatypeMetadata.get(datatype, "dataFileList");
+        for (File dataFile : dataFileList) {
+            LOG.info("Loading clinical attribute meta data from: " + dataFile.getName());
+
+            List clinicalAttributes = loadClinicalAttributesMetadata(dataFile, attributeType);
+            if (!clinicalAttributes.isEmpty()) {
+                LOG.info("Loaded " + clinicalAttributes.size() + " clinical attributes from: " + dataFile.getName());
+                // file-level metadata first, then the attributes keyed by (file name, "clinicalAttributes")
+                clinicalMetadata.putAll(DataFileUtils.loadDataFileMetadata(dataFile));
+                clinicalMetadata.put(dataFile.getName(), "clinicalAttributes", clinicalAttributes);
+            }
+            else {
+                // a file without attributes is only logged; it still stays in dataFileList
+                LOG.error("Could not load any clinical attributes from: " + dataFile.getName());
+            }
+        }
+        // add data file clinical attributes to the execution context
+        chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("newClinicalAttributes", newClinicalAttributes);
+        chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("attributeType", attributeType);
+        chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("clinicalMetadata", clinicalMetadata);
+        chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("dataFileList", dataFileList);
+
+        return RepeatStatus.FINISHED;
+    }
+
+    /**
+     * Load clinical attributes meta data from input file.
+ * + * @param clinicalDataFile + * @param attributeType + * @return List + */ + private List loadClinicalAttributesMetadata(File clinicalDataFile, String attributeType) throws Exception { + // load clinical attribute metadata from file + List clinicalAttributes = new ArrayList(); + try (FileReader reader = new FileReader(clinicalDataFile)) { + BufferedReader buff = new BufferedReader(reader); + String line = buff.readLine(); + String[] displayNames = DataFileUtils.splitDataFields(line); + String[] descriptions, datatypes, priorities, colnames; + if (line.startsWith(DataFileUtils.METADATA_PREFIX)) { + descriptions = DataFileUtils.splitDataFields(buff.readLine()); + datatypes = DataFileUtils.splitDataFields(buff.readLine()); + priorities = DataFileUtils.splitDataFields(buff.readLine()); + } + else { + colnames = displayNames; + descriptions = new String[colnames.length]; + Arrays.fill(descriptions, DataFileUtils.DEFAULT_DESCRIPTION); + datatypes = new String[colnames.length]; + Arrays.fill(datatypes, DataFileUtils.DEFAULT_DATATYPE); + priorities = new String[colnames.length]; + Arrays.fill(priorities, DataFileUtils.DEFAULT_PRIORITY); + } + + // fill in attribute types and get the column names + String[] attributeTypes = new String[displayNames.length]; + line = buff.readLine(); + // if next line starts with metadata prefix then fill in attribute types + if (line.startsWith(DataFileUtils.METADATA_PREFIX)) { + attributeTypes = DataFileUtils.splitDataFields(line); + colnames = DataFileUtils.splitDataFields(buff.readLine()); + } + else { + // set line as column names if not starts with metadata prefix + // and fill attribute types by current clinical attribute type + // (PATIENT for clinical-patient, SAMPLE for clinical-sample) + // or assume SAMPLE for attribute type "MIXED" unless attribute + // exists in db as PATIENT attribute type + colnames = DataFileUtils.splitDataFields(line); + if (attributeType.equals("MIXED")) { + Arrays.fill(attributeTypes, "SAMPLE"); + } + 
else { + Arrays.fill(attributeTypes, attributeType); + } + } + + int newClinAttrsAdded = 0; + for (int i=0; i.
+*/
+
+package org.cbio.portal.pipelines.importer.config.tasklet;
+
+import org.mskcc.cbio.model.*;
+import org.mskcc.cbio.persistence.jdbc.*;
+import org.cbio.portal.pipelines.importer.util.DataFileUtils;
+
+import java.io.*;
+import java.util.*;
+import org.apache.commons.logging.*;
+import org.apache.commons.collections.map.MultiKeyMap;
+
+import org.springframework.batch.core.StepContribution;
+import org.springframework.batch.core.scope.context.ChunkContext;
+import org.springframework.batch.core.step.tasklet.Tasklet;
+import org.springframework.batch.repeat.RepeatStatus;
+import org.springframework.beans.factory.annotation.Autowired;
+
+/**
+ * Tasklet that registers the copy number segment file of the current datatype
+ * and publishes the data file together with its file metadata to the job
+ * execution context.
+ *
+ * @author ochoaa
+ */
+public class CopyNumberSegmentMetadataTasklet implements Tasklet {
+
+    @Autowired
+    CopyNumberSegmentJdbcDaoImpl copyNumberSegmentJdbcDaoImpl;
+
+    @Autowired
+    CancerStudyJdbcDaoImpl cancerStudyJdbcDaoImpl;
+
+    private CancerStudy cancerStudy;
+
+    private static final Log LOG = LogFactory.getLog(CopyNumberSegmentMetadataTasklet.class);
+
+    /**
+     * Imports the COPY_NUMBER_SEG_FILE record described by the metafile properties
+     * and puts "dataFile" and "copyNumberSegmentMetadata" into the job execution
+     * context. Only the first entry of the datatype's data file list is used.
+     *
+     * @param stepContribution not used by this tasklet
+     * @param chunkContext gives access to the step and job execution contexts
+     * @return always {@link RepeatStatus#FINISHED}
+     * @throws Exception if the import or the metadata loading fails
+     */
+    @Override
+    public RepeatStatus execute(StepContribution stepContribution, ChunkContext chunkContext) throws Exception {
+        // every input is read before anything is written, so one view of the job context is enough
+        Map<String, Object> jobContext = chunkContext.getStepContext().getJobExecutionContext();
+        this.cancerStudy = (CancerStudy) jobContext.get("cancerStudy");
+        MultiKeyMap datatypeMetadata = (MultiKeyMap) jobContext.get("datatypeMetadata");
+        String datatype = (String) jobContext.get("currentDatatype");
+        Properties metaProperties = (Properties) datatypeMetadata.get(datatype, "properties");
+        List<File> dataFileList = (List<File>) datatypeMetadata.get(datatype, "dataFileList");
+        File segmentDataFile = dataFileList.get(0);
+
+        // store the segment file record first, then read the metadata of the data file
+        LOG.info("Importing COPY_NUMBER_SEG_FILE record with data filename: " + segmentDataFile.getName());
+        copyNumberSegmentJdbcDaoImpl.addCopyNumberSegmentFile(loadCopyNumberSegmentFile(metaProperties));
+        MultiKeyMap segmentMetadata = DataFileUtils.loadDataFileMetadata(segmentDataFile);
+
+        publish(chunkContext, "dataFile", segmentDataFile);
+        publish(chunkContext, "copyNumberSegmentMetadata", segmentMetadata);
+        return RepeatStatus.FINISHED;
+    }
+
+    /**
+     * Stores a value in the job execution context.
+     *
+     * @param chunkContext context of the running step
+     * @param key name under which the value is stored
+     * @param value value to store
+     */
+    private void publish(ChunkContext chunkContext, String key, Object value) {
+        chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put(key, value);
+    }
+
+    /**
+     * Builds a CopyNumberSegmentFile from the copy number segment metafile
+     * properties "reference_genome_id", "description" and "data_filename".
+     *
+     * @param properties
+     * @return CopyNumberSegmentFile
+     */
+    private CopyNumberSegmentFile loadCopyNumberSegmentFile(Properties properties) {
+        CopyNumberSegmentFile segmentFile = new CopyNumberSegmentFile();
+        segmentFile.setCancerStudyId(cancerStudy.getCancerStudyId());
+        segmentFile.setReferenceGenomeId(
+                CopyNumberSegmentFile.ReferenceGenomeId.valueOf(properties.getProperty("reference_genome_id")));
+        segmentFile.setDescription(properties.getProperty("description"));
+        segmentFile.setFilename(properties.getProperty("data_filename"));
+        return segmentFile;
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/DatatypeMetadataTasklet.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/DatatypeMetadataTasklet.java new file mode 100644 index 0000000..be6c1d6 --- /dev/null +++
b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/DatatypeMetadataTasklet.java @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package org.cbio.portal.pipelines.importer.config.tasklet;
+
+import org.mskcc.cbio.model.CancerStudy;
+import org.cbio.portal.pipelines.importer.util.DataFileUtils;
+
+import java.io.*;
+import java.util.*;
+import javax.annotation.Resource;
+import org.apache.commons.collections.keyvalue.MultiKey;
+import org.apache.commons.collections.map.MultiKeyMap;
+import org.apache.commons.logging.*;
+
+import org.springframework.batch.core.StepContribution;
+import org.springframework.batch.core.scope.context.ChunkContext;
+import org.springframework.batch.core.step.tasklet.Tasklet;
+import org.springframework.batch.repeat.RepeatStatus;
+import org.springframework.beans.factory.annotation.Value;
+
+/**
+ * Tasklet that looks up the metafile and data files of every configured datatype
+ * in the staging directory and publishes the findings as "datatypeMetadata" in
+ * the job execution context.
+ *
+ * @author ochoaa
+ */
+public class DatatypeMetadataTasklet implements Tasklet {
+
+    @Value("#{jobParameters[stagingDirectory]}")
+    private String stagingDirectory;
+
+    @Resource(name="datatypeMetadataMap")
+    MultiKeyMap datatypeMetadataMap;
+
+    private static final Log LOG = LogFactory.getLog(DatatypeMetadataTasklet.class);
+
+    /**
+     * For each key of the configured datatype metadata map, resolves the meta
+     * filename, loads the metafile properties when the file exists and collects
+     * the data files named by its "data_filename" property that exist in the
+     * staging directory. Per datatype the entries "importData", "properties",
+     * "dataFileList", "logMessages" and "caseList" are stored.
+     *
+     * @param stepContribution not used by this tasklet
+     * @param chunkContext gives access to the job execution context
+     * @return always {@link RepeatStatus#FINISHED}
+     * @throws Exception if a metafile cannot be read
+     */
+    @Override
+    public RepeatStatus execute(StepContribution stepContribution, ChunkContext chunkContext) throws Exception {
+        CancerStudy cancerStudy = (CancerStudy) chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().get("cancerStudy");
+        MultiKeyMap datatypeMetadata = new MultiKeyMap();
+
+        // search cancer study path for all datatype metafiles by meta filenames
+        LOG.info("Searching cancer study path for all datatype metafiles");
+        Set<MultiKey> keys = datatypeMetadataMap.keySet();
+        for (MultiKey key : keys) {
+            // init datatype metadata params
+            String datatype = (String) key.getKey(0);
+            boolean importData = false;
+            Properties properties = new Properties();
+            List<File> dataFileList = new ArrayList<>();
+            List<String> logMessages = new ArrayList<>();
+
+            // create metafile from datatype meta filename
+            // NOTE(review): the placeholder literal in the two calls below is empty in
+            // this text; it looks like an angle-bracket token was lost - confirm the
+            // intended placeholder before relying on this substitution
+            String metaFilename = (String) datatypeMetadataMap.get(datatype, "meta_filename");
+            if (metaFilename.startsWith("")) {
+                metaFilename = metaFilename.replace("", cancerStudy.getCancerStudyIdentifier());
+            }
+            File metaFile = new File(stagingDirectory, metaFilename);
+            logMessages.add("Searching for meta filename: " + metaFile.getName());
+            if (metaFile.exists()) {
+                // load properties from metafile if exists; Properties.load leaves the
+                // stream open, so it is closed here to avoid leaking a file handle per datatype
+                try (InputStream metaStream = new FileInputStream(metaFile)) {
+                    properties.load(metaStream);
+                }
+
+                // get data filename(s) from metafile and add to list of data files if exists
+                String[] dataFilenames = DataFileUtils.splitDataFields(properties.getProperty("data_filename"));
+                for (String dataFilename : dataFilenames) {
+                    File dataFile = new File(stagingDirectory, dataFilename);
+                    if (dataFile.exists()) {
+                        dataFileList.add(dataFile);
+                    }
+                }
+
+                if (!dataFileList.isEmpty()) {
+                    // if datafile list is not empty then set import data status to true
+                    logMessages.add("Found data files for datatype: " + datatype + " - beginning import");
+                    importData = true;
+                }
+                else {
+                    logMessages.add("Data files not found - skipping import for datatype: " + datatype);
+                }
+            }
+            else {
+                logMessages.add("Meta file not found - skipping import for datatype: " + datatype);
+            }
+            // add datatype metadata to multikeymap for execution context
+            datatypeMetadata.put(datatype, "importData", importData);
+            datatypeMetadata.put(datatype, "properties", properties);
+            datatypeMetadata.put(datatype, "dataFileList", dataFileList);
+            datatypeMetadata.put(datatype, "logMessages", logMessages);
+            datatypeMetadata.put(datatype, "caseList", new LinkedHashSet<>()); // for adding case ids
+        }
+
+        // add datatype metadata to execution context
+        chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("datatypeMetadata", datatypeMetadata);
+        return RepeatStatus.FINISHED;
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/DeleteCancerStudyTasklet.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/DeleteCancerStudyTasklet.java new file mode 100644 index
0000000..bacda38 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/DeleteCancerStudyTasklet.java @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package org.cbio.portal.pipelines.importer.config.tasklet;
+
+import org.mskcc.cbio.model.CancerStudy;
+import org.mskcc.cbio.persistence.jdbc.CancerStudyJdbcDaoImpl;
+import org.springframework.batch.core.StepContribution;
+import org.springframework.batch.core.scope.context.ChunkContext;
+import org.springframework.batch.core.step.tasklet.Tasklet;
+import org.springframework.batch.repeat.RepeatStatus;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Value;
+
+/**
+ * Tasklet that removes the cancer study registered under the job parameter
+ * "cancerStudyIdentifier", if such a study exists.
+ *
+ * @author ochoaa
+ */
+public class DeleteCancerStudyTasklet implements Tasklet {
+
+    @Value("#{jobParameters[cancerStudyIdentifier]}")
+    private String cancerStudyIdentifier;
+
+    @Autowired
+    CancerStudyJdbcDaoImpl cancerStudyJdbcDaoImpl;
+
+    /**
+     * Looks the study up by its identifier and deletes it by its internal id.
+     *
+     * @param stepContribution not used by this tasklet
+     * @param chunkContext not used by this tasklet
+     * @return always {@link RepeatStatus#FINISHED}
+     * @throws Exception if the lookup or the deletion fails
+     */
+    @Override
+    public RepeatStatus execute(StepContribution stepContribution, ChunkContext chunkContext) throws Exception {
+        CancerStudy existingStudy = cancerStudyJdbcDaoImpl.getCancerStudy(cancerStudyIdentifier);
+        if (existingStudy == null) {
+            // nothing is stored under this identifier, so there is nothing to delete
+            return RepeatStatus.FINISHED;
+        }
+        cancerStudyJdbcDaoImpl.deleteCancerStudy(existingStudy.getCancerStudyId());
+        return RepeatStatus.FINISHED;
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/GeneticProfileTasklet.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/GeneticProfileTasklet.java new file mode 100644 index 0000000..0d12667 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/GeneticProfileTasklet.java @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE.
The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package org.cbio.portal.pipelines.importer.config.tasklet;
+
+import org.mskcc.cbio.model.*;
+import org.mskcc.cbio.persistence.jdbc.*;
+
+import java.util.*;
+import com.google.common.base.Strings;
+import org.apache.commons.collections.map.MultiKeyMap;
+import org.apache.commons.logging.*;
+
+import org.springframework.batch.core.*;
+import org.springframework.batch.core.scope.context.ChunkContext;
+import org.springframework.batch.core.step.tasklet.Tasklet;
+import org.springframework.batch.repeat.RepeatStatus;
+import org.springframework.beans.factory.annotation.*;
+
+/**
+ * Tasklet for loading and importing genetic profile data.
+ *
+ * Reads "cancerStudy", "datatypeMetadata" and "currentDatatype" from the job
+ * execution context, builds a genetic profile from the metafile properties and
+ * publishes "geneticProfile" and "genePanelId" to the job execution context.
+ *
+ * @author ochoaa
+ */
+public class GeneticProfileTasklet implements Tasklet {
+
+    @Autowired
+    GeneticProfileJdbcDaoImpl geneticProfileJdbcDaoImpl;
+
+    @Autowired
+    CancerStudyJdbcDaoImpl cancerStudyJdbcDaoImpl;
+
+    @Autowired
+    GenePanelJdbcDaoImpl genePanelJdbcDaoImpl;
+
+    // gene panel stable id named by the metafile of the current execution, or null
+    private String genePanel = null;
+    private CancerStudy cancerStudy;
+
+    private static final Log LOG = LogFactory.getLog(GeneticProfileTasklet.class);
+
+    /**
+     * Loads the genetic profile described by the metafile properties. The job is
+     * stopped when the profile cannot be loaded, or when a profile with the same
+     * stable id already exists and its datatype is not MAF. Otherwise the existing
+     * or newly inserted profile and the gene panel id (null when no gene panel is
+     * named) are put into the job execution context.
+     *
+     * @param stepContribution not used by this tasklet
+     * @param chunkContext gives access to the step and job execution contexts
+     * @return always {@link RepeatStatus#FINISHED}
+     * @throws Exception if a database call fails
+     */
+    @Override
+    public RepeatStatus execute(StepContribution stepContribution, ChunkContext chunkContext) throws Exception {
+        // get cancer study, datatype, and metafile properties from datatype metadata in execution context
+        this.cancerStudy = (CancerStudy) chunkContext.getStepContext().getJobExecutionContext().get("cancerStudy");
+        MultiKeyMap datatypeMetadata = (MultiKeyMap) chunkContext.getStepContext().getJobExecutionContext().get("datatypeMetadata");
+        String datatype = (String) chunkContext.getStepContext().getJobExecutionContext().get("currentDatatype");
+        Properties properties = (Properties) datatypeMetadata.get(datatype, "properties");
+
+        // load genetic profile and insert genetic profile and data file list into execution context if not null
+        GeneticProfile geneticProfile = loadGeneticProfileMetadata(properties);
+        if (geneticProfile == null) {
+            LOG.error("Error loading genetic profile from: " + datatypeMetadata.get(datatype, "meta_filename"));
+            chunkContext.getStepContext().getStepExecution().getJobExecution().setStatus(BatchStatus.STOPPING);
+            return RepeatStatus.FINISHED;
+        }
+
+        // check for existing genetic profile and stop job if genetic profile already exists except for datatype MAF
+        GeneticProfile existingGeneticProfile = geneticProfileJdbcDaoImpl.getGeneticProfile(geneticProfile.getStableId());
+        if (existingGeneticProfile != null) {
+            if (!existingGeneticProfile.getDatatype().equals("MAF")) {
+                LOG.error("Genetic profile already exists with stable id: " + geneticProfile.getStableId());
+                chunkContext.getStepContext().getStepExecution().getJobExecution().setStatus(BatchStatus.STOPPING);
+                return RepeatStatus.FINISHED;
+            }
+            // add existing genetic profile to execution context
+            chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("geneticProfile", existingGeneticProfile);
+        }
+        else {
+            // import new genetic profile and add to execution context
+            GeneticProfile newGeneticProfile = geneticProfileJdbcDaoImpl.addGeneticProfile(geneticProfile);
+            chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("geneticProfile", newGeneticProfile);
+        }
+
+        // add gene panel id to execution context - right now it is assumed that a gene panel, if given in a genetic profile file,
+        // already exists in the db
+        Integer genePanelId = null;
+        if (!Strings.isNullOrEmpty(genePanel)) {
+            genePanelId = genePanelJdbcDaoImpl.getGenePanelId(genePanel);
+        }
+        chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("genePanelId", genePanelId);
+
+        return RepeatStatus.FINISHED;
+    }
+
+    /**
+     * Load genetic profile metadata from meta file properties.
+     *
+     * Also records the "gene_panel" property (null when absent) for the current
+     * execution. Optional text properties default to empty strings and
+     * "show_profile_in_analysis_tab" defaults to false.
+     *
+     * @param properties
+     * @return GeneticProfile, or null when the required "stable_id" property is missing
+     */
+    private GeneticProfile loadGeneticProfileMetadata(Properties properties) {
+        // assign unconditionally so a gene panel named by a previously processed
+        // metafile cannot carry over to a metafile that names none
+        this.genePanel = properties.getProperty("gene_panel");
+
+        // without a stable id no profile can be built; null makes execute() stop the job
+        String stableId = properties.getProperty("stable_id");
+        if (stableId == null) {
+            LOG.error("Meta file is missing required property: stable_id");
+            return null;
+        }
+
+        GeneticProfile geneticProfile = new GeneticProfile();
+
+        // get cancer study by cancer study identifier
+        geneticProfile.setCancerStudy(cancerStudy);
+        geneticProfile.setCancerStudyId(cancerStudy.getCancerStudyId());
+
+        // insert cancer study identifier into the stable id if necessary
+        if (!stableId.startsWith(cancerStudy.getCancerStudyIdentifier())) {
+            stableId = cancerStudy.getCancerStudyIdentifier() + "_" + stableId;
+        }
+        geneticProfile.setStableId(stableId);
+
+        // set remaining genetic profile properties - default values are empty strings
+        geneticProfile.setGeneticAlterationType(properties.getProperty("genetic_alteration_type", ""));
+        geneticProfile.setName(properties.getProperty("profile_name", ""));
+        geneticProfile.setDescription(properties.getProperty("profile_description", ""));
+        geneticProfile.setDatatype(properties.getProperty("datatype", ""));
+        geneticProfile.setShowProfileInAnalysisTab(
+                "true".equalsIgnoreCase(properties.getProperty("show_profile_in_analysis_tab", "false")));
+
+        return geneticProfile;
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/GisticMetadataTasklet.java
b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/GisticMetadataTasklet.java new file mode 100644 index 0000000..25760c5 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/GisticMetadataTasklet.java @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package org.cbio.portal.pipelines.importer.config.tasklet;
+
+import org.cbio.portal.pipelines.importer.util.DataFileUtils;
+
+import java.io.*;
+import java.util.*;
+import org.apache.commons.collections.map.MultiKeyMap;
+
+import org.springframework.batch.core.StepContribution;
+import org.springframework.batch.core.scope.context.ChunkContext;
+import org.springframework.batch.core.step.tasklet.Tasklet;
+import org.springframework.batch.repeat.RepeatStatus;
+
+/**
+ * Tasklet that loads the file metadata of the gistic data file of the current
+ * datatype and publishes it to the job execution context.
+ *
+ * @author ochoaa
+ */
+public class GisticMetadataTasklet implements Tasklet {
+
+    /**
+     * Reads "datatypeMetadata" and "currentDatatype" from the job execution
+     * context and puts "dataFile" and "gisticMetadata" into it. Only the first
+     * entry of the datatype's data file list is used.
+     *
+     * @param stepContribution not used by this tasklet
+     * @param chunkContext gives access to the step and job execution contexts
+     * @return always {@link RepeatStatus#FINISHED}
+     * @throws Exception if the data file metadata cannot be loaded
+     */
+    @Override
+    public RepeatStatus execute(StepContribution stepContribution, ChunkContext chunkContext) throws Exception {
+        MultiKeyMap datatypeMetadata = (MultiKeyMap) chunkContext.getStepContext().getJobExecutionContext().get("datatypeMetadata");
+        String datatype = (String) chunkContext.getStepContext().getJobExecutionContext().get("currentDatatype");
+        // the metafile properties are not needed here, so they are no longer looked up
+        List<File> dataFileList = (List<File>) datatypeMetadata.get(datatype, "dataFileList");
+        File dataFile = dataFileList.get(0);
+
+        // load gistic metadata from datafile
+        MultiKeyMap gisticMetadata = DataFileUtils.loadDataFileMetadata(dataFile);
+
+        // add datafile and gistic metadata to execution context
+        chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("dataFile", dataFile);
+        chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("gisticMetadata", gisticMetadata);
+
+        return RepeatStatus.FINISHED;
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/MutSigMetadataTasklet.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/MutSigMetadataTasklet.java new file mode 100644 index 0000000..0f812a6 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/MutSigMetadataTasklet.java @@ -0,0 +1,69 @@ +/* + *
Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package org.cbio.portal.pipelines.importer.config.tasklet;
+
+import org.cbio.portal.pipelines.importer.util.DataFileUtils;
+
+import java.io.*;
+import java.util.*;
+import org.apache.commons.collections.map.MultiKeyMap;
+
+import org.springframework.batch.core.StepContribution;
+import org.springframework.batch.core.scope.context.ChunkContext;
+import org.springframework.batch.core.step.tasklet.Tasklet;
+import org.springframework.batch.repeat.RepeatStatus;
+
+/**
+ * Tasklet that loads the file metadata of the mutsig data file of the current
+ * datatype and publishes it to the job execution context.
+ *
+ * @author ochoaa
+ */
+public class MutSigMetadataTasklet implements Tasklet {
+
+    /**
+     * Reads "datatypeMetadata" and "currentDatatype" from the job execution
+     * context and puts "dataFile" and "mutSigMetadata" into it. Only the first
+     * entry of the datatype's data file list is used.
+     *
+     * @param stepContribution not used by this tasklet
+     * @param chunkContext gives access to the step and job execution contexts
+     * @return always {@link RepeatStatus#FINISHED}
+     * @throws Exception if the data file metadata cannot be loaded
+     */
+    @Override
+    public RepeatStatus execute(StepContribution stepContribution, ChunkContext chunkContext) throws Exception {
+        // both inputs are read before anything is written, so one view of the job context is enough
+        Map<String, Object> jobContext = chunkContext.getStepContext().getJobExecutionContext();
+        MultiKeyMap datatypeMetadata = (MultiKeyMap) jobContext.get("datatypeMetadata");
+        String currentDatatype = (String) jobContext.get("currentDatatype");
+        List<File> candidateFiles = (List<File>) datatypeMetadata.get(currentDatatype, "dataFileList");
+        File mutSigFile = candidateFiles.get(0);
+
+        MultiKeyMap mutSigMetadata = DataFileUtils.loadDataFileMetadata(mutSigFile);
+
+        publish(chunkContext, "dataFile", mutSigFile);
+        publish(chunkContext, "mutSigMetadata", mutSigMetadata);
+        return RepeatStatus.FINISHED;
+    }
+
+    /**
+     * Stores a value in the job execution context.
+     *
+     * @param chunkContext context of the running step
+     * @param key name under which the value is stored
+     * @param value value to store
+     */
+    private void publish(ChunkContext chunkContext, String key, Object value) {
+        chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put(key, value);
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/MutationMetadataTasklet.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/MutationMetadataTasklet.java new file mode 100644 index 0000000..f02cef9 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/MutationMetadataTasklet.java @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center.
+ * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +package org.cbio.portal.pipelines.importer.config.tasklet; + +import org.cbio.portal.pipelines.importer.util.DataFileUtils; + +import java.io.*; +import java.util.*; +import org.apache.commons.collections.map.MultiKeyMap; + +import org.springframework.batch.core.StepContribution; +import org.springframework.batch.core.scope.context.ChunkContext; +import org.springframework.batch.core.step.tasklet.Tasklet; +import org.springframework.batch.repeat.RepeatStatus; + +/** + * + * @author ochoaa + */ +public class MutationMetadataTasklet implements Tasklet { + + @Override + public RepeatStatus execute(StepContribution stepContribution, ChunkContext chunkContext) throws Exception { + MultiKeyMap datatypeMetadata = (MultiKeyMap) chunkContext.getStepContext().getJobExecutionContext().get("datatypeMetadata"); + String datatype = (String) chunkContext.getStepContext().getJobExecutionContext().get("currentDatatype"); + List dataFiles = (List) datatypeMetadata.get(datatype, "dataFileList"); + + // go through each datafile and load mutation file metadata + MultiKeyMap mutationFileMetadata = new MultiKeyMap(); + for (File dataFile : dataFiles) { + MultiKeyMap metadata = DataFileUtils.loadDataFileMetadata(dataFile); + mutationFileMetadata.putAll(metadata); + } + // add datafiles and mutation file metadata to execution context + chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("dataFileList", dataFiles); + chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("mutationFileMetadata", mutationFileMetadata); + + return RepeatStatus.FINISHED; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/ProfileMetadataTasklet.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/ProfileMetadataTasklet.java new file mode 100644 index 0000000..4e6f261 --- /dev/null +++ 
b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/ProfileMetadataTasklet.java @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +package org.cbio.portal.pipelines.importer.config.tasklet; + +import org.mskcc.cbio.model.*; +import org.mskcc.cbio.persistence.jdbc.*; +import org.cbio.portal.pipelines.importer.util.DataFileUtils; + +import java.io.*; +import java.util.*; +import org.apache.commons.collections.map.MultiKeyMap; + +import org.springframework.batch.core.*; +import org.springframework.batch.core.scope.context.ChunkContext; +import org.springframework.batch.core.step.tasklet.Tasklet; +import org.springframework.batch.repeat.RepeatStatus; +import org.springframework.beans.factory.annotation.*; + +/** + * + * @author ochoaa + */ +public class ProfileMetadataTasklet implements Tasklet { + + @Autowired + SampleJdbcDaoImpl sampleJdbcDaoImpl; + + @Autowired + SampleProfileJdbcDaoImpl sampleProfileJdbcDaoImpl; + + @Autowired + GeneticProfileSamplesJdbcDaoImpl geneticProfileSamplesJdbcDaoImpl; + + private GeneticProfile geneticProfile; + private Integer genePanelId; + + @Override + public RepeatStatus execute(StepContribution stepContribution, ChunkContext chunkContext) throws Exception { + this.geneticProfile = (GeneticProfile) chunkContext.getStepContext().getJobExecutionContext().get("geneticProfile"); + this.genePanelId = (Integer) chunkContext.getStepContext().getJobExecutionContext().get("genePanelId"); + MultiKeyMap datatypeMetadata = (MultiKeyMap) chunkContext.getStepContext().getJobExecutionContext().get("datatypeMetadata"); + String datatype = (String) chunkContext.getStepContext().getJobExecutionContext().get("currentDatatype"); + List dataFileList = (List) datatypeMetadata.get(datatype, "dataFileList"); + + // go through each datafile and load datafile metadata + MultiKeyMap profileMetadata = new MultiKeyMap(); + for (File dataFile : dataFileList) { + MultiKeyMap metadata = loadProfileMetadata(dataFile); + profileMetadata.putAll(metadata); + } + // add datafile list and metadata to execution context + 
chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("dataFileList", dataFileList); + chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("profileMetadata", profileMetadata); + + return RepeatStatus.FINISHED; + } + + /** + * Loads the profile metadata from the file (non case ids, map of case ids to internal sample ids, and normal case ids) + * + * @param dataFile + * @return MultiKeyMap + */ + private MultiKeyMap loadProfileMetadata(File dataFile) throws IOException { + MultiKeyMap profileMetadata = DataFileUtils.loadDataFileMetadata(dataFile); + String[] header = (String[]) profileMetadata.get(dataFile.getName(), "header"); + + // organize columns in header into non-case ids, case ids, and normal case ids + Set nonCaseIds = new HashSet<>(); + HashMap caseIdsMap = new LinkedHashMap<>(); + Set normalCaseIds = new HashSet<>(); + for (String column : header) { + // add non-case id columns to hash set + if (DataFileUtils.nonCaseIdColumnNames.contains(column.toUpperCase())) { + nonCaseIds.add(column); + } + else { + String sampleStableId = DataFileUtils.getSampleStableId(column); + // add normal case ids to normal case ids hash set + if (DataFileUtils.isNormalSample(sampleStableId)) { + normalCaseIds.add(column); + } + // add non-normal case ids to linked hash map where key=stable id, val=internal sample id + else { + Sample sample = sampleJdbcDaoImpl.getSampleByStudy(sampleStableId, geneticProfile.getCancerStudyId()); + if (sample == null) { + continue; + } + caseIdsMap.put(column, sample.getInternalId()); + + // add sample profile for genetic profile (assumed that file has passed validation and no duplicate case ids in column exist) + sampleProfileJdbcDaoImpl.addSampleProfile(sample.getInternalId(), geneticProfile.getGeneticProfileId(), genePanelId); + } + } + } + // add samples to genetic profile samples + geneticProfileSamplesJdbcDaoImpl.addGeneticProfileSamples(geneticProfile, new 
ArrayList(caseIdsMap.values())); + profileMetadata.put(dataFile.getName(), "nonCaseIds", nonCaseIds); + profileMetadata.put(dataFile.getName(), "caseIdsMap", caseIdsMap); + profileMetadata.put(dataFile.getName(), "normalCaseIds", normalCaseIds); + + return profileMetadata; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/StructuralVariantMetadataTasklet.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/StructuralVariantMetadataTasklet.java new file mode 100644 index 0000000..64e14e1 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/StructuralVariantMetadataTasklet.java @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.cbio.portal.pipelines.importer.config.tasklet; + +import org.cbio.portal.pipelines.importer.util.DataFileUtils; + +import java.io.*; +import java.util.*; +import org.apache.commons.collections.map.MultiKeyMap; + +import org.springframework.batch.core.StepContribution; +import org.springframework.batch.core.scope.context.ChunkContext; +import org.springframework.batch.core.step.tasklet.Tasklet; +import org.springframework.batch.repeat.RepeatStatus; + +/** + * + * @author ochoaa + */ +public class StructuralVariantMetadataTasklet implements Tasklet { + + @Override + public RepeatStatus execute(StepContribution stepContribution, ChunkContext chunkContext) throws Exception { + MultiKeyMap datatypeMetadata = (MultiKeyMap) chunkContext.getStepContext().getJobExecutionContext().get("datatypeMetadata"); + String datatype = (String) chunkContext.getStepContext().getJobExecutionContext().get("currentDatatype"); + List dataFileList = (List) datatypeMetadata.get(datatype, "dataFileList"); + File dataFile = dataFileList.get(0); + + // load structural variant metadata from datafile + MultiKeyMap structuralVariantMetadata = DataFileUtils.loadDataFileMetadata(dataFile); + + // datafile and structural variant metadata to execution context + chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("dataFile", dataFile); + chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("structuralVariantMetadata", structuralVariantMetadata); + + return RepeatStatus.FINISHED; + } + +} diff --git 
a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/TimelineMetadataTasklet.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/TimelineMetadataTasklet.java new file mode 100644 index 0000000..6a90d8e --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/tasklet/TimelineMetadataTasklet.java @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +package org.cbio.portal.pipelines.importer.config.tasklet; + +import org.cbio.portal.pipelines.importer.util.DataFileUtils; + +import java.io.*; +import java.util.*; +import org.apache.commons.logging.*; +import org.apache.commons.collections.map.MultiKeyMap; + +import org.springframework.batch.core.StepContribution; +import org.springframework.batch.core.scope.context.ChunkContext; +import org.springframework.batch.core.step.tasklet.Tasklet; +import org.springframework.batch.repeat.RepeatStatus; + +/** + * + * @author ochoaa + */ +public class TimelineMetadataTasklet implements Tasklet { + + private static final Log LOG = LogFactory.getLog(TimelineMetadataTasklet.class); + + @Override + public RepeatStatus execute(StepContribution stepContribution, ChunkContext chunkContext) throws Exception { + MultiKeyMap datatypeMetadata = (MultiKeyMap) chunkContext.getStepContext().getJobExecutionContext().get("datatypeMetadata"); + String datatype = (String) chunkContext.getStepContext().getJobExecutionContext().get("currentDatatype"); + List dataFileList = (List) datatypeMetadata.get(datatype, "dataFileList"); + + // go through list of datafiles and load datafile metadata (header, number of records) + MultiKeyMap timelineMetadata = new MultiKeyMap(); + for (File dataFile : dataFileList) { + timelineMetadata.putAll(DataFileUtils.loadDataFileMetadata(dataFile)); + } + + // add import status, datafiles, and timeline metadata to execution context + chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("dataFileList", dataFileList); + chunkContext.getStepContext().getStepExecution().getJobExecution().getExecutionContext().put("timelineMetadata", timelineMetadata); + + return RepeatStatus.FINISHED; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/CaseListWriter.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/CaseListWriter.java new file mode 100644 index 
0000000..1dbda6e --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/CaseListWriter.java @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +package org.cbio.portal.pipelines.importer.config.writer; + +import org.mskcc.cbio.model.SampleList; +import org.mskcc.cbio.persistence.jdbc.SampleListJdbcDaoImpl; + +import java.util.*; +import org.apache.commons.logging.*; + +import org.springframework.batch.item.*; +import org.springframework.beans.factory.annotation.Autowired; + +/** + * + * @author ochoaa + */ +public class CaseListWriter implements ItemStreamWriter { + + @Autowired + SampleListJdbcDaoImpl sampleListJdbcDaoImpl; + + private int sampleListDataCount; + private int sampleListListDataCount; + + private static final Log LOG = LogFactory.getLog(CaseListWriter.class); + + @Override + public void open(ExecutionContext executionContext) throws ItemStreamException { + LOG.info("Beginning case list import"); + } + + @Override + public void update(ExecutionContext executionContext) throws ItemStreamException { + // update sample list and sample list list data count for step listener + executionContext.put("sampleListDataCount", sampleListDataCount); + executionContext.put("sampleListListDataCount", sampleListListDataCount); + } + + @Override + public void close() throws ItemStreamException {} + + @Override + public void write(List list) throws Exception { + + for (SampleList sampleList : list) { + // skip sample lists that already exist by stable id + if (sampleListJdbcDaoImpl.getSampleList(sampleList.getStableId()) != null) { + LOG.error("Sample list already exists by stable id: " + sampleList.getStableId()); + continue; + } + SampleList newSampleList = sampleListJdbcDaoImpl.addSampleList(sampleList); + this.sampleListDataCount++; + + // import sample list list for current sample list + for (Integer sampleId : sampleList.getSampleListList()) { + sampleListJdbcDaoImpl.addSampleListList(newSampleList, sampleId); + this.sampleListListDataCount++; + } + } + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/ClinicalDataWriter.java 
b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/ClinicalDataWriter.java new file mode 100644 index 0000000..d913ecd --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/ClinicalDataWriter.java @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +package org.cbio.portal.pipelines.importer.config.writer; + +import org.mskcc.cbio.model.*; +import org.mskcc.cbio.model.summary.ClinicalDataSummary; +import org.mskcc.cbio.persistence.jdbc.*; +import org.cbio.portal.pipelines.importer.config.composite.CompositeClinicalData; + +import java.util.*; +import org.apache.commons.logging.*; +import com.google.common.base.*; +import com.google.common.collect.*; + +import org.springframework.batch.item.*; +import org.springframework.beans.factory.annotation.Autowired; + +/** + * + * @author ochoaa + */ +public class ClinicalDataWriter implements ItemStreamWriter { + + @Autowired + ClinicalDataJdbcDaoImpl clinicalDataJdbcDaoImpl; + + @Autowired + PatientJdbcDaoImpl patientJdbcDaoImpl; + + @Autowired + SampleJdbcDaoImpl sampleJdbcDaoImpl; + + private int patientCount; + private int sampleCount; + private int patientDataCount; + private int sampleDataCount; + + private final Set caseIdSet = new LinkedHashSet<>(); + private final Map> patientClinicalDataAdded = new HashMap<>(); + private final Map> sampleClinicalDataAdded = new HashMap<>(); + + private static final Log LOG = LogFactory.getLog(ClinicalDataWriter.class); + + @Override + public void open(ExecutionContext executionContext) throws ItemStreamException { + LOG.info("Beginning clinical data batch import"); + } + + @Override + public void update(ExecutionContext executionContext) throws ItemStreamException { + // insert updated record counts and case id set to execution context + executionContext.put("patientCount", patientCount); + executionContext.put("sampleCount", sampleCount); + executionContext.put("patientDataCount", patientDataCount); + executionContext.put("sampleDataCount", sampleDataCount); + executionContext.put("caseList", caseIdSet); + } + + @Override + public void close() throws ItemStreamException {} + + @Override + public void write(List list) throws Exception { + List patientClinicalData = new ArrayList(); + List sampleClinicalData = new 
ArrayList(); + + // for each composite clinical data, update internal ids for patient and + // sample (if applicable) and import clinical data accordingly + for (CompositeClinicalData ccd : list) { + // update internal ids for composte clinical data + try { + ccd = updateCompositeData(ccd); + } + catch (NullPointerException ex) { + LOG.error("Could not update composite clinical data for patient, sample: " + + ccd.getPatient().getStableId() + ", " + ccd.getSample().getStableId()); + ex.printStackTrace(); + continue; + } + // add case id to case id set + caseIdSet.add(ccd.getSample().getInternalId()); + + if (!ccd.getPatientClinicalData().isEmpty()) { + // init patient clinical data added map with existing clinical data + if (!patientClinicalDataAdded.containsKey(ccd.getPatient().getInternalId())) { + patientClinicalDataAdded.put(ccd.getPatient().getInternalId(), + clinicalDataJdbcDaoImpl.getPatientClinicalDataAttributes(ccd.getPatient().getInternalId())); + } + // get existing clinical data and current composite clinical data + Map existingClinicalData = patientClinicalDataAdded.get(ccd.getPatient().getInternalId()); + Map filteredClinicalData = filterClinicalAttributes(existingClinicalData, ccd, "patient"); + + // filter composite patient clinical data by whether patientClinicalAttributes contains attr id or not + Predicate attrIdFilter = (String attrId) -> filteredClinicalData.keySet().contains(attrId); + Map newPatientClinicalData = Maps.filterKeys(ccd.getPatientClinicalData(), attrIdFilter); + + // update patient clinical data added map and patient clinical data list for batch importing + existingClinicalData.putAll(filteredClinicalData); + patientClinicalDataAdded.put(ccd.getPatient().getInternalId(), existingClinicalData); + patientClinicalData.addAll(newPatientClinicalData.values()); + } + + // continue if composite clinical data record doesn't have sample data(i.e., clinical-patient) + if (ccd.getSample().getInternalId() == -1) { + continue; + } + + if 
(!ccd.getSampleClinicalData().isEmpty()) { + // init sample clinical data added map with existing clinical data + if (!sampleClinicalDataAdded.containsKey(ccd.getSample().getInternalId())) { + sampleClinicalDataAdded.put(ccd.getSample().getInternalId(), + clinicalDataJdbcDaoImpl.getSampleClinicalDataAttributes(ccd.getSample().getInternalId())); + } + // get existing clinical data and current composite clinical data + Map existingClinicalData = sampleClinicalDataAdded.get(ccd.getSample().getInternalId()); + Map filteredClinicalData = filterClinicalAttributes(existingClinicalData, ccd, "sample"); + + // filter composite sample clinical data by whether sampleClinicalAttributes contains attr id or not + Predicate attrIdFilter = (String attrId) -> filteredClinicalData.keySet().contains(attrId); + Map newSampleClinicalData = Maps.filterKeys(ccd.getSampleClinicalData(), attrIdFilter); + + // update sample clinical data added map and sample clinical data list for batch importing + existingClinicalData.putAll(filteredClinicalData); + sampleClinicalDataAdded.put(ccd.getSample().getInternalId(), existingClinicalData); + sampleClinicalData.addAll(newSampleClinicalData.values()); + } + } + + // import batch of patient clinical data + if (!patientClinicalData.isEmpty()) { + int rowsAffected = clinicalDataJdbcDaoImpl.addClinicalDataBatch("clinical_patient", patientClinicalData); + this.patientDataCount += rowsAffected; + } + + // import batch of sample clinical data + if (!sampleClinicalData.isEmpty()) { + int rowsAffected = clinicalDataJdbcDaoImpl.addClinicalDataBatch("clinical_sample", sampleClinicalData); + this.sampleDataCount += rowsAffected; + } + } + + /** + * Update patient and sample internal ids in composite clinical data. 
+ * + * @param composite + * @return CompositeClinicalData + */ + private CompositeClinicalData updateCompositeData(CompositeClinicalData composite) throws Exception { + // update patient internal id in composite clinical data object + int patientInternalId = composite.getPatient().getInternalId(); + if (composite.getPatient().getInternalId() != -1) { + composite.updatePatientInternalId(patientInternalId); + } + else { + Patient existingPatient = patientJdbcDaoImpl.getPatient(composite.getPatient().getStableId(), composite.getPatient().getCancerStudyId()); + if (existingPatient != null) { + composite.updatePatientInternalId(existingPatient.getInternalId()); + } + else { + Patient newPatient = patientJdbcDaoImpl.addPatient(composite.getPatient()); + composite.updatePatientInternalId(newPatient.getInternalId()); + this.patientCount++; + } + } + + // update sample internal id in composite clinical data object + if (!Strings.isNullOrEmpty(composite.getSample().getStableId())) { + int sampleInternalId = composite.getSample().getInternalId(); + if (sampleInternalId != -1) { + composite.updateSampleInternalId(sampleInternalId); + } + else { + Sample existingSample; + if (composite.getPatient().getInternalId() != -1) { + existingSample = sampleJdbcDaoImpl.getSampleByPatient(composite.getSample().getStableId(), composite.getPatient().getInternalId()); + } + else { + existingSample = sampleJdbcDaoImpl.getSampleByStudy(composite.getSample().getStableId(), composite.getPatient().getCancerStudyId()); + } + if (existingSample != null) { + sampleInternalId = existingSample.getInternalId(); + } + else { + Sample newSample = sampleJdbcDaoImpl.addSample(composite.getSample()); + sampleInternalId = newSample.getInternalId(); + this.sampleCount++; + } + composite.updateSampleInternalId(sampleInternalId); + } + } + + return composite; + } + + /** + * Filter out existing clinical attributes for composite clinical data. 
+ * + * @param existingClinicalAttributes + * @param attributeType + * @param composite + * @return Set + */ + private Map filterClinicalAttributes(Map existingClinicalData, CompositeClinicalData composite, String attributeType) { + // get the stable id and set of clinical attributes by attribute type + String stableId; + Map compositeClinicalData = new HashMap<>(); + if (attributeType.equals("patient")) { + stableId = composite.getPatient().getStableId(); + composite.getPatientClinicalData().values().stream().forEach((pcd) -> { + compositeClinicalData.put(pcd.getAttrId(), pcd.getAttrValue()); + }); + } + else { + stableId = composite.getSample().getStableId(); + composite.getSampleClinicalData().values().stream().forEach((scd) -> { + compositeClinicalData.put(scd.getAttrId(), scd.getAttrValue()); + }); + } + + // get intersection of existing clinical attributes and clinical attributes + // from composite object and remove them from composite clinical data + Set duplicateAttributes = new HashSet(existingClinicalData.keySet()); + duplicateAttributes.retainAll(compositeClinicalData.keySet()); + if (!duplicateAttributes.isEmpty()) { + for (String attr : duplicateAttributes) { + // get the attribute value from composite clinical data and + // compare to existing value + String value = compositeClinicalData.get(attr); + String existingValue = existingClinicalData.get(attr); + if (!value.equals(existingValue)) { + LOG.warn("Clinical data for " + attributeType + " " + stableId + " already loaded as " + + attr + "=" + existingValue + " - skipping import for " + attr + "=" + value); + } + } + } + // filter composite clinical data map + Set filteredAttrIds = compositeClinicalData.keySet(); + filteredAttrIds.removeAll(existingClinicalData.keySet()); + Predicate attrIdFilter = (String attrId) -> filteredAttrIds.contains(attrId); + Map filteredCompositeClinicalData = Maps.filterKeys(compositeClinicalData, attrIdFilter); + + return filteredCompositeClinicalData; + } + +} diff 
--git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/CopyNumberSegmentDataWriter.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/CopyNumberSegmentDataWriter.java new file mode 100644 index 0000000..88f33aa --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/CopyNumberSegmentDataWriter.java @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package org.cbio.portal.pipelines.importer.config.writer;
+
+import org.mskcc.cbio.model.CopyNumberSegment;
+import org.mskcc.cbio.persistence.jdbc.CopyNumberSegmentJdbcDaoImpl;
+
+import java.util.*;
+import org.apache.commons.logging.*;
+
+import org.springframework.batch.item.*;
+import org.springframework.beans.factory.annotation.Autowired;
+
+/**
+ * Item writer that imports copy number segment records in batches through
+ * {@link CopyNumberSegmentJdbcDaoImpl} and publishes the running record count
+ * and the set of sample ids seen to the execution context.
+ *
+ * @author ochoaa
+ */
+public class CopyNumberSegmentDataWriter implements ItemStreamWriter<CopyNumberSegment> {
+
+    @Autowired
+    CopyNumberSegmentJdbcDaoImpl copyNumberSegmentJdbcDaoImpl;
+
+    // sample ids of every record written so far, in first-seen order
+    // NOTE(review): element type is not visible in this file - restore the type argument
+    private final Set caseIdSet = new LinkedHashSet<>();
+    private int copyNumberSegmentDataCount;
+
+    private static final Log LOG = LogFactory.getLog(CopyNumberSegmentDataWriter.class);
+
+    @Override
+    public void open(ExecutionContext executionContext) throws ItemStreamException {
+        LOG.info("Beginning copy number segment data batch import");
+    }
+
+    /**
+     * Publishes the copy number segment data count and the case id set for the step listener.
+     */
+    @Override
+    public void update(ExecutionContext executionContext) throws ItemStreamException {
+        executionContext.put("copyNumberSegmentDataCount", copyNumberSegmentDataCount);
+        executionContext.put("caseList", caseIdSet);
+    }
+
+    @Override
+    public void close() throws ItemStreamException {}
+
+    /**
+     * Records the sample id of every segment, imports the batch and adds the
+     * number of rows reported by the DAO to the running count.
+     *
+     * @param list batch of copy number segments to import
+     */
+    @Override
+    public void write(List<? extends CopyNumberSegment> list) throws Exception {
+        // add all case ids to case id set
+        for (CopyNumberSegment cns : list) {
+            caseIdSet.add(cns.getSampleId());
+        }
+        // import batch of copy number segment records and update copy number segment data count;
+        // a typed copy is handed to the DAO instead of an unchecked raw cast of the wildcard list
+        int rowsAffected = copyNumberSegmentJdbcDaoImpl.addCopyNumberSegmentBatch(new ArrayList<CopyNumberSegment>(list));
+        this.copyNumberSegmentDataCount += rowsAffected;
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/GisticDataWriter.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/GisticDataWriter.java
new file mode 100644
index 0000000..8f4051b
--- /dev/null
+++ 
b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/GisticDataWriter.java @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+*/
+
+package org.cbio.portal.pipelines.importer.config.writer;
+
+import org.mskcc.cbio.model.*;
+import org.mskcc.cbio.persistence.jdbc.*;
+
+import java.util.*;
+import org.apache.commons.logging.*;
+
+import org.springframework.batch.item.*;
+import org.springframework.beans.factory.annotation.Autowired;
+
+/**
+ * Item writer that assigns gistic roi ids to gistic records and their genes,
+ * imports both in batches through {@link GisticJdbcDaoImpl} and publishes the
+ * running counts to the execution context.
+ *
+ * @author ochoaa
+ */
+public class GisticDataWriter implements ItemStreamWriter<Gistic> {
+
+    @Autowired
+    GisticJdbcDaoImpl gisticJdbcDaoImpl;
+
+    // largest gistic roi id handed out so far; pre-incremented for every new gistic record
+    private Integer nextGisticRoiId;
+
+    private int gisticDataCount;
+    private int gisticGeneDataCount;
+
+    // NOTE(review): element type is not visible in this file - restore the type argument
+    private final Set entrezGeneIdSet = new HashSet<>();
+
+    private static final Log LOG = LogFactory.getLog(GisticDataWriter.class);
+
+    @Override
+    public void open(ExecutionContext executionContext) throws ItemStreamException {
+        // NOTE(review): assumes the DAO may return null when no gistic rows exist yet - confirm;
+        // falling back to 0 makes the first assigned id 1 instead of failing on unboxing
+        Integer largestGisticRoiId = gisticJdbcDaoImpl.getLargestGisticRoiId();
+        this.nextGisticRoiId = (largestGisticRoiId == null) ? 0 : largestGisticRoiId;
+        LOG.info("Beginning gistic data batch import");
+    }
+
+    /**
+     * Publishes the gistic data counts and the total gene count for the step listener.
+     */
+    @Override
+    public void update(ExecutionContext executionContext) throws ItemStreamException {
+        executionContext.put("gisticDataCount", gisticDataCount);
+        executionContext.put("gisticGeneDataCount", gisticGeneDataCount);
+        executionContext.put("totalGeneCount", entrezGeneIdSet.size());
+    }
+
+    @Override
+    public void close() throws ItemStreamException {}
+
+    /**
+     * Assigns the next gistic roi id to every record and to the genes in its
+     * region, then imports the records and the genes as two batches.
+     *
+     * @param list batch of gistic records to import
+     */
+    @Override
+    public void write(List<? extends Gistic> list) throws Exception {
+        List<Gistic> gisticList = new ArrayList<>();
+        // NOTE(review): the element type of getGenesInRegion() is not visible here, so this stays raw
+        List gisticGeneList = new ArrayList();
+
+        for (Gistic gistic : list) {
+            // update gistic roi id
+            gistic.setGisticRoiId(++nextGisticRoiId);
+            gisticList.add(gistic);
+
+            // single pass over the genes: register the gene and give it the record's roi id
+            gistic.getGenesInRegion().forEach((gene) -> {
+                entrezGeneIdSet.add(gene.getEntrezGeneId());
+                gene.setGisticRoiId(nextGisticRoiId);
+            });
+            gisticGeneList.addAll(gistic.getGenesInRegion());
+        }
+
+        // import gistic data and gistic genes to each table if not empty
+        if (!gisticList.isEmpty()) {
+            int rowsAffected = gisticJdbcDaoImpl.addGisticBatch(gisticList);
+            this.gisticDataCount += rowsAffected;
+        }
+        if (!gisticGeneList.isEmpty()) {
+            int rowsAffected = gisticJdbcDaoImpl.addGisticGenesBatch(gisticGeneList);
+            this.gisticGeneDataCount += rowsAffected;
+        }
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/MutSigDataWriter.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/MutSigDataWriter.java
new file mode 100644
index 0000000..61700e7
--- /dev/null
+++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/MutSigDataWriter.java
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS
+ * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder
+ * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no
+ * obligations to provide maintenance, support, updates, enhancements or
+ * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be
+ * liable to any party for direct, indirect, special, incidental or
+ * consequential damages, including lost profits, arising out of the use of this
+ * software and its documentation, even if Memorial Sloan-Kettering Cancer
+ * Center has been advised of the possibility of such damage.
+ */
+
+/*
+ * This file is part of cBioPortal.
+ *
+ * cBioPortal is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+*/
+
+package org.cbio.portal.pipelines.importer.config.writer;
+
+import org.mskcc.cbio.model.*;
+import org.mskcc.cbio.persistence.jdbc.*;
+
+import java.util.*;
+import org.apache.commons.logging.*;
+
+import org.springframework.batch.item.*;
+import org.springframework.beans.factory.annotation.Autowired;
+
+/**
+ * Item writer that filters out mutsig records with a high q-value, imports the
+ * remaining records in batches through {@link MutSigJdbcDaoImpl} and publishes
+ * the running counts to the execution context.
+ *
+ * @author ochoaa
+ */
+public class MutSigDataWriter implements ItemStreamWriter<MutSig> {
+
+    // records whose q-value is greater than or equal to this threshold are rejected
+    private static final double Q_VALUE_THRESHOLD = 0.1;
+
+    @Autowired
+    MutSigJdbcDaoImpl mutSigJdbcDaoImpl;
+
+    private int mutSigDataCount;
+    private int qValueRejects;
+
+    // NOTE(review): element type is not visible in this file - restore the type argument
+    private final Set entrezGeneIdSet = new HashSet<>();
+
+    private static final Log LOG = LogFactory.getLog(MutSigDataWriter.class);
+
+    @Override
+    public void open(ExecutionContext executionContext) throws ItemStreamException {
+        LOG.info("Beginning mutsig data import");
+    }
+
+    /**
+     * Publishes the mutsig data count, the q-value reject count and the total
+     * gene count for the step listener.
+     */
+    @Override
+    public void update(ExecutionContext executionContext) throws ItemStreamException {
+        executionContext.put("mutSigDataCount", mutSigDataCount);
+        executionContext.put("qValueRejects", qValueRejects);
+        executionContext.put("totalGeneCount", entrezGeneIdSet.size());
+    }
+
+    @Override
+    public void close() throws ItemStreamException {}
+
+    /**
+     * Rejects records at or above the q-value threshold and imports the rest
+     * as one batch.
+     *
+     * @param list batch of mutsig records to import
+     */
+    @Override
+    public void write(List<? extends MutSig> list) throws Exception {
+        List<MutSig> mutSigData = new ArrayList<>();
+        for (MutSig mutSig : list) {
+            // filter out high q values
+            if (mutSig.getQValue() >= Q_VALUE_THRESHOLD) {
+                this.qValueRejects++;
+                continue;
+            }
+            // update entrez gene id set and add mut sig record to list
+            entrezGeneIdSet.add(mutSig.getEntrezGeneId());
+            mutSigData.add(mutSig);
+        }
+        // nothing to import when every record of the batch was rejected
+        if (mutSigData.isEmpty()) {
+            return;
+        }
+        // import batch of mutsig records and update mutsig data count (mutsig records imported)
+        int rowsAffected = mutSigJdbcDaoImpl.addMutSigBatch(mutSigData);
+        this.mutSigDataCount += rowsAffected;
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/MutationDataWriter.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/MutationDataWriter.java
new file mode 100644
index 0000000..93d7b1c
--- /dev/null
+++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/MutationDataWriter.java
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS
+ * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder
+ * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no
+ * obligations to provide maintenance, support, updates, enhancements or
+ * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be
+ * liable to any party for direct, indirect, special, incidental or
+ * consequential damages, including lost profits, arising out of the use of this
+ * software and its documentation, even if Memorial Sloan-Kettering Cancer
+ * Center has been advised of the possibility of such damage.
+ */
+
+/*
+ * This file is part of cBioPortal.
+ *
+ * cBioPortal is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+*/
+
+package org.cbio.portal.pipelines.importer.config.writer;
+
+import org.mskcc.cbio.model.*;
+import org.mskcc.cbio.persistence.jdbc.*;
+import org.cbio.portal.pipelines.importer.util.*;
+import org.cbio.portal.pipelines.importer.config.composite.CompositeMutationData;
+
+import java.util.*;
+import org.apache.commons.logging.*;
+
+import org.springframework.batch.item.*;
+import org.springframework.beans.factory.annotation.Autowired;
+
+/**
+ * Item writer that imports mutation records one at a time: it makes sure a
+ * sample profile exists, reuses or creates the mutation event, then merges
+ * into or inserts the mutation. Running counts, the mutation filter and the
+ * case list are published to the execution context.
+ *
+ * @author ochoaa
+ */
+public class MutationDataWriter implements ItemStreamWriter<CompositeMutationData> {
+
+    @Autowired
+    SampleProfileJdbcDaoImpl sampleProfileJdbcDaoImpl;
+
+    @Autowired
+    MutationJdbcDaoImpl mutationJdbcDaoImpl;
+
+    // largest mutation event id handed out so far; pre-incremented for every new mutation event
+    private Integer nextMutationEventId;
+    private MutationFilter mutationFilter;
+
+    private boolean isMutationDatatype;
+    private int mutationDataCount;
+    private int mutationEventDataCount;
+    private Integer genePanelId;
+
+    // NOTE(review): element types are not visible in this file - restore the type arguments
+    private final Set caseIdSet = new LinkedHashSet<>();
+    private final Set entrezGeneIdSet = new HashSet<>();
+
+    private static final Log LOG = LogFactory.getLog(MutationDataWriter.class);
+
+    /**
+     * Creates the mutation filter and reads the largest mutation event id from
+     * the DAO and the "isMutationDatatype" / "genePanelId" entries from the
+     * execution context.
+     */
+    @Override
+    public void open(ExecutionContext executionContext) throws ItemStreamException {
+        this.mutationFilter = new MutationFilter();
+        // NOTE(review): assumes the DAO may return null when no mutation events exist yet - confirm;
+        // falling back to 0 makes the first assigned id 1 instead of failing on unboxing
+        Integer largestMutationEventId = mutationJdbcDaoImpl.getLargestMutationEventId();
+        this.nextMutationEventId = (largestMutationEventId == null) ? 0 : largestMutationEventId;
+        this.isMutationDatatype = (boolean) executionContext.get("isMutationDatatype");
+        this.genePanelId = (Integer) executionContext.get("genePanelId");
+        LOG.info("Beginning mutation data batch import");
+    }
+
+    /**
+     * Publishes the mutation and mutation event data counts, the mutation
+     * filter, the total gene count and the case list for the step listener.
+     */
+    @Override
+    public void update(ExecutionContext executionContext) throws ItemStreamException {
+        executionContext.put("mutationDataCount", mutationDataCount);
+        executionContext.put("mutationEventDataCount", mutationEventDataCount);
+        executionContext.put("mutationFilter", mutationFilter);
+        executionContext.put("totalGeneCount", entrezGeneIdSet.size());
+        executionContext.put("caseList", caseIdSet);
+    }
+
+    @Override
+    public void close() throws ItemStreamException {}
+
+    /**
+     * Imports every accepted mutation of the batch together with its sample
+     * profile and mutation event.
+     *
+     * @param list batch of composite mutation data to import
+     */
+    @Override
+    public void write(List<? extends CompositeMutationData> list) throws Exception {
+        for (CompositeMutationData cmd : list) {
+            // the mutation filter is only applied when the datatype is a mutation datatype
+            if (isMutationDatatype && !mutationFilter.acceptMutation(cmd.getMutation())) {
+                continue;
+            }
+            // add case id and gene to sets
+            caseIdSet.add(cmd.getMutation().getSampleId());
+            entrezGeneIdSet.add(cmd.getMutation().getEntrezGeneId());
+
+            // add sample profile if it does not already exist for genetic profile
+            if (!sampleProfileJdbcDaoImpl.existsInGeneticProfile(cmd.getMutation().getSampleId(), cmd.getMutation().getGeneticProfileId())) {
+                sampleProfileJdbcDaoImpl.addSampleProfile(cmd.getMutation().getSampleId(), cmd.getMutation().getGeneticProfileId(), genePanelId);
+            }
+
+            // check if mutation event already exists
+            MutationEvent existingMutationEvent = mutationJdbcDaoImpl.getMutationEvent(cmd.getMutation().getMutationEvent());
+            if (existingMutationEvent != null) {
+                // update mutation with existing mutation event
+                cmd.updateMutationEvent(existingMutationEvent);
+            }
+            else {
+                // update mutation event id with next mutation event id
+                cmd.setMutationEventId(++nextMutationEventId);
+                mutationJdbcDaoImpl.addMutationEvent(cmd.getMutation().getMutationEvent());
+                this.mutationEventDataCount++;
+            }
+
+            // check if mutation already exists; the mutation is re-read from the composite
+            // because the mutation event was just updated through it
+            Mutation existingMutation = mutationJdbcDaoImpl.getMutation(cmd.getMutation());
+            if (existingMutation != null) {
+                // if exists then merge current mutation with existing mutation
+                Mutation mergedMutation = MutationDataUtils.mergeMutationData(existingMutation, cmd.getMutation());
+                mutationJdbcDaoImpl.updateMutation(mergedMutation);
+            }
+            else {
+                mutationJdbcDaoImpl.addMutation(cmd.getMutation());
+                this.mutationDataCount++;
+            }
+        }
+    }
+
+}
diff --git 
a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/ProfileDataWriter.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/ProfileDataWriter.java new file mode 100644 index 0000000..ed516e3 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/ProfileDataWriter.java @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+*/
+
+package org.cbio.portal.pipelines.importer.config.writer;
+
+import org.mskcc.cbio.model.*;
+import org.mskcc.cbio.persistence.jdbc.*;
+import org.cbio.portal.pipelines.importer.model.ProfileDataRecord;
+import org.cbio.portal.pipelines.importer.config.composite.CompositeProfileData;
+import org.cbio.portal.pipelines.importer.util.GeneDataUtils;
+
+import java.util.*;
+import org.apache.commons.logging.*;
+
+import org.springframework.batch.item.*;
+import org.springframework.beans.factory.annotation.Autowired;
+
+/**
+ * Item writer that turns profile data records into one composite profile data
+ * object per gene, imports the genetic alterations and, for CNA data, the CNA
+ * events and sample CNA events. Running counts are published to the execution
+ * context.
+ *
+ * @author ochoaa
+ */
+public class ProfileDataWriter implements ItemStreamWriter<ProfileDataRecord> {
+
+    @Autowired
+    GeneticAlterationJdbcDaoImpl geneticAlterationJdbcDaoImpl;
+
+    @Autowired
+    CnaEventJdbcDaoImpl cnaEventJdbcDaoImpl;
+
+    @Autowired
+    GeneDataUtils geneDataUtils;
+
+    // largest cna event id handed out so far; pre-incremented for every new cna event
+    private Integer nextCnaEventId;
+    private GeneticProfile geneticProfile;
+
+    private boolean isCnaData;
+    private int cnaEventCount;
+    private int sampleCnaEventCount;
+    private int geneticAlterationCount;
+    private int additionalEntriesSkipped;
+    private int validExtraRecords;
+    private int skippedExtraRecords;
+
+    // NOTE(review): element types are not visible in this file - restore the type arguments
+    private final Set arrayIdSet = new HashSet<>();
+    private final Set entrezGeneIdSet = new HashSet<>();
+
+    private static final Log LOG = LogFactory.getLog(ProfileDataWriter.class);
+
+    /**
+     * Reads the largest cna event id from the DAO and the "isCnaData" /
+     * "geneticProfile" entries from the execution context.
+     */
+    @Override
+    public void open(ExecutionContext executionContext) throws ItemStreamException {
+        // NOTE(review): assumes the DAO may return null when no cna events exist yet - confirm;
+        // falling back to 0 makes the first assigned id 1 instead of failing on unboxing
+        Integer largestCnaEventId = cnaEventJdbcDaoImpl.getLargestCnaEventId();
+        this.nextCnaEventId = (largestCnaEventId == null) ? 0 : largestCnaEventId;
+        this.isCnaData = (boolean) executionContext.get("isCnaData");
+        this.geneticProfile = (GeneticProfile) executionContext.get("geneticProfile");
+        LOG.info("Beginning profile data batch import for genetic profile: " + geneticProfile.getStableId());
+    }
+
+    /**
+     * Publishes the import counts for the step listener; the cna counts are
+     * only published for cna data.
+     */
+    @Override
+    public void update(ExecutionContext executionContext) throws ItemStreamException {
+        executionContext.put("geneticAlterationCount", geneticAlterationCount);
+        executionContext.put("validExtraRecords", validExtraRecords);
+        executionContext.put("skippedExtraRecords", skippedExtraRecords);
+        executionContext.put("additionalEntriesSkipped", additionalEntriesSkipped);
+        executionContext.put("totalGeneCount", entrezGeneIdSet.size());
+        if (isCnaData) {
+            executionContext.put("cnaEventCount", cnaEventCount);
+            executionContext.put("sampleCnaEventCount", sampleCnaEventCount);
+        }
+    }
+
+    @Override
+    public void close() throws ItemStreamException {}
+
+    /**
+     * Builds the composite profile data for the batch and imports it.
+     *
+     * @param list batch of profile data records to import
+     */
+    @Override
+    public void write(List<? extends ProfileDataRecord> list) throws Exception {
+        List<CompositeProfileData> compositeProfileDataList = new ArrayList<>();
+
+        // go through records and generate list of composite profile data
+        for (ProfileDataRecord pdr : list) {
+            // skip records with array ids that have already been added
+            if (pdr.isRppaProfile() && !arrayIdSet.add(pdr.getArrayId())) {
+                LOG.warn("Array Id found to be duplicated: " + pdr.getArrayId() + ". Record will be skipped");
+                this.additionalEntriesSkipped++;
+                continue;
+            }
+
+            // if cna data then only create one composite profile data object
+            // and load cna events for the profile data record
+            if (isCnaData) {
+                Gene gene = pdr.getCompositeGeneList().get(0);
+                if (isGeneAlreadyLoaded(gene)) {
+                    continue;
+                }
+                CompositeProfileData cpd = createCompositeProfileData(gene, pdr);
+                cpd.setProfileCnaEvents(pdr.getCnaEvents());
+                compositeProfileDataList.add(cpd);
+                continue;
+            }
+
+            // if not cna data then generate composite profile data for each gene in list
+            List<Gene> geneList = pdr.getCompositeGeneList();
+            for (int i = 0; i < geneList.size(); i++) {
+                Gene gene = geneList.get(i);
+                if (isGeneAlreadyLoaded(gene)) {
+                    continue;
+                }
+                // the first gene is always kept; extra genes only if microRNA or if RPPA profile
+                if (i > 0) {
+                    if (!(geneDataUtils.MIRNA_TYPE.equals(gene.getType()) || pdr.isRppaProfile())) {
+                        LOG.warn("Skipping ambiguous gene symbol: " + gene.getHugoGeneSymbol());
+                        this.skippedExtraRecords++;
+                        continue;
+                    }
+                    this.validExtraRecords++;
+                }
+                // each composite is added exactly once; previously the per-record list was
+                // appended again on every gene iteration, duplicating earlier genes
+                compositeProfileDataList.add(createCompositeProfileData(gene, pdr));
+            }
+        }
+
+        // import genetic alteration data and cna event data if any
+        for (CompositeProfileData cpd : compositeProfileDataList) {
+            // first add genetic alteration record
+            geneticAlterationJdbcDaoImpl.addGeneticAlterations(cpd.getGeneticProfile(), cpd.getGene(), 
+                    new ArrayList<>(cpd.getCaseProfileDataRecords().values()));
+            this.geneticAlterationCount++;
+            if (isCnaData) {
+                importCnaEvents(cpd);
+            }
+        }
+    }
+
+    /**
+     * Registers the gene as loaded; returns true, after logging and counting
+     * the skip, when the gene had already been loaded from the datafile.
+     */
+    private boolean isGeneAlreadyLoaded(Gene gene) {
+        if (entrezGeneIdSet.add(gene.getEntrezGeneId())) {
+            return false;
+        }
+        LOG.warn("Skipping entry since data has already been loaded for gene: " + gene.getHugoGeneSymbol());
+        this.additionalEntriesSkipped++;
+        return true;
+    }
+
+    /**
+     * Creates the composite profile data for one gene of a profile data record.
+     */
+    private CompositeProfileData createCompositeProfileData(Gene gene, ProfileDataRecord pdr) {
+        CompositeProfileData cpd = new CompositeProfileData();
+        cpd.setGene(gene);
+        cpd.setGeneticProfile(geneticProfile);
+        cpd.setCaseProfileDataRecords(pdr.getCaseProfileDataMap());
+        return cpd;
+    }
+
+    /**
+     * Imports the cna events of a composite profile data object, if it has any,
+     * reusing the id of cna events that already exist.
+     */
+    private void importCnaEvents(CompositeProfileData cpd) {
+        // import cna events in composite profile data if not null or empty
+        if (cpd.getProfileCnaEvents() == null || cpd.getProfileCnaEvents().isEmpty()) {
+            return;
+        }
+        for (CnaEvent cnaEvent : cpd.getProfileCnaEvents()) {
+            SampleCnaEvent sampleCnaEvent = cnaEvent.getSampleCnaEvent();
+
+            // check if cna event already exists
+            CnaEvent existingCnaEvent = cnaEventJdbcDaoImpl.getCnaEvent(cnaEvent);
+            if (existingCnaEvent != null) {
+                // if cna event already exists then update cna event id and import only the sample cna event
+                sampleCnaEvent.setCnaEventId(existingCnaEvent.getCnaEventId());
+            }
+            else {
+                // update cna event id with next cna event id and import both the cna event and sample cna event
+                cnaEvent.setCnaEventId(++nextCnaEventId);
+                cnaEventJdbcDaoImpl.addCnaEvent(cnaEvent);
+                sampleCnaEvent.setCnaEventId(cnaEvent.getCnaEventId());
+                this.cnaEventCount++;
+            }
+            cnaEventJdbcDaoImpl.addSampleCnaEvent(sampleCnaEvent);
+            this.sampleCnaEventCount++;
+        }
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/StructuralVariantDataWriter.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/StructuralVariantDataWriter.java
new file mode 100644
index 0000000..feaccdf
--- /dev/null
+++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/StructuralVariantDataWriter.java
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS
+ * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder
+ * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no
+ * obligations to provide maintenance, support, updates, enhancements or
+ * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be
+ * liable to any party for direct, indirect, special, incidental or
+ * consequential damages, including lost profits, arising out of the use of this
+ * software and its documentation, even if Memorial Sloan-Kettering Cancer
+ * Center has been advised of the possibility of such damage.
+ */
+
+/*
+ * This file is part of cBioPortal.
+ *
+ * cBioPortal is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+*/
+
+package org.cbio.portal.pipelines.importer.config.writer;
+
+import org.mskcc.cbio.model.*;
+import org.mskcc.cbio.persistence.jdbc.*;
+
+import java.util.*;
+import org.apache.commons.logging.*;
+
+import org.springframework.batch.item.*;
+import org.springframework.beans.factory.annotation.Autowired;
+
+/**
+ * Item writer that makes sure a sample profile exists for every structural
+ * variant, imports the variants in batches through
+ * {@link StructuralVariantJdbcDaoImpl} and publishes the running count and the
+ * case list to the execution context.
+ *
+ * @author ochoaa
+ */
+public class StructuralVariantDataWriter implements ItemStreamWriter<StructuralVariant> {
+
+    @Autowired
+    SampleProfileJdbcDaoImpl sampleProfileJdbcDaoImpl;
+
+    @Autowired
+    StructuralVariantJdbcDaoImpl structuralVariantJdbcDaoImpl;
+
+    private int structuralVariantDataCount;
+    private Integer genePanelId;
+    // NOTE(review): element type is not visible in this file - restore the type argument
+    private final Set caseIdSet = new LinkedHashSet<>();
+
+    private static final Log LOG = LogFactory.getLog(StructuralVariantDataWriter.class);
+
+    @Override
+    public void open(ExecutionContext executionContext) throws ItemStreamException {
+        this.genePanelId = (Integer) executionContext.get("genePanelId");
+        LOG.info("Beginning structural variant data import");
+    }
+
+    /**
+     * Publishes the structural variant data count and the case list for the step listener.
+     */
+    @Override
+    public void update(ExecutionContext executionContext) throws ItemStreamException {
+        executionContext.put("structuralVariantDataCount", structuralVariantDataCount);
+        executionContext.put("caseList", caseIdSet);
+    }
+
+    @Override
+    public void close() throws ItemStreamException {}
+
+    /**
+     * Records the sample id of every variant, adds missing sample profiles and
+     * imports the batch.
+     *
+     * @param list batch of structural variants to import
+     */
+    @Override
+    public void write(List<? extends StructuralVariant> list) throws Exception {
+        for (StructuralVariant sv : list) {
+            // update case id set
+            caseIdSet.add(sv.getSampleId());
+
+            // add sample profile if it does not already exist for genetic profile
+            if (!sampleProfileJdbcDaoImpl.existsInGeneticProfile(sv.getSampleId(), sv.getGeneticProfileId())) {
+                sampleProfileJdbcDaoImpl.addSampleProfile(sv.getSampleId(), sv.getGeneticProfileId(), genePanelId);
+            }
+        }
+
+        // import batch of structural variant records and update structural variant
+        // data count (structural variant records imported); a typed copy is handed to
+        // the DAO instead of an unchecked raw cast of the wildcard list
+        int rowsAffected = structuralVariantJdbcDaoImpl.addStructuralVariantBatch(new ArrayList<StructuralVariant>(list));
+        this.structuralVariantDataCount += rowsAffected;
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/TimelineDataWriter.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/TimelineDataWriter.java
new file mode 100644
index 0000000..33ab156
--- /dev/null
+++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/config/writer/TimelineDataWriter.java
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS
+ * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder
+ * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no
+ * obligations to provide maintenance, support, updates, enhancements or
+ * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be
+ * liable to any party for direct, indirect, special, incidental or
+ * consequential damages, including lost profits, arising out of the use of this
+ * software and its documentation, even if Memorial Sloan-Kettering Cancer
+ * Center has been advised of the possibility of such damage.
+ */
+
+/*
+ * This file is part of cBioPortal.
+ *
+ * cBioPortal is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+*/
+
+package org.cbio.portal.pipelines.importer.config.writer;
+
+import org.mskcc.cbio.model.*;
+import org.mskcc.cbio.persistence.jdbc.*;
+
+import java.util.*;
+import org.apache.commons.logging.*;
+
+import org.springframework.batch.item.*;
+import org.springframework.beans.factory.annotation.Autowired;
+
+/**
+ * Item writer that assigns clinical event ids to clinical events and their
+ * clinical event data, imports both in batches through
+ * {@link ClinicalEventJdbcDaoImpl} and publishes the running counts and the
+ * case list to the execution context.
+ *
+ * @author ochoaa
+ */
+public class TimelineDataWriter implements ItemStreamWriter<ClinicalEvent> {
+
+    @Autowired
+    ClinicalEventJdbcDaoImpl clinicalEventJdbcDaoImpl;
+
+    // largest clinical event id handed out so far; pre-incremented for every new clinical event
+    private Integer nextClinicalEventId;
+
+    private int clinicalEventCount;
+    private int clinicalEventDataCount;
+    // NOTE(review): element type is not visible in this file - restore the type argument
+    private final Set caseIdSet = new LinkedHashSet<>();
+
+    private static final Log LOG = LogFactory.getLog(TimelineDataWriter.class);
+
+    @Override
+    public void open(ExecutionContext executionContext) throws ItemStreamException {
+        // NOTE(review): assumes the DAO may return null when no clinical events exist yet - confirm;
+        // falling back to 0 makes the first assigned id 1 instead of failing on unboxing
+        Integer largestClinicalEventId = clinicalEventJdbcDaoImpl.getLargestClinicalEventId();
+        this.nextClinicalEventId = (largestClinicalEventId == null) ? 0 : largestClinicalEventId;
+        LOG.info("Beginning clinical event data batch import");
+    }
+
+    /**
+     * Publishes the clinical event and clinical event data counts and the case
+     * list for the step listener.
+     */
+    @Override
+    public void update(ExecutionContext executionContext) throws ItemStreamException {
+        executionContext.put("clinicalEventCount", clinicalEventCount);
+        executionContext.put("clinicalEventDataCount", clinicalEventDataCount);
+        executionContext.put("caseList", caseIdSet);
+    }
+
+    @Override
+    public void close() throws ItemStreamException {}
+
+    /**
+     * Assigns the next clinical event id to every event and to its clinical
+     * event data, then imports the events and the event data as two batches.
+     *
+     * @param list batch of clinical events to import
+     */
+    @Override
+    public void write(List<? extends ClinicalEvent> list) throws Exception {
+        List<ClinicalEvent> clinicalEventList = new ArrayList<>();
+        // NOTE(review): the element type of getClinicalEventData() is not visible here, so this stays raw
+        List clinicalEventDataList = new ArrayList();
+
+        // go through list and add all clinical events and clinical event data
+        // to lists for import
+        for (ClinicalEvent clinicalEvent : list) {
+            // add case id to case id set
+            caseIdSet.add(clinicalEvent.getPatientId());
+
+            // update clinical event id for clinical event
+            clinicalEvent.setClinicalEventId(++nextClinicalEventId);
+            clinicalEventList.add(clinicalEvent);
+
+            // give the event's data the same clinical event id; both calls are no-ops
+            // when the event has no clinical event data
+            clinicalEvent.getClinicalEventData().forEach((ced) -> {
+                ced.setClinicalEventId(nextClinicalEventId);
+            });
+            clinicalEventDataList.addAll(clinicalEvent.getClinicalEventData());
+        }
+
+        // import clinical events and clinical event data to each table if not empty
+        if (!clinicalEventList.isEmpty()) {
+            int rowsAffected = clinicalEventJdbcDaoImpl.addClinicalEventBatch(clinicalEventList);
+            this.clinicalEventCount += rowsAffected;
+        }
+        if (!clinicalEventDataList.isEmpty()) {
+            int rowsAffected = clinicalEventJdbcDaoImpl.addClinicalEventDataBatch(clinicalEventDataList);
+            this.clinicalEventDataCount += rowsAffected;
+        }
+    }
+
+}
diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/model/CopyNumberSegmentRecord.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/model/CopyNumberSegmentRecord.java
new file mode 100644
index 0000000..d47dbee
--- /dev/null
+++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/model/CopyNumberSegmentRecord.java
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS
+ * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder
+ * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no
+ * obligations to provide maintenance, support, updates, enhancements or
+ * modifications. 
In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +package org.cbio.portal.pipelines.importer.model; + +import java.util.*; + +/** + * Bean for one copy number segment record: cancer study and sample ids plus the segment column values, held as strings. + * @author ochoaa + */ +public class CopyNumberSegmentRecord { + + private Integer cancerStudyId; + private Integer sampleId; + private String id; + private String chrom; + private String locStart; + private String locEnd; + private String numProbes; + private String segMean; + + /** + * @return the cancerStudyId + */ + public Integer getCancerStudyId() { + return cancerStudyId; + } + + /** + * @param cancerStudyId the cancerStudyId to set + */ + public void setCancerStudyId(Integer cancerStudyId) { + this.cancerStudyId = cancerStudyId; + } + + /** + * @return the sampleId + */ + public Integer getSampleId() { + return sampleId; + } + + /** + * @param sampleId the sampleId to set + */ + public void setSampleId(Integer sampleId) { + this.sampleId = sampleId; + } + + /** + * @return the id (the field the "ID" column maps to in getCopyNumberSegmentStagingDataMap()) + */ + public String getId() { + return id; + } + + /** + * @param id the id to set + */ + public void setId(String id) { + this.id = id; + } + + /** + * @return the chrom + */ + public String getChrom() { + return chrom; + } + + /** + * @param chrom the chrom to set + */ + public void setChrom(String chrom) { + this.chrom = chrom; + } + + /** + * @return the locStart + */ + public String getLocStart() { + return locStart; + } + + /** + * @param locStart the locStart to set + */ + public void setLocStart(String locStart) { + this.locStart = locStart; + } + + /** + * @return the locEnd + */ + public String getLocEnd() { + return locEnd; + } + + /** + * @param locEnd the locEnd to set + */ + public void setLocEnd(String locEnd) { + this.locEnd = locEnd; + } + + /** + * @return the numProbes (the field the "num.mark" column maps to in getCopyNumberSegmentStagingDataMap()) + */ + public String getNumProbes() { + return numProbes; + } + + /** + * @param numProbes the numProbes to set + */ + public void setNumProbes(String numProbes) { + this.numProbes = numProbes; + } + + /** + * @return the segMean + */ + public String getSegMean() { + return segMean; + } + + /** + * @param
segMean the segMean to set + */ + public void setSegMean(String segMean) { + this.segMean = segMean; + } + + /** + * @return a newly built copy number segment staging data map (column -> field); the "chrom" and "chromosome" columns both map to the chrom field + */ + public HashMap getCopyNumberSegmentStagingDataMap() { + HashMap map = new HashMap<>(); + map.put("ID", "id"); + map.put("chrom", "chrom"); + map.put("chromosome", "chrom"); + map.put("loc.start", "locStart"); + map.put("loc.end", "locEnd"); + map.put("num.mark", "numProbes"); + map.put("seg.mean", "segMean"); + + return map; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/model/FusionRecord.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/model/FusionRecord.java new file mode 100644 index 0000000..123d1d3 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/model/FusionRecord.java @@ -0,0 +1,243 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.cbio.portal.pipelines.importer.model; + +import java.util.*; + +/** + * Bean for one fusion record: sample and genetic profile ids plus the fusion column values, held as strings. + * @author ochoaa + */ +public class FusionRecord { + + private Integer sampleId; + private Integer geneticProfileId; + private String hugoSymbol; + private String entrezGeneId; + private String center; + private String tumorSampleBarcode; + private String fusion; + private String dnaSupport; + private String rnaSupport; + private String method; + private String frame; + private String comments; + + /** + * @return the sampleId + */ + public Integer getSampleId() { + return sampleId; + } + + /** + * @param sampleId the sampleId to set + */ + public void setSampleId(Integer sampleId) { + this.sampleId = sampleId; + } + + /** + * @return the geneticProfileId + */ + public Integer getGeneticProfileId() { + return geneticProfileId; + } + + /** + * @param geneticProfileId the geneticProfileId to set + */ + public void setGeneticProfileId(Integer geneticProfileId) { + this.geneticProfileId = geneticProfileId; + } + + /** + * @return the hugoSymbol + */ + public String getHugoSymbol() { + return hugoSymbol; + } + + /** + * @param hugoSymbol the hugoSymbol to set + */ + public void setHugoSymbol(String hugoSymbol) { + this.hugoSymbol = hugoSymbol; + } + + /** + * @return the entrezGeneId + */ + public String getEntrezGeneId() { + return entrezGeneId; + } + + /** + * @param entrezGeneId the entrezGeneId to set + */ + public void setEntrezGeneId(String entrezGeneId) { + this.entrezGeneId = entrezGeneId; + } + + /** + * @return the center + */ + public String getCenter() { + return center; + } + + /** + *
@param center the center to set + */ + public void setCenter(String center) { + this.center = center; + } + + /** + * @return the tumorSampleBarcode + */ + public String getTumorSampleBarcode() { + return tumorSampleBarcode; + } + + /** + * @param tumorSampleBarcode the tumorSampleBarcode to set + */ + public void setTumorSampleBarcode(String tumorSampleBarcode) { + this.tumorSampleBarcode = tumorSampleBarcode; + } + + /** + * @return the fusion + */ + public String getFusion() { + return fusion; + } + + /** + * @param fusion the fusion to set + */ + public void setFusion(String fusion) { + this.fusion = fusion; + } + + /** + * @return the dnaSupport + */ + public String getDnaSupport() { + return dnaSupport; + } + + /** + * @param dnaSupport the dnaSupport to set + */ + public void setDnaSupport(String dnaSupport) { + this.dnaSupport = dnaSupport; + } + + /** + * @return the rnaSupport + */ + public String getRnaSupport() { + return rnaSupport; + } + + /** + * @param rnaSupport the rnaSupport to set + */ + public void setRnaSupport(String rnaSupport) { + this.rnaSupport = rnaSupport; + } + + /** + * @return the method + */ + public String getMethod() { + return method; + } + + /** + * @param method the method to set + */ + public void setMethod(String method) { + this.method = method; + } + + /** + * @return the frame + */ + public String getFrame() { + return frame; + } + + /** + * @param frame the frame to set + */ + public void setFrame(String frame) { + this.frame = frame; + } + + /** + * @return the comments + */ + public String getComments() { + return comments; + } + + /** + * @param comments the comments to set + */ + public void setComments(String comments) { + this.comments = comments; + } + + /** + * @return a newly built Fusion staging data map (column -> field); the two Integer id fields have no column entry + */ + public Map getFusionStagingData() { + Map map = new HashMap<>(); + map.put("Hugo_Symbol", "hugoSymbol"); + map.put("Entrez_Gene_Id", "entrezGeneId"); + map.put("Center", "center"); +
map.put("Tumor_Sample_Barcode", "tumorSampleBarcode"); + map.put("Fusion", "fusion"); + map.put("DNA_support", "dnaSupport"); + map.put("RNA_support", "rnaSupport"); + map.put("Method", "method"); + map.put("Frame", "frame"); + map.put("Comments", "comments"); + + return map; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/model/GisticRecord.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/model/GisticRecord.java new file mode 100644 index 0000000..7792880 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/model/GisticRecord.java @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details.
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.cbio.portal.pipelines.importer.model; + +import java.util.*; + +/** + * Bean for one gistic record: the cancer study id plus the gistic column values, held as strings. + * @author ochoaa + */ +public class GisticRecord { + + private Integer cancerStudyId; + private String chromosome; + private String peakStart; + private String peakEnd; + private String genesInRegion; + private String qValue; + private String cytoband; + private String amp; + + /** + * @return the cancerStudyId + */ + public Integer getCancerStudyId() { + return cancerStudyId; + } + + /** + * @param cancerStudyId the cancerStudyId to set + */ + public void setCancerStudyId(Integer cancerStudyId) { + this.cancerStudyId = cancerStudyId; + } + + /** + * @return the chromosome + */ + public String getChromosome() { + return chromosome; + } + + /** + * @param chromosome the chromosome to set + */ + public void setChromosome(String chromosome) { + this.chromosome = chromosome; + } + + /** + * @return the peakStart + */ + public String getPeakStart() { + return peakStart; + } + + /** + * @param peakStart the peakStart to set + */ + public void setPeakStart(String peakStart) { + this.peakStart = peakStart; + } + + /** + * @return the peakEnd + */ + public String getPeakEnd() { + return peakEnd; + } + + /** + * @param peakEnd the peakEnd to set + */ + public void setPeakEnd(String peakEnd) { + this.peakEnd = peakEnd; + } + + /** + * @return the genesInRegion + */ + public String getGenesInRegion() { + return genesInRegion; + } + + /** + * @param genesInRegion the genesInRegion to set + */ + public void setGenesInRegion(String genesInRegion) { + this.genesInRegion = genesInRegion; + } + + /** + * @return the qValue (note the lower-case q in the accessor name; "qValue" is also the field name used in getGisticStagingDataMap()) + */ + public String getqValue() { + return qValue; + } + + /** + * @param qValue the qValue to set + */ + public void setqValue(String qValue) { + this.qValue = qValue; + } + + /** + * @return the cytoband + */ + public String getCytoband() { +
return cytoband; + } + + /** + * @param cytoband the cytoband to set + */ + public void setCytoband(String cytoband) { + this.cytoband = cytoband; + } + + /** + * @return the amp + */ + public String getAmp() { + return amp; + } + + /** + * @param amp the amp to set + */ + public void setAmp(String amp) { + this.amp = amp; + } + + /** + * @return a newly built gistic staging data map (column -> field); cancerStudyId has no column entry + */ + public Map getGisticStagingDataMap() { + Map map = new HashMap<>(); + map.put("chromosome", "chromosome"); + map.put("peak_start", "peakStart"); + map.put("peak_end", "peakEnd"); + map.put("genes_in_region", "genesInRegion"); + map.put("q_value", "qValue"); + map.put("cytoband", "cytoband"); + map.put("amp", "amp"); + + return map; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/model/MafRecord.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/model/MafRecord.java new file mode 100644 index 0000000..e2974e3 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/model/MafRecord.java @@ -0,0 +1,1431 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal.
+ * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.cbio.portal.pipelines.importer.model; + +import java.util.*; + +/** + * + * @author ochoaa + */ +public class MafRecord { + + private Integer sampleId; + private Integer geneticProfileId; + private String hugoSymbol; + private String entrezGeneId; + private String center; + private String ncbiBuild; + private String chromosome; + private String startPosition; + private String endPosition; + private String strand; + private String variantClassification; + private String variantType; + private String referenceAllele; + private String tumorSeqAllele1; + private String tumorSeqAllele2; + private String dbsnpRs; + private String dbsnpValStatus; + private String tumorSampleBarcode; + private String matchedNormSampleBarcode; + private String matchNormSeqAllele1; + private String matchNormSeqAllele2; + private String tumorValidationAllele1; + private String tumorValidationAllele2; + private String matchNormValidationAllele1; + private String matchNormValidationAllele2; + private String verificationStatus; + private String validationStatus; + private String mutationStatus; + private String sequencingPhase; + private String sequenceSource; + private String validationMethod; + private String score; + private String bamFile; + private String sequencer; + private String aminoAcidChange; + private String transcript; + private String tRefCount; + private String 
tAltCount; + private String nRefCount; + private String nAltCount; + private String tTotCov; + private String tVarCov; + private String nTotCov; + private String nVarCov; + private String tumorDepth; + private String tumorVaf; + private String normalDepth; + private String normalVaf; + private String hgvspShort; + private String codons; + private String swissprot; + private String refseq; + private String proteinPosition; + private String oncotatorCosmicOverlapping; + private String oncotatorDbsnpRs; + private String oncotatorDbsnpValStatus; + private String oncotatorProteinChange; + private String oncotatorVariantClassification; + private String oncotatorGeneSymbol; + private String oncotatorRefseqMrnaId; + private String oncotatorRefseqProtId; + private String oncotatorUniprotEntryName; + private String oncotatorUniprotAccession; + private String oncotatorCodonChange; + private String oncotatorTranscriptChange; + private String oncotatorExonAffected; + private String oncotatorProteinPosStart; + private String oncotatorProteinPosEnd; + private String oncotatorProteinChangeBe; + private String oncotatorVariantClassificationBe; + private String oncotatorGeneSymbolBe; + private String oncotatorRefseqMrnaIdBe; + private String oncotatorRefseqProtIdBe; + private String oncotatorUniprotEntryNameBe; + private String oncotatorUniprotAccessionBe; + private String oncotatorCodonChangeBe; + private String oncotatorTranscriptChangeBe; + private String oncotatorExonAffectedBe; + private String oncotatorProteinPosStartBe; + private String oncotatorProteinPosEndBe; + private String maFimpact; + private String maFis; + private String maLinkVar; + private String maLinkMsa; + private String maLinkPdb; + private String maProteinChange; + + /** + * @return the sampleId + */ + public Integer getSampleId() { + return sampleId; + } + + /** + * @param sampleId the sampleId to set + */ + public void setSampleId(Integer sampleId) { + this.sampleId = sampleId; + } + + /** + * @return the 
geneticProfileId + */ + public Integer getGeneticProfileId() { + return geneticProfileId; + } + + /** + * @param geneticProfileId the geneticProfileId to set + */ + public void setGeneticProfileId(Integer geneticProfileId) { + this.geneticProfileId = geneticProfileId; + } + + /** + * @return the hugoSymbol + */ + public String getHugoSymbol() { + return hugoSymbol; + } + + /** + * @param hugoSymbol the hugoSymbol to set + */ + public void setHugoSymbol(String hugoSymbol) { + this.hugoSymbol = hugoSymbol; + } + + /** + * @return the entrezGeneId + */ + public String getEntrezGeneId() { + return entrezGeneId; + } + + /** + * @param entrezGeneId the entrezGeneId to set + */ + public void setEntrezGeneId(String entrezGeneId) { + this.entrezGeneId = entrezGeneId; + } + + /** + * @return the center + */ + public String getCenter() { + return center; + } + + /** + * @param center the center to set + */ + public void setCenter(String center) { + this.center = center; + } + + /** + * @return the ncbiBuild + */ + public String getNcbiBuild() { + return ncbiBuild; + } + + /** + * @param ncbiBuild the ncbiBuild to set + */ + public void setNcbiBuild(String ncbiBuild) { + this.ncbiBuild = ncbiBuild; + } + + /** + * @return the chromosome + */ + public String getChromosome() { + return chromosome; + } + + /** + * @param chromosome the chromosome to set + */ + public void setChromosome(String chromosome) { + this.chromosome = chromosome; + } + + /** + * @return the startPosition + */ + public String getStartPosition() { + return startPosition; + } + + /** + * @param startPosition the startPosition to set + */ + public void setStartPosition(String startPosition) { + this.startPosition = startPosition; + } + + /** + * @return the endPosition + */ + public String getEndPosition() { + return endPosition; + } + + /** + * @param endPosition the endPosition to set + */ + public void setEndPosition(String endPosition) { + this.endPosition = endPosition; + } + + /** + * @return the strand 
+ */ + public String getStrand() { + return strand; + } + + /** + * @param strand the strand to set + */ + public void setStrand(String strand) { + this.strand = strand; + } + + /** + * @return the variantClassification + */ + public String getVariantClassification() { + return variantClassification; + } + + /** + * @param variantClassification the variantClassification to set + */ + public void setVariantClassification(String variantClassification) { + this.variantClassification = variantClassification; + } + + /** + * @return the variantType + */ + public String getVariantType() { + return variantType; + } + + /** + * @param variantType the variantType to set + */ + public void setVariantType(String variantType) { + this.variantType = variantType; + } + + /** + * @return the referenceAllele + */ + public String getReferenceAllele() { + return referenceAllele; + } + + /** + * @param referenceAllele the referenceAllele to set + */ + public void setReferenceAllele(String referenceAllele) { + this.referenceAllele = referenceAllele; + } + + /** + * @return the tumorSeqAllele1 + */ + public String getTumorSeqAllele1() { + return tumorSeqAllele1; + } + + /** + * @param tumorSeqAllele1 the tumorSeqAllele1 to set + */ + public void setTumorSeqAllele1(String tumorSeqAllele1) { + this.tumorSeqAllele1 = tumorSeqAllele1; + } + + /** + * @return the tumorSeqAllele2 + */ + public String getTumorSeqAllele2() { + return tumorSeqAllele2; + } + + /** + * @param tumorSeqAllele2 the tumorSeqAllele2 to set + */ + public void setTumorSeqAllele2(String tumorSeqAllele2) { + this.tumorSeqAllele2 = tumorSeqAllele2; + } + + /** + * @return the dbsnpRs + */ + public String getDbsnpRs() { + return dbsnpRs; + } + + /** + * @param dbsnpRs the dbsnpRs to set + */ + public void setDbsnpRs(String dbsnpRs) { + this.dbsnpRs = dbsnpRs; + } + + /** + * @return the dbsnpValStatus + */ + public String getDbsnpValStatus() { + return dbsnpValStatus; + } + + /** + * @param dbsnpValStatus the dbsnpValStatus 
to set + */ + public void setDbsnpValStatus(String dbsnpValStatus) { + this.dbsnpValStatus = dbsnpValStatus; + } + + /** + * @return the tumorSampleBarcode + */ + public String getTumorSampleBarcode() { + return tumorSampleBarcode; + } + + /** + * @param tumorSampleBarcode the tumorSampleBarcode to set + */ + public void setTumorSampleBarcode(String tumorSampleBarcode) { + this.tumorSampleBarcode = tumorSampleBarcode; + } + + /** + * @return the matchedNormSampleBarcode + */ + public String getMatchedNormSampleBarcode() { + return matchedNormSampleBarcode; + } + + /** + * @param matchedNormSampleBarcode the matchedNormSampleBarcode to set + */ + public void setMatchedNormSampleBarcode(String matchedNormSampleBarcode) { + this.matchedNormSampleBarcode = matchedNormSampleBarcode; + } + + /** + * @return the matchNormSeqAllele1 + */ + public String getMatchNormSeqAllele1() { + return matchNormSeqAllele1; + } + + /** + * @param matchNormSeqAllele1 the matchNormSeqAllele1 to set + */ + public void setMatchNormSeqAllele1(String matchNormSeqAllele1) { + this.matchNormSeqAllele1 = matchNormSeqAllele1; + } + + /** + * @return the matchNormSeqAllele2 + */ + public String getMatchNormSeqAllele2() { + return matchNormSeqAllele2; + } + + /** + * @param matchNormSeqAllele2 the matchNormSeqAllele2 to set + */ + public void setMatchNormSeqAllele2(String matchNormSeqAllele2) { + this.matchNormSeqAllele2 = matchNormSeqAllele2; + } + + /** + * @return the tumorValidationAllele1 + */ + public String getTumorValidationAllele1() { + return tumorValidationAllele1; + } + + /** + * @param tumorValidationAllele1 the tumorValidationAllele1 to set + */ + public void setTumorValidationAllele1(String tumorValidationAllele1) { + this.tumorValidationAllele1 = tumorValidationAllele1; + } + + /** + * @return the tumorValidationAllele2 + */ + public String getTumorValidationAllele2() { + return tumorValidationAllele2; + } + + /** + * @param tumorValidationAllele2 the tumorValidationAllele2 to set + 
*/ + public void setTumorValidationAllele2(String tumorValidationAllele2) { + this.tumorValidationAllele2 = tumorValidationAllele2; + } + + /** + * @return the matchNormValidationAllele1 + */ + public String getMatchNormValidationAllele1() { + return matchNormValidationAllele1; + } + + /** + * @param matchNormValidationAllele1 the matchNormValidationAllele1 to set + */ + public void setMatchNormValidationAllele1(String matchNormValidationAllele1) { + this.matchNormValidationAllele1 = matchNormValidationAllele1; + } + + /** + * @return the matchNormValidationAllele2 + */ + public String getMatchNormValidationAllele2() { + return matchNormValidationAllele2; + } + + /** + * @param matchNormValidationAllele2 the matchNormValidationAllele2 to set + */ + public void setMatchNormValidationAllele2(String matchNormValidationAllele2) { + this.matchNormValidationAllele2 = matchNormValidationAllele2; + } + + /** + * @return the verificationStatus + */ + public String getVerificationStatus() { + return verificationStatus; + } + + /** + * @param verificationStatus the verificationStatus to set + */ + public void setVerificationStatus(String verificationStatus) { + this.verificationStatus = verificationStatus; + } + + /** + * @return the validationStatus + */ + public String getValidationStatus() { + return validationStatus; + } + + /** + * @param validationStatus the validationStatus to set + */ + public void setValidationStatus(String validationStatus) { + this.validationStatus = validationStatus; + } + + /** + * @return the mutationStatus + */ + public String getMutationStatus() { + return mutationStatus; + } + + /** + * @param mutationStatus the mutationStatus to set + */ + public void setMutationStatus(String mutationStatus) { + this.mutationStatus = mutationStatus; + } + + /** + * @return the sequencingPhase + */ + public String getSequencingPhase() { + return sequencingPhase; + } + + /** + * @param sequencingPhase the sequencingPhase to set + */ + public void 
setSequencingPhase(String sequencingPhase) { + this.sequencingPhase = sequencingPhase; + } + + /** + * @return the sequenceSource + */ + public String getSequenceSource() { + return sequenceSource; + } + + /** + * @param sequenceSource the sequenceSource to set + */ + public void setSequenceSource(String sequenceSource) { + this.sequenceSource = sequenceSource; + } + + /** + * @return the validationMethod + */ + public String getValidationMethod() { + return validationMethod; + } + + /** + * @param validationMethod the validationMethod to set + */ + public void setValidationMethod(String validationMethod) { + this.validationMethod = validationMethod; + } + + /** + * @return the score + */ + public String getScore() { + return score; + } + + /** + * @param score the score to set + */ + public void setScore(String score) { + this.score = score; + } + + /** + * @return the bamFile + */ + public String getBamFile() { + return bamFile; + } + + /** + * @param bamFile the bamFile to set + */ + public void setBamFile(String bamFile) { + this.bamFile = bamFile; + } + + /** + * @return the sequencer + */ + public String getSequencer() { + return sequencer; + } + + /** + * @param sequencer the sequencer to set + */ + public void setSequencer(String sequencer) { + this.sequencer = sequencer; + } + + /** + * @return the aminoAcidChange + */ + public String getAminoAcidChange() { + return aminoAcidChange; + } + + /** + * @param aminoAcidChange the aminoAcidChange to set + */ + public void setAminoAcidChange(String aminoAcidChange) { + this.aminoAcidChange = aminoAcidChange; + } + + /** + * @return the transcript + */ + public String getTranscript() { + return transcript; + } + + /** + * @param transcript the transcript to set + */ + public void setTranscript(String transcript) { + this.transcript = transcript; + } + + /** + * @return the tRefCount + */ + public String gettRefCount() { + return tRefCount; + } + + /** + * @param tRefCount the tRefCount to set + */ + public void 
settRefCount(String tRefCount) { + this.tRefCount = tRefCount; + } + + /** + * @return the tAltCount + */ + public String gettAltCount() { + return tAltCount; + } + + /** + * @param tAltCount the tAltCount to set + */ + public void settAltCount(String tAltCount) { + this.tAltCount = tAltCount; + } + + /** + * @return the nRefCount + */ + public String getnRefCount() { + return nRefCount; + } + + /** + * @param nRefCount the nRefCount to set + */ + public void setnRefCount(String nRefCount) { + this.nRefCount = nRefCount; + } + + /** + * @return the nAltCount + */ + public String getnAltCount() { + return nAltCount; + } + + /** + * @param nAltCount the nAltCount to set + */ + public void setnAltCount(String nAltCount) { + this.nAltCount = nAltCount; + } + + /** + * @return the tTotCov + */ + public String gettTotCov() { + return tTotCov; + } + + /** + * @param tTotCov the tTotCov to set + */ + public void settTotCov(String tTotCov) { + this.tTotCov = tTotCov; + } + + /** + * @return the tVarCov + */ + public String gettVarCov() { + return tVarCov; + } + + /** + * @param tVarCov the tVarCov to set + */ + public void settVarCov(String tVarCov) { + this.tVarCov = tVarCov; + } + + /** + * @return the nTotCov + */ + public String getnTotCov() { + return nTotCov; + } + + /** + * @param nTotCov the nTotCov to set + */ + public void setnTotCov(String nTotCov) { + this.nTotCov = nTotCov; + } + + /** + * @return the nVarCov + */ + public String getnVarCov() { + return nVarCov; + } + + /** + * @param nVarCov the nVarCov to set + */ + public void setnVarCov(String nVarCov) { + this.nVarCov = nVarCov; + } + + /** + * @return the tumorDepth + */ + public String getTumorDepth() { + return tumorDepth; + } + + /** + * @param tumorDepth the tumorDepth to set + */ + public void setTumorDepth(String tumorDepth) { + this.tumorDepth = tumorDepth; + } + + /** + * @return the tumorVaf + */ + public String getTumorVaf() { + return tumorVaf; + } + + /** + * @param tumorVaf the tumorVaf to 
set + */ + public void setTumorVaf(String tumorVaf) { + this.tumorVaf = tumorVaf; + } + + /** + * @return the normalDepth + */ + public String getNormalDepth() { + return normalDepth; + } + + /** + * @param normalDepth the normalDepth to set + */ + public void setNormalDepth(String normalDepth) { + this.normalDepth = normalDepth; + } + + /** + * @return the normalVaf + */ + public String getNormalVaf() { + return normalVaf; + } + + /** + * @param normalVaf the normalVaf to set + */ + public void setNormalVaf(String normalVaf) { + this.normalVaf = normalVaf; + } + + /** + * @return the hgvspShort + */ + public String getHgvspShort() { + return hgvspShort; + } + + /** + * @param hgvspShort the hgvspShort to set + */ + public void setHgvspShort(String hgvspShort) { + this.hgvspShort = hgvspShort; + } + + /** + * @return the codons + */ + public String getCodons() { + return codons; + } + + /** + * @param codons the codons to set + */ + public void setCodons(String codons) { + this.codons = codons; + } + + /** + * @return the swissprot + */ + public String getSwissprot() { + return swissprot; + } + + /** + * @param swissprot the swissprot to set + */ + public void setSwissprot(String swissprot) { + this.swissprot = swissprot; + } + + /** + * @return the refseq + */ + public String getRefseq() { + return refseq; + } + + /** + * @param refseq the refseq to set + */ + public void setRefseq(String refseq) { + this.refseq = refseq; + } + + /** + * @return the proteinPosition + */ + public String getProteinPosition() { + return proteinPosition; + } + + /** + * @param proteinPosition the proteinPosition to set + */ + public void setProteinPosition(String proteinPosition) { + this.proteinPosition = proteinPosition; + } + + /** + * @return the oncotatorCosmicOverlapping + */ + public String getOncotatorCosmicOverlapping() { + return oncotatorCosmicOverlapping; + } + + /** + * @param oncotatorCosmicOverlapping the oncotatorCosmicOverlapping to set + */ + public void 
setOncotatorCosmicOverlapping(String oncotatorCosmicOverlapping) { + this.oncotatorCosmicOverlapping = oncotatorCosmicOverlapping; + } + + /** + * @return the oncotatorDbsnpRs + */ + public String getOncotatorDbsnpRs() { + return oncotatorDbsnpRs; + } + + /** + * @param oncotatorDbsnpRs the oncotatorDbsnpRs to set + */ + public void setOncotatorDbsnpRs(String oncotatorDbsnpRs) { + this.oncotatorDbsnpRs = oncotatorDbsnpRs; + } + + /** + * @return the oncotatorDbsnpValStatus + */ + public String getOncotatorDbsnpValStatus() { + return oncotatorDbsnpValStatus; + } + + /** + * @param oncotatorDbsnpValStatus the oncotatorDbsnpValStatus to set + */ + public void setOncotatorDbsnpValStatus(String oncotatorDbsnpValStatus) { + this.oncotatorDbsnpValStatus = oncotatorDbsnpValStatus; + } + + /** + * @return the oncotatorProteinChange + */ + public String getOncotatorProteinChange() { + return oncotatorProteinChange; + } + + /** + * @param oncotatorProteinChange the oncotatorProteinChange to set + */ + public void setOncotatorProteinChange(String oncotatorProteinChange) { + this.oncotatorProteinChange = oncotatorProteinChange; + } + + /** + * @return the oncotatorVariantClassification + */ + public String getOncotatorVariantClassification() { + return oncotatorVariantClassification; + } + + /** + * @param oncotatorVariantClassification the oncotatorVariantClassification to set + */ + public void setOncotatorVariantClassification(String oncotatorVariantClassification) { + this.oncotatorVariantClassification = oncotatorVariantClassification; + } + + /** + * @return the oncotatorGeneSymbol + */ + public String getOncotatorGeneSymbol() { + return oncotatorGeneSymbol; + } + + /** + * @param oncotatorGeneSymbol the oncotatorGeneSymbol to set + */ + public void setOncotatorGeneSymbol(String oncotatorGeneSymbol) { + this.oncotatorGeneSymbol = oncotatorGeneSymbol; + } + + /** + * @return the oncotatorRefseqMrnaId + */ + public String getOncotatorRefseqMrnaId() { + return 
oncotatorRefseqMrnaId; + } + + /** + * @param oncotatorRefseqMrnaId the oncotatorRefseqMrnaId to set + */ + public void setOncotatorRefseqMrnaId(String oncotatorRefseqMrnaId) { + this.oncotatorRefseqMrnaId = oncotatorRefseqMrnaId; + } + + /** + * @return the oncotatorRefseqProtId + */ + public String getOncotatorRefseqProtId() { + return oncotatorRefseqProtId; + } + + /** + * @param oncotatorRefseqProtId the oncotatorRefseqProtId to set + */ + public void setOncotatorRefseqProtId(String oncotatorRefseqProtId) { + this.oncotatorRefseqProtId = oncotatorRefseqProtId; + } + + /** + * @return the oncotatorUniprotEntryName + */ + public String getOncotatorUniprotEntryName() { + return oncotatorUniprotEntryName; + } + + /** + * @param oncotatorUniprotEntryName the oncotatorUniprotEntryName to set + */ + public void setOncotatorUniprotEntryName(String oncotatorUniprotEntryName) { + this.oncotatorUniprotEntryName = oncotatorUniprotEntryName; + } + + /** + * @return the oncotatorUniprotAccession + */ + public String getOncotatorUniprotAccession() { + return oncotatorUniprotAccession; + } + + /** + * @param oncotatorUniprotAccession the oncotatorUniprotAccession to set + */ + public void setOncotatorUniprotAccession(String oncotatorUniprotAccession) { + this.oncotatorUniprotAccession = oncotatorUniprotAccession; + } + + /** + * @return the oncotatorCodonChange + */ + public String getOncotatorCodonChange() { + return oncotatorCodonChange; + } + + /** + * @param oncotatorCodonChange the oncotatorCodonChange to set + */ + public void setOncotatorCodonChange(String oncotatorCodonChange) { + this.oncotatorCodonChange = oncotatorCodonChange; + } + + /** + * @return the oncotatorTranscriptChange + */ + public String getOncotatorTranscriptChange() { + return oncotatorTranscriptChange; + } + + /** + * @param oncotatorTranscriptChange the oncotatorTranscriptChange to set + */ + public void setOncotatorTranscriptChange(String oncotatorTranscriptChange) { + 
this.oncotatorTranscriptChange = oncotatorTranscriptChange; + } + + /** + * @return the oncotatorExonAffected + */ + public String getOncotatorExonAffected() { + return oncotatorExonAffected; + } + + /** + * @param oncotatorExonAffected the oncotatorExonAffected to set + */ + public void setOncotatorExonAffected(String oncotatorExonAffected) { + this.oncotatorExonAffected = oncotatorExonAffected; + } + + /** + * @return the oncotatorProteinPosStart + */ + public String getOncotatorProteinPosStart() { + return oncotatorProteinPosStart; + } + + /** + * @param oncotatorProteinPosStart the oncotatorProteinPosStart to set + */ + public void setOncotatorProteinPosStart(String oncotatorProteinPosStart) { + this.oncotatorProteinPosStart = oncotatorProteinPosStart; + } + + /** + * @return the oncotatorProteinPosEnd + */ + public String getOncotatorProteinPosEnd() { + return oncotatorProteinPosEnd; + } + + /** + * @param oncotatorProteinPosEnd the oncotatorProteinPosEnd to set + */ + public void setOncotatorProteinPosEnd(String oncotatorProteinPosEnd) { + this.oncotatorProteinPosEnd = oncotatorProteinPosEnd; + } + + /** + * @return the oncotatorProteinChangeBe + */ + public String getOncotatorProteinChangeBe() { + return oncotatorProteinChangeBe; + } + + /** + * @param oncotatorProteinChangeBe the oncotatorProteinChangeBe to set + */ + public void setOncotatorProteinChangeBe(String oncotatorProteinChangeBe) { + this.oncotatorProteinChangeBe = oncotatorProteinChangeBe; + } + + /** + * @return the oncotatorVariantClassificationBe + */ + public String getOncotatorVariantClassificationBe() { + return oncotatorVariantClassificationBe; + } + + /** + * @param oncotatorVariantClassificationBe the oncotatorVariantClassificationBe to set + */ + public void setOncotatorVariantClassificationBe(String oncotatorVariantClassificationBe) { + this.oncotatorVariantClassificationBe = oncotatorVariantClassificationBe; + } + + /** + * @return the oncotatorGeneSymbolBe + */ + public String 
getOncotatorGeneSymbolBe() { + return oncotatorGeneSymbolBe; + } + + /** + * @param oncotatorGeneSymbolBe the oncotatorGeneSymbolBe to set + */ + public void setOncotatorGeneSymbolBe(String oncotatorGeneSymbolBe) { + this.oncotatorGeneSymbolBe = oncotatorGeneSymbolBe; + } + + /** + * @return the oncotatorRefseqMrnaIdBe + */ + public String getOncotatorRefseqMrnaIdBe() { + return oncotatorRefseqMrnaIdBe; + } + + /** + * @param oncotatorRefseqMrnaIdBe the oncotatorRefseqMrnaIdBe to set + */ + public void setOncotatorRefseqMrnaIdBe(String oncotatorRefseqMrnaIdBe) { + this.oncotatorRefseqMrnaIdBe = oncotatorRefseqMrnaIdBe; + } + + /** + * @return the oncotatorRefseqProtIdBe + */ + public String getOncotatorRefseqProtIdBe() { + return oncotatorRefseqProtIdBe; + } + + /** + * @param oncotatorRefseqProtIdBe the oncotatorRefseqProtIdBe to set + */ + public void setOncotatorRefseqProtIdBe(String oncotatorRefseqProtIdBe) { + this.oncotatorRefseqProtIdBe = oncotatorRefseqProtIdBe; + } + + /** + * @return the oncotatorUniprotEntryNameBe + */ + public String getOncotatorUniprotEntryNameBe() { + return oncotatorUniprotEntryNameBe; + } + + /** + * @param oncotatorUniprotEntryNameBe the oncotatorUniprotEntryNameBe to set + */ + public void setOncotatorUniprotEntryNameBe(String oncotatorUniprotEntryNameBe) { + this.oncotatorUniprotEntryNameBe = oncotatorUniprotEntryNameBe; + } + + /** + * @return the oncotatorUniprotAccessionBe + */ + public String getOncotatorUniprotAccessionBe() { + return oncotatorUniprotAccessionBe; + } + + /** + * @param oncotatorUniprotAccessionBe the oncotatorUniprotAccessionBe to set + */ + public void setOncotatorUniprotAccessionBe(String oncotatorUniprotAccessionBe) { + this.oncotatorUniprotAccessionBe = oncotatorUniprotAccessionBe; + } + + /** + * @return the oncotatorCodonChangeBe + */ + public String getOncotatorCodonChangeBe() { + return oncotatorCodonChangeBe; + } + + /** + * @param oncotatorCodonChangeBe the oncotatorCodonChangeBe to set + */ + 
public void setOncotatorCodonChangeBe(String oncotatorCodonChangeBe) { + this.oncotatorCodonChangeBe = oncotatorCodonChangeBe; + } + + /** + * @return the oncotatorTranscriptChangeBe + */ + public String getOncotatorTranscriptChangeBe() { + return oncotatorTranscriptChangeBe; + } + + /** + * @param oncotatorTranscriptChangeBe the oncotatorTranscriptChangeBe to set + */ + public void setOncotatorTranscriptChangeBe(String oncotatorTranscriptChangeBe) { + this.oncotatorTranscriptChangeBe = oncotatorTranscriptChangeBe; + } + + /** + * @return the oncotatorExonAffectedBe + */ + public String getOncotatorExonAffectedBe() { + return oncotatorExonAffectedBe; + } + + /** + * @param oncotatorExonAffectedBe the oncotatorExonAffectedBe to set + */ + public void setOncotatorExonAffectedBe(String oncotatorExonAffectedBe) { + this.oncotatorExonAffectedBe = oncotatorExonAffectedBe; + } + + /** + * @return the oncotatorProteinPosStartBe + */ + public String getOncotatorProteinPosStartBe() { + return oncotatorProteinPosStartBe; + } + + /** + * @param oncotatorProteinPosStartBe the oncotatorProteinPosStartBe to set + */ + public void setOncotatorProteinPosStartBe(String oncotatorProteinPosStartBe) { + this.oncotatorProteinPosStartBe = oncotatorProteinPosStartBe; + } + + /** + * @return the oncotatorProteinPosEndBe + */ + public String getOncotatorProteinPosEndBe() { + return oncotatorProteinPosEndBe; + } + + /** + * @param oncotatorProteinPosEndBe the oncotatorProteinPosEndBe to set + */ + public void setOncotatorProteinPosEndBe(String oncotatorProteinPosEndBe) { + this.oncotatorProteinPosEndBe = oncotatorProteinPosEndBe; + } + + /** + * @return the maFimpact + */ + public String getMaFimpact() { + return maFimpact; + } + + /** + * @param maFimpact the maFimpact to set + */ + public void setMaFimpact(String maFimpact) { + this.maFimpact = maFimpact; + } + + /** + * @return the maFis + */ + public String getMaFis() { + return maFis; + } + + /** + * @param maFis the maFis to set + */ + 
public void setMaFis(String maFis) { + this.maFis = maFis; + } + + /** + * @return the maLinkVar + */ + public String getMaLinkVar() { + return maLinkVar; + } + + /** + * @param maLinkVar the maLinkVar to set + */ + public void setMaLinkVar(String maLinkVar) { + this.maLinkVar = maLinkVar; + } + + /** + * @return the maLinkMsa + */ + public String getMaLinkMsa() { + return maLinkMsa; + } + + /** + * @param maLinkMsa the maLinkMsa to set + */ + public void setMaLinkMsa(String maLinkMsa) { + this.maLinkMsa = maLinkMsa; + } + + /** + * @return the maLinkPdb + */ + public String getMaLinkPdb() { + return maLinkPdb; + } + + /** + * @param maLinkPdb the maLinkPdb to set + */ + public void setMaLinkPdb(String maLinkPdb) { + this.maLinkPdb = maLinkPdb; + } + + /** + * @return the maProteinChange + */ + public String getMaProteinChange() { + return maProteinChange; + } + + /** + * @param maProteinChange the maProteinChange to set + */ + public void setMaProteinChange(String maProteinChange) { + this.maProteinChange = maProteinChange; + } + + /** + * @return the MAF staging data map (column -> field) + */ + public Map getMafStagingDataMap() { + Map map = new HashMap<>(); + map.put("Hugo_Symbol", "hugoSymbol"); + map.put("Entrez_Gene_Id", "entrezGeneId"); + map.put("Center", "center"); + map.put("NCBI_Build", "ncbiBuild"); + map.put("Chromosome", "chromosome"); + map.put("Start_Position", "startPosition"); + map.put("End_Position", "endPosition"); + map.put("Strand", "strand"); + map.put("Variant_Classification", "variantClassification"); + map.put("Variant_Type", "variantType"); + map.put("Reference_Allele", "referenceAllele"); + map.put("Tumor_Seq_Allele1", "tumorSeqAllele1"); + map.put("Tumor_Seq_Allele2", "tumorSeqAllele2"); + map.put("dbSNP_RS", "dbsnpRs"); + map.put("dbSNP_Val_Status", "dbsnpValStatus"); + map.put("Tumor_Sample_Barcode", "tumorSampleBarcode"); + map.put("Matched_Norm_Sample_Barcode", "matchedNormSampleBarcode"); + map.put("Match_Norm_Seq_Allele1", 
"matchNormSeqAllele1"); + map.put("Match_Norm_Seq_Allele2", "matchNormSeqAllele2"); + map.put("Tumor_Validation_Allele1", "tumorValidationAllele1"); + map.put("Tumor_Validation_Allele2", "tumorValidationAllele2"); + map.put("Match_Norm_Validation_Allele1", "matchNormValidationAllele1"); + map.put("Match_Norm_Validation_Allele2", "matchNormValidationAllele2"); + map.put("Verification_Status", "verificationStatus"); + map.put("Validation_Status", "validationStatus"); + map.put("Mutation_Status", "mutationStatus"); + map.put("Sequencing_Phase", "sequencingPhase"); + map.put("Sequence_Source", "sequenceSource"); + map.put("Validation_Method", "validationMethod"); + map.put("Score", "score"); + map.put("BAM_File", "bamFile"); + map.put("Sequencer", "sequencer"); + map.put("Amino_Acid_Change", "aminoAcidChange"); + map.put("Transcript", "transcript"); + map.put("t_ref_count", "tRefCount"); + map.put("t_alt_count", "tAltCount"); + map.put("n_ref_count", "nRefCount"); + map.put("n_alt_count", "nAltCount"); + map.put("TTotCov", "tTotCov"); + map.put("TVarCov", "tVarCov"); + map.put("NTotCov", "nTotCov"); + map.put("NVarCov", "nVarCov"); + map.put("normal_depth", "normalDepth"); + map.put("normal_vaf", "normalVaf"); + map.put("HGVSp_Short", "hgvspShort"); + map.put("Codons", "codons"); + map.put("SWISSPROT", "swissprot"); + map.put("RefSeq", "refseq"); + map.put("Protein_position", "proteinPosition"); + map.put("ONCOTATOR_COSMIC_OVERLAPPING", "oncotatorCosmicOverlapping"); + map.put("ONCOTATOR_DBSNP_RS", "oncotatorDbsnpRs"); + map.put("ONCOTATOR_DBSNP_VAL_STATUS", "oncotatorDbsnpValStatus"); + map.put("ONCOTATOR_PROTEIN_CHANGE", "oncotatorProteinChange"); + map.put("ONCOTATOR_VARIANT_CLASSIFICATION", "oncotatorVariantClassification"); + map.put("ONCOTATOR_GENE_SYMBOL", "oncotatorGeneSymbol"); + map.put("ONCOTATOR_REFSEQ_MRNA_ID", "oncotatorRefseqMrnaId"); + map.put("ONCOTATOR_REFSEQ_PROT_ID", "oncotatorRefseqProtId"); + map.put("ONCOTATOR_UNIPROT_ENTRY_NAME", 
"oncotatorUniprotEntryName"); + map.put("ONCOTATOR_UNIPROT_ACCESSION", "oncotatorUniprotAccession"); + map.put("ONCOTATOR_CODON_CHANGE", "oncotatorCodonChange"); + map.put("ONCOTATOR_TRANSCRIPT_CHANGE", "oncotatorTranscriptChange"); + map.put("ONCOTATOR_EXON_AFFECTED", "oncotatorExonAffected"); + map.put("ONCOTATOR_PROTEIN_POS_START", "oncotatorProteinPosStart"); + map.put("ONCOTATOR_PROTEIN_POS_END", "oncotatorProteinPosEnd"); + map.put("ONCOTATOR_PROTEIN_CHANGE_BEST_EFFECT", "oncotatorProteinChangeBe"); + map.put("ONCOTATOR_VARIANT_CLASSIFICATION_BEST_EFFECT", "oncotatorVariantClassificationBe"); + map.put("ONCOTATOR_GENE_SYMBOL_BEST_EFFECT", "oncotatorGeneSymbolBe"); + map.put("ONCOTATOR_REFSEQ_MRNA_ID_BEST_EFFECT", "oncotatorRefseqMrnaIdBe"); + map.put("ONCOTATOR_REFSEQ_PROT_ID_BEST_EFFECT", "oncotatorRefseqProtIdBe"); + map.put("ONCOTATOR_UNIPROT_ENTRY_NAME_BEST_EFFECT", "oncotatorUniprotEntryNameBe"); + map.put("ONCOTATOR_UNIPROT_ACCESSION_BEST_EFFECT", "oncotatorUniprotAccessionBe"); + map.put("ONCOTATOR_CODON_CHANGE_BEST_EFFECT", "oncotatorCodonChangeBe"); + map.put("ONCOTATOR_TRANSCRIPT_CHANGE_BEST_EFFECT", "oncotatorTranscriptChangeBe"); + map.put("ONCOTATOR_EXON_AFFECTED_BEST_EFFECT", "oncotatorExonAffectedBe"); + map.put("ONCOTATOR_PROTEIN_POS_START_BEST_EFFECT", "oncotatorProteinPosStartBe"); + map.put("ONCOTATOR_PROTEIN_POS_END_BEST_EFFECT", "oncotatorProteinPosEndBe"); + map.put("MA:FImpact", "maFimpact"); + map.put("MA:FIS", "maFis"); + map.put("MA:link.var", "maLinkVar"); + map.put("MA:link.MSA", "maLinkMsa"); + map.put("MA:link.PDB", "maLinkPdb"); + map.put("MA:protein.change", "maProteinChange"); + + // these columns have multiple possible header names + map.put("t_depth", "tumorDepth"); + map.put("t_vaf", "tumorVaf"); + map.put("tumor_depth", "tumorDepth"); + map.put("tumor_vaf", "tumorVaf"); + + return map; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/model/MutSigRecord.java 
b/importer/src/main/java/org/cbio/portal/pipelines/importer/model/MutSigRecord.java new file mode 100644 index 0000000..0e9132b --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/model/MutSigRecord.java @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
/**
 * Mutable bean holding one MutSig record.
 *
 * <p>Apart from {@code cancerStudyId}, every property is stored as a
 * {@code String}; this class performs no parsing or validation of the values
 * it is given. All properties are {@code null} until their setter is called.
 *
 * @author ochoaa
 */
public class MutSigRecord {

    private Integer cancerStudyId;
    private String rank;
    private String hugoSymbol;
    private String numBasesCovered;
    private String numMutations;
    private String pValue;
    private String qValue;

    /**
     * @return the cancerStudyId, or {@code null} if it has not been set
     */
    public Integer getCancerStudyId() {
        return cancerStudyId;
    }

    /**
     * @param cancerStudyId the cancerStudyId to set
     */
    public void setCancerStudyId(Integer cancerStudyId) {
        this.cancerStudyId = cancerStudyId;
    }

    /**
     * @return the rank, or {@code null} if it has not been set
     */
    public String getRank() {
        return rank;
    }

    /**
     * @param rank the rank to set
     */
    public void setRank(String rank) {
        this.rank = rank;
    }

    /**
     * @return the hugoSymbol, or {@code null} if it has not been set
     */
    public String getHugoSymbol() {
        return hugoSymbol;
    }

    /**
     * @param hugoSymbol the hugoSymbol to set
     */
    public void setHugoSymbol(String hugoSymbol) {
        this.hugoSymbol = hugoSymbol;
    }

    /**
     * @return the numBasesCovered, or {@code null} if it has not been set
     */
    public String getNumBasesCovered() {
        return numBasesCovered;
    }

    /**
     * @param numBasesCovered the numBasesCovered to set
     */
    public void setNumBasesCovered(String numBasesCovered) {
        this.numBasesCovered = numBasesCovered;
    }

    /**
     * @return the numMutations, or {@code null} if it has not been set
     */
    public String getNumMutations() {
        return numMutations;
    }

    /**
     * @param numMutations the numMutations to set
     */
    public void setNumMutations(String numMutations) {
        this.numMutations = numMutations;
    }

    /**
     * @return the pValue, or {@code null} if it has not been set
     */
    public String getpValue() {
        return pValue;
    }

    /**
     * @param pValue the pValue to set
     */
    public void setpValue(String pValue) {
        this.pValue = pValue;
    }

    /**
     * @return the qValue, or {@code null} if it has not been set
     */
    public String getqValue() {
        return qValue;
    }

    /**
     * @param qValue the qValue to set
     */
    public void setqValue(String qValue) {
        this.qValue = qValue;
    }

    /**
     * Builds the lookup from MutSig staging-file column header to the name of
     * the property of this class that holds that column's value.
     *
     * <p>A new {@link HashMap} is populated on every call, so the caller owns
     * the returned map and may modify it without affecting later calls. The
     * result does not depend on the state of this instance.
     *
     * @return the mutsig staging data map (column header -> field name)
     */
    public Map<String, String> getMutSigStagingDataMap() {
        Map<String, String> map = new HashMap<>();
        map.put("rank", "rank");
        map.put("gene", "hugoSymbol");
        map.put("p", "pValue");
        map.put("q", "qValue");

        // for mutsig columns that have multiple possible column names:
        // either header resolves to the same property
        map.put("N", "numBasesCovered");
        map.put("Nnon", "numBasesCovered");
        map.put("n", "numMutations");
        map.put("nnon", "numMutations");

        return map;
    }

}
/**
 * Mutable bean holding one profile data record.
 *
 * <p>This class only stores what it is given: no property is parsed,
 * validated or defensively copied. Object-typed properties are {@code null}
 * and the two boolean flags are {@code false} until their setter is called.
 *
 * <p>NOTE(review): the element types of {@code compositeGeneList},
 * {@code caseProfileDataMap} and {@code cnaEvents} are not visible here, so
 * those members are left as raw collection types to stay compatible with
 * existing callers - confirm the intended type arguments and parameterize them.
 *
 * @author ochoaa
 */
public class ProfileDataRecord {

    private Integer geneticProfileId;
    private String hugoSymbol;
    private String entrezGeneId;
    private String compositeElementRef;
    private String arrayId;
    private List compositeGeneList;
    private Map caseProfileDataMap;
    private List cnaEvents;
    private boolean rppaProfile;
    private boolean cnaData;

    /**
     * @return the geneticProfileId, or {@code null} if it has not been set
     */
    public Integer getGeneticProfileId() {
        return geneticProfileId;
    }

    /**
     * @param geneticProfileId the geneticProfileId to set
     */
    public void setGeneticProfileId(Integer geneticProfileId) {
        this.geneticProfileId = geneticProfileId;
    }

    /**
     * @return the hugoSymbol, or {@code null} if it has not been set
     */
    public String getHugoSymbol() {
        return hugoSymbol;
    }

    /**
     * @param hugoSymbol the hugoSymbol to set
     */
    public void setHugoSymbol(String hugoSymbol) {
        this.hugoSymbol = hugoSymbol;
    }

    /**
     * @return the entrezGeneId, or {@code null} if it has not been set
     */
    public String getEntrezGeneId() {
        return entrezGeneId;
    }

    /**
     * @param entrezGeneId the entrezGeneId to set
     */
    public void setEntrezGeneId(String entrezGeneId) {
        this.entrezGeneId = entrezGeneId;
    }

    /**
     * @return the compositeElementRef, or {@code null} if it has not been set
     */
    public String getCompositeElementRef() {
        return compositeElementRef;
    }

    /**
     * @param compositeElementRef the compositeElementRef to set
     */
    public void setCompositeElementRef(String compositeElementRef) {
        this.compositeElementRef = compositeElementRef;
    }

    /**
     * @return the arrayId, or {@code null} if it has not been set
     */
    public String getArrayId() {
        return arrayId;
    }

    /**
     * @param arrayId the arrayId to set
     */
    public void setArrayId(String arrayId) {
        this.arrayId = arrayId;
    }

    /**
     * @return the same list instance that was last passed to
     *         {@link #setCompositeGeneList(List)}, or {@code null} if none was
     */
    public List getCompositeGeneList() {
        return compositeGeneList;
    }

    /**
     * @param compositeGeneList the compositeGeneList to set (stored by reference)
     */
    public void setCompositeGeneList(List compositeGeneList) {
        this.compositeGeneList = compositeGeneList;
    }

    /**
     * @return the same map instance that was last passed to
     *         {@link #setCaseProfileDataMap(Map)}, or {@code null} if none was
     */
    public Map getCaseProfileDataMap() {
        return caseProfileDataMap;
    }

    /**
     * @param caseProfileDataMap the caseProfileDataMap to set (stored by reference)
     */
    public void setCaseProfileDataMap(Map caseProfileDataMap) {
        this.caseProfileDataMap = caseProfileDataMap;
    }

    /**
     * @return the same list instance that was last passed to
     *         {@link #setCnaEvents(List)}, or {@code null} if none was
     */
    public List getCnaEvents() {
        return cnaEvents;
    }

    /**
     * @param cnaEvents the cnaEvents to set (stored by reference)
     */
    public void setCnaEvents(List cnaEvents) {
        this.cnaEvents = cnaEvents;
    }

    /**
     * @return the rppaProfile flag ({@code false} unless set otherwise)
     */
    public boolean isRppaProfile() {
        return rppaProfile;
    }

    /**
     * @param rppaProfile the rppaProfile to set
     */
    public void setRppaProfile(boolean rppaProfile) {
        this.rppaProfile = rppaProfile;
    }

    /**
     * @return the cnaData flag ({@code false} unless set otherwise)
     */
    public boolean isCnaData() {
        return cnaData;
    }

    /**
     * @param cnaData the cnaData to set
     */
    public void setCnaData(boolean cnaData) {
        this.cnaData = cnaData;
    }

    /**
     * Builds the lookup from non-case-id column header to the name of the
     * property of this class that holds that column's value.
     *
     * <p>A new {@link HashMap} is populated on every call, so the caller owns
     * the returned map and may modify it without affecting later calls. The
     * result does not depend on the state of this instance.
     *
     * @return the profile data non-case id map (column header -> field name)
     */
    public Map<String, String> getNonCaseIdsMap() {
        Map<String, String> map = new HashMap<>();
        map.put("HUGO_SYMBOL", "hugoSymbol");
        map.put("ENTREZ_GENE_ID", "entrezGeneId");
        map.put("COMPOSITE.ELEMENT.REF", "compositeElementRef");

        return map;
    }

}
b/importer/src/main/java/org/cbio/portal/pipelines/importer/model/StructuralVariantRecord.java new file mode 100644 index 0000000..04419e3 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/model/StructuralVariantRecord.java @@ -0,0 +1,499 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +package org.cbio.portal.pipelines.importer.model; + +import java.util.*; + +/** + * + * @author ochoaa + */ +public class StructuralVariantRecord { + + private Integer sampleInternalId; + private Integer geneticProfileId; + private String sampleId; + private String annotation; + private String breakpointType; + private String comments; + private String confidenceClass; + private String connectionType; + private String eventInfo; + private String mapQ; + private String normalReadCount; + private String normalVariantCount; + private String pairedEndReadSupport; + private String site1Chrom; + private String site1Desc; + private String site1Gene; + private String site1Pos; + private String site2Chrom; + private String site2Desc; + private String site2Gene; + private String site2Pos; + private String splitReadSupport; + private String svClassName; + private String svDesc; + private String svLength; + private String tumorReadCount; + private String tumorVariantCount; + private String variantStatusName; + + /** + * @return the sampleInternalId + */ + public Integer getSampleInternalId() { + return sampleInternalId; + } + + /** + * @param sampleInternalId the sampleInternalId to set + */ + public void setSampleInternalId(Integer sampleInternalId) { + this.sampleInternalId = sampleInternalId; + } + + /** + * @return the geneticProfileId + */ + public Integer getGeneticProfileId() { + return geneticProfileId; + } + + /** + * @param geneticProfileId the geneticProfileId to set + */ + public void setGeneticProfileId(Integer geneticProfileId) { + this.geneticProfileId = geneticProfileId; + } + + /** + * @return the sampleId + */ + public String getSampleId() { + return sampleId; + } + + /** + * @param sampleId the sampleId to set + */ + public void setSampleId(String sampleId) { + this.sampleId = sampleId; + } + + /** + * @return the annotation + */ + public String getAnnotation() { + return annotation; + } + + /** + * @param annotation the annotation to set + */ + 
public void setAnnotation(String annotation) { + this.annotation = annotation; + } + + /** + * @return the breakpointType + */ + public String getBreakpointType() { + return breakpointType; + } + + /** + * @param breakpointType the breakpointType to set + */ + public void setBreakpointType(String breakpointType) { + this.breakpointType = breakpointType; + } + + /** + * @return the comments + */ + public String getComments() { + return comments; + } + + /** + * @param comments the comments to set + */ + public void setComments(String comments) { + this.comments = comments; + } + + /** + * @return the confidenceClass + */ + public String getConfidenceClass() { + return confidenceClass; + } + + /** + * @param confidenceClass the confidenceClass to set + */ + public void setConfidenceClass(String confidenceClass) { + this.confidenceClass = confidenceClass; + } + + /** + * @return the connectionType + */ + public String getConnectionType() { + return connectionType; + } + + /** + * @param connectionType the connectionType to set + */ + public void setConnectionType(String connectionType) { + this.connectionType = connectionType; + } + + /** + * @return the eventInfo + */ + public String getEventInfo() { + return eventInfo; + } + + /** + * @param eventInfo the eventInfo to set + */ + public void setEventInfo(String eventInfo) { + this.eventInfo = eventInfo; + } + + /** + * @return the mapQ + */ + public String getMapQ() { + return mapQ; + } + + /** + * @param mapQ the mapQ to set + */ + public void setMapQ(String mapQ) { + this.mapQ = mapQ; + } + + /** + * @return the normalReadCount + */ + public String getNormalReadCount() { + return normalReadCount; + } + + /** + * @param normalReadCount the normalReadCount to set + */ + public void setNormalReadCount(String normalReadCount) { + this.normalReadCount = normalReadCount; + } + + /** + * @return the normalVariantCount + */ + public String getNormalVariantCount() { + return normalVariantCount; + } + + /** + * @param 
normalVariantCount the normalVariantCount to set + */ + public void setNormalVariantCount(String normalVariantCount) { + this.normalVariantCount = normalVariantCount; + } + + /** + * @return the pairedEndReadSupport + */ + public String getPairedEndReadSupport() { + return pairedEndReadSupport; + } + + /** + * @param pairedEndReadSupport the pairedEndReadSupport to set + */ + public void setPairedEndReadSupport(String pairedEndReadSupport) { + this.pairedEndReadSupport = pairedEndReadSupport; + } + + /** + * @return the site1Chrom + */ + public String getSite1Chrom() { + return site1Chrom; + } + + /** + * @param site1Chrom the site1Chrom to set + */ + public void setSite1Chrom(String site1Chrom) { + this.site1Chrom = site1Chrom; + } + + /** + * @return the site1Desc + */ + public String getSite1Desc() { + return site1Desc; + } + + /** + * @param site1Desc the site1Desc to set + */ + public void setSite1Desc(String site1Desc) { + this.site1Desc = site1Desc; + } + + /** + * @return the site1Gene + */ + public String getSite1Gene() { + return site1Gene; + } + + /** + * @param site1Gene the site1Gene to set + */ + public void setSite1Gene(String site1Gene) { + this.site1Gene = site1Gene; + } + + /** + * @return the site1Pos + */ + public String getSite1Pos() { + return site1Pos; + } + + /** + * @param site1Pos the site1Pos to set + */ + public void setSite1Pos(String site1Pos) { + this.site1Pos = site1Pos; + } + + /** + * @return the site2Chrom + */ + public String getSite2Chrom() { + return site2Chrom; + } + + /** + * @param site2Chrom the site2Chrom to set + */ + public void setSite2Chrom(String site2Chrom) { + this.site2Chrom = site2Chrom; + } + + /** + * @return the site2Desc + */ + public String getSite2Desc() { + return site2Desc; + } + + /** + * @param site2Desc the site2Desc to set + */ + public void setSite2Desc(String site2Desc) { + this.site2Desc = site2Desc; + } + + /** + * @return the site2Gene + */ + public String getSite2Gene() { + return site2Gene; + } 
+ + /** + * @param site2Gene the site2Gene to set + */ + public void setSite2Gene(String site2Gene) { + this.site2Gene = site2Gene; + } + + /** + * @return the site2Pos + */ + public String getSite2Pos() { + return site2Pos; + } + + /** + * @param site2Pos the site2Pos to set + */ + public void setSite2Pos(String site2Pos) { + this.site2Pos = site2Pos; + } + + /** + * @return the splitReadSupport + */ + public String getSplitReadSupport() { + return splitReadSupport; + } + + /** + * @param splitReadSupport the splitReadSupport to set + */ + public void setSplitReadSupport(String splitReadSupport) { + this.splitReadSupport = splitReadSupport; + } + + /** + * @return the svClassName + */ + public String getSvClassName() { + return svClassName; + } + + /** + * @param svClassName the svClassName to set + */ + public void setSvClassName(String svClassName) { + this.svClassName = svClassName; + } + + /** + * @return the svDesc + */ + public String getSvDesc() { + return svDesc; + } + + /** + * @param svDesc the svDesc to set + */ + public void setSvDesc(String svDesc) { + this.svDesc = svDesc; + } + + /** + * @return the svLength + */ + public String getSvLength() { + return svLength; + } + + /** + * @param svLength the svLength to set + */ + public void setSvLength(String svLength) { + this.svLength = svLength; + } + + /** + * @return the tumorReadCount + */ + public String getTumorReadCount() { + return tumorReadCount; + } + + /** + * @param tumorReadCount the tumorReadCount to set + */ + public void setTumorReadCount(String tumorReadCount) { + this.tumorReadCount = tumorReadCount; + } + + /** + * @return the tumorVariantCount + */ + public String getTumorVariantCount() { + return tumorVariantCount; + } + + /** + * @param tumorVariantCount the tumorVariantCount to set + */ + public void setTumorVariantCount(String tumorVariantCount) { + this.tumorVariantCount = tumorVariantCount; + } + + /** + * @return the variantStatusName + */ + public String getVariantStatusName() 
{ + return variantStatusName; + } + + /** + * @param variantStatusName the variantStatusName to set + */ + public void setVariantStatusName(String variantStatusName) { + this.variantStatusName = variantStatusName; + } + + /** + * @return the structural variant staging data map (column -> field) + */ + public Map getStructuralVariantStagingDataMap() { + Map map = new HashMap<>(); + map.put("SampleId", "sampleId"); + map.put("Annotation", "annotation"); + map.put("Breakpoint_Type", "breakpointType"); + map.put("Comments", "comments"); + map.put("Confidence_Class", "confidenceClass"); + map.put("Connection_Type", "connectionType"); + map.put("Event_Info", "eventInfo"); + map.put("Mapq", "mapQ"); + map.put("Normal_Read_Count", "normalReadCount"); + map.put("Normal_Variant_Count", "normalVariantCount"); + map.put("Paired_End_Read_Support", "pairedEndReadSupport"); + map.put("Site1_Chrom", "site1Chrom"); + map.put("Site1_Desc", "site1Desc"); + map.put("Site1_Gene", "site1Gene"); + map.put("Site1_Pos", "site1Pos"); + map.put("Site2_Chrom", "site2Chrom"); + map.put("Site2_Desc", "site2Desc"); + map.put("Site2_Gene", "site2Gene"); + map.put("Site2_Pos", "site2Pos"); + map.put("Split_Read_Support", "splitReadSupport"); + map.put("Sv_Class_Name", "svClassName"); + map.put("Sv_Desc", "svDesc"); + map.put("Sv_Length", "svLength"); + map.put("Tumor_Read_Count", "tumorReadCount"); + map.put("Tumor_Variant_Count", "tumorVariantCount"); + map.put("Variant_Status_Name", "variantStatusName"); + + return map; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/model/TimelineRecord.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/model/TimelineRecord.java new file mode 100644 index 0000000..e1d3778 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/model/TimelineRecord.java @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. 
+ * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +package org.cbio.portal.pipelines.importer.model; + +import java.util.*; + +/** + * + * @author ochoaa + */ +public class TimelineRecord { + + private Integer patientInternalId; + private String patientId; + private String startDate; + private String stopDate; + private String eventType; + private Map clinicalEventDataMap; + + /** + * @return the patientInternalId + */ + public Integer getPatientInternalId() { + return patientInternalId; + } + + /** + * @param patientInternalId the patientInternalId to set + */ + public void setPatientInternalId(Integer patientInternalId) { + this.patientInternalId = patientInternalId; + } + + /** + * @return the patientId + */ + public String getPatientId() { + return patientId; + } + + /** + * @param patientId the patientId to set + */ + public void setPatientId(String patientId) { + this.patientId = patientId; + } + + /** + * @return the startDate + */ + public String getStartDate() { + return startDate; + } + + /** + * @param startDate the startDate to set + */ + public void setStartDate(String startDate) { + this.startDate = startDate; + } + + /** + * @return the stopDate + */ + public String getStopDate() { + return stopDate; + } + + /** + * @param stopDate the stopDate to set + */ + public void setStopDate(String stopDate) { + this.stopDate = stopDate; + } + + /** + * @return the eventType + */ + public String getEventType() { + return eventType; + } + + /** + * @param eventType the eventType to set + */ + public void setEventType(String eventType) { + this.eventType = eventType; + } + + /** + * @return the clinicalEventDataMap + */ + public Map getClinicalEventDataMap() { + return clinicalEventDataMap; + } + + /** + * @param clinicalEventDataMap the clinicalEventDataMap to set + */ + public void setClinicalEventDataMap(Map clinicalEventDataMap) { + this.clinicalEventDataMap = clinicalEventDataMap; + } + + /** + * @return the timeline staging data map (column -> field) + */ + public Map getTimelineStagingDataMap() { + 
Map map = new HashMap<>(); + map.put("PATIENT_ID", "patientId"); + map.put("START_DATE", "startDate"); + map.put("STOP_DATE", "stopDate"); + map.put("EVENT_TYPE", "eventType"); + + return map; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/util/DataFileUtils.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/util/DataFileUtils.java new file mode 100644 index 0000000..11636b1 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/util/DataFileUtils.java @@ -0,0 +1,363 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.cbio.portal.pipelines.importer.util; + +import org.mskcc.cbio.model.Sample; + +import java.io.*; +import java.nio.file.*; +import java.util.*; +import java.util.regex.*; +import com.google.common.base.Strings; +import org.apache.commons.collections.map.MultiKeyMap; + +import org.apache.commons.lang.StringUtils; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Repository; + +/** + * General utils class for files and file data. + * + * @author ochoaa + */ +@Repository +public class DataFileUtils { + + // general constants + public static String DELIMITER = "\t"; + public static String METADATA_PREFIX = "#"; + public static String DEFAULT_DESCRIPTION = "MISSING"; + public static String DEFAULT_DATATYPE = "STRING"; + public static String DEFAULT_ATTRIBUTE_TYPE = "SAMPLE"; + public static String DEFAULT_PRIORITY = "1"; + public static String DEFAULT_NOT_AVAILABLE = "NA"; + public static Integer DEFAULT_MISSING_INT = -1; + public static Integer DATA_TRUNCATION_THRESHOLD = 255; + + // regex expressions for stable id's + public static final String TCGA_BARCODE_PREFIX = "TCGA"; + public static final Pattern TCGA_SAMPLE_BARCODE_REGEX = + Pattern.compile("^(TCGA-\\w\\w-\\w\\w\\w\\w-\\d\\d).*$"); + public static final Pattern TCGA_SAMPLE_TYPE_BARCODE_REGEX = + Pattern.compile("^TCGA-\\w\\w-\\w\\w\\w\\w-(\\d\\d).*$"); + + public static final Set nonCaseIdColumnNames = new HashSet(Arrays.asList( + new String[]{"GENE SYMBOL", "HUGO_SYMBOL", "ENTREZ_GENE_ID", + "LOCUS ID", "CYTOBAND", "COMPOSITE.ELEMENT.REF"})); + + public static final HashMap sampleTypeByTcgaCode = new HashMap<>(); + @Autowired + private void setSampleTypeByTcgaCode() { + sampleTypeByTcgaCode.put("01", Sample.SampleType.PRIMARY_SOLID_TUMOR); + sampleTypeByTcgaCode.put("02", Sample.SampleType.RECURRENT_SOLID_TUMOR); + 
sampleTypeByTcgaCode.put("03", Sample.SampleType.PRIMARY_BLOOD_TUMOR); + sampleTypeByTcgaCode.put("04", Sample.SampleType.RECURRENT_BLOOD_TUMOR); + sampleTypeByTcgaCode.put("06", Sample.SampleType.METASTATIC); + sampleTypeByTcgaCode.put("10", Sample.SampleType.BLOOD_NORMAL); + sampleTypeByTcgaCode.put("11", Sample.SampleType.SOLID_NORMAL); + } + + public static enum NullOrEmptyValues { + NOT_APPLICABLE("Not Applicable"), + NOT_AVAILABLE("Not Available"), + SENT("sent"), + NULL("null"), + MISSING(""), + NA("NA"), + N_A("N/A"); + + private final String propertyName; + + NullOrEmptyValues(String propertyName) { this.propertyName = propertyName; } + @Override + public String toString() { return propertyName; } + + static public boolean has(String value) { + if (value == null) return false; + if (value.trim().equals("")) return true; + try { + value = value.replaceAll("[\\[|\\]\\/]", ""); + value = value.replaceAll(" ", "_"); + return valueOf(value.toUpperCase()) != null; + } + catch (IllegalArgumentException ex) { + return false; + } + } + } + + /** + * List data files in directory by file pattern. + * + * @param directory + * @param filePattern + * @return List + */ + public static List listDataFiles(String directory, String filePattern) throws IOException { + List dataFiles = new ArrayList(); + + for (Path file : Files.newDirectoryStream(Paths.get(directory), filePattern)) { + dataFiles.add(file.toFile()); + } + + return dataFiles; + } + + /** + * List data files in directory by file pattern with filename filter. 
+ * + * @param directory + * @param filePattern + * @param filter + * @return List + */ + public static List listDataFiles(String directory, String filePattern, String filter) throws IOException { + if (Strings.isNullOrEmpty(filter)) { + return listDataFiles(directory, filePattern); + } + + List dataFiles = new ArrayList(); + for (Path file : Files.newDirectoryStream(Paths.get(directory), filePattern)) { + if (!file.toFile().getName().contains(filter)) { + dataFiles.add(file.toFile()); + } + } + + return dataFiles; + } + + /** + * Light string processing/cleanup for tab delimited files. + * + * @param line + * @return String[] + */ + public static String[] splitDataFields(String line) { + line = line.replaceAll("^" + METADATA_PREFIX + "+", ""); + String[] fields = line.split(DELIMITER, -1); + + return fields; + } + + /** + * Returns the header of a given datafile. + * + * @param dataFile + * @return String[] + * @throws IOException + */ + public static String[] getFileHeader(File dataFile) throws IOException { + String[] columnNames; + + try (FileReader reader = new FileReader(dataFile)) { + BufferedReader buff = new BufferedReader(reader); + String line = buff.readLine(); + + // keep reading until line does not start with meta data prefix + while (line.startsWith(METADATA_PREFIX)) { + line = buff.readLine(); + } + // extract the maf file header + columnNames = splitDataFields(line); + reader.close(); + } + + return columnNames; + } + + /** + * Loads the datafile metadata (file header and number of records). 
+ * + * @param dataFile + * @return MultiKeyMap + * @throws IOException + */ + public static MultiKeyMap loadDataFileMetadata(File dataFile) throws IOException { + String[] columnNames; + int numRecords = 0; + + // get the file header and record count + try (FileReader reader = new FileReader(dataFile)) { + BufferedReader buff = new BufferedReader(reader); + String line = buff.readLine(); + + // keep reading until line does not start with meta data prefix + while (line.startsWith(DataFileUtils.METADATA_PREFIX)) { + line = buff.readLine(); + } + // extract the file header + columnNames = DataFileUtils.splitDataFields(line); + + // keep reading file to get count of records + while (buff.readLine() != null) { + numRecords++; + } + reader.close(); + } + MultiKeyMap metadata = new MultiKeyMap(); + metadata.put(dataFile.getName(), "header", columnNames); + metadata.put(dataFile.getName(), "numRecords", numRecords); + + return metadata; + } + + /** + * Determines if string is null, empty, or 'null' value. + * + * @param value + * @return boolean + */ + public static boolean isNullOrEmptyValue(String value) { + return Strings.isNullOrEmpty(value) || NullOrEmptyValues.has(value); + } + + /** + * Determines if integer string is null/empty or default missing int. + * + * @param value + * @return boolean + */ + public static boolean isNullOrMissingInt(String value) { + return isNullOrEmptyValue(value) || Integer.valueOf(value).equals(DEFAULT_MISSING_INT); + } + + /** + * Formats a given string for enum SampleType. 
+ * + * @param barcode + * @param sampleTypeValue + * @return String + */ + public static String getSampleTypeString(String barcode, String sampleTypeValue) { + // set default to sample type value given from sample + String sampleTypeString = Sample.SampleType.PRIMARY_SOLID_TUMOR.getName(); + + // if tcga barcode then try to get sample type by tcga code + if (barcode.startsWith(TCGA_BARCODE_PREFIX)) { + Matcher tcgaSampleBarcodeMatcher = TCGA_SAMPLE_TYPE_BARCODE_REGEX.matcher(barcode); + if (tcgaSampleBarcodeMatcher.find() && sampleTypeByTcgaCode.containsKey(tcgaSampleBarcodeMatcher.group(1))) { + sampleTypeString = sampleTypeByTcgaCode.get(tcgaSampleBarcodeMatcher.group(1)).getName(); + } + } + else if (!Strings.isNullOrEmpty(sampleTypeValue)) { + sampleTypeString = sampleTypeValue; + } + + return StringUtils.join(sampleTypeString.trim().split(" "), "_").toUpperCase(); + } + + /** + * Returns a patient stable id. + * + * @param stableId + * @return String + */ + public static String getPatientStableId(String stableId) { + return getStableId(stableId, false); + } + + /** + * Returns a sample stable ID. + * + * @param stableId + * @return String + */ + public static String getSampleStableId(String stableId) { + return getStableId(stableId, true); + } + + /** + * Cleans up TCGA stable ID's. 
+ * + * @param barcode + * @param forSample + * @return String + */ + private static String getStableId(String barcode, boolean forSample) { + if (!barcode.startsWith(TCGA_BARCODE_PREFIX)) { + return barcode; + } + // light clean up on the tcga barcode + if (barcode.contains("Tumor")) { + barcode = barcode.replace("Tumor", "01"); + } + else if (barcode.contains("Normal")) { + barcode = barcode.replace("Normal", "11"); + } + + // set default stable id string + String stableId = barcode + "-01"; + try { + String[] parts = barcode.split("-"); + List stableIdBuilder = new ArrayList( + Arrays.asList(new String[]{parts[0], parts[1], parts[2]})); + + if (forSample) { + stableIdBuilder.add(parts[3]); + Matcher tcgaSampleBarcodeMatcher = TCGA_SAMPLE_BARCODE_REGEX.matcher(StringUtils.join(stableIdBuilder, "-")); + stableId = (tcgaSampleBarcodeMatcher.find()) ? tcgaSampleBarcodeMatcher.group(1) : StringUtils.join(stableIdBuilder, "-"); + } + else { + stableId = StringUtils.join(stableIdBuilder, "-"); + } + } + catch (ArrayIndexOutOfBoundsException ex) {} + + + return stableId; + } + + /** + * Determines whether TCGA sample type is normal or not. 
+ * + * @param barcode + * @return boolean + */ + public static boolean isNormalSample(String barcode) { + Matcher tcgaSampleBarcodeMatcher = TCGA_SAMPLE_TYPE_BARCODE_REGEX.matcher(barcode); + + Sample.SampleType sampleType = Sample.SampleType.PRIMARY_SOLID_TUMOR; + if (tcgaSampleBarcodeMatcher.find() && sampleTypeByTcgaCode.containsKey(tcgaSampleBarcodeMatcher.group(1))) { + sampleType = sampleTypeByTcgaCode.get(tcgaSampleBarcodeMatcher.group(1)); + } + + boolean isNormal = false; + if (sampleType.getName().equals(Sample.SampleType.BLOOD_NORMAL.getName()) || + sampleType.getName().equals(Sample.SampleType.SOLID_NORMAL.getName())) { + isNormal = true; + } + + return isNormal; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/util/GeneDataUtils.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/util/GeneDataUtils.java new file mode 100644 index 0000000..dc721d3 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/util/GeneDataUtils.java @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. 
+ * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.cbio.portal.pipelines.importer.util; + +import org.mskcc.cbio.model.Gene; +import org.mskcc.cbio.persistence.jdbc.GeneJdbcDaoImpl; + +import java.util.*; +import java.util.regex.*; +import org.apache.commons.logging.*; +import com.google.common.base.Strings; + +import org.springframework.batch.core.configuration.annotation.JobScope; +import org.springframework.batch.item.ExecutionContext; +import org.springframework.batch.item.file.FlatFileItemReader; +import org.springframework.batch.item.file.mapping.*; +import org.springframework.batch.item.file.transform.*; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.core.io.ClassPathResource; +import org.springframework.stereotype.Repository; + +/** + * Utils class for gene data + * @author ochoaa + */ +@Repository +@JobScope +public class GeneDataUtils { + + public final String PHOSPHOPROTEIN_TYPE = "phosphoprotein"; + public final String MIRNA_TYPE = "miRNA"; + + private final String DISAMBIGUOUS_GENES_RESOURCE = "gene_symbol_disambiguation.txt"; + + @Autowired + GeneJdbcDaoImpl geneJdbcDaoImpl; + + private static final Log LOG = LogFactory.getLog(GeneDataUtils.class); + + // params to cache genes in db + public static final Map hugoGeneSymbolMap = new HashMap<>(); + public static final Map entrezGeneIdMap = new HashMap<>(); + public static final Map> 
geneAliasMap = new HashMap<>(); + @Autowired + private void loadGenesFromDb() { + for (Gene gene : geneJdbcDaoImpl.listAllGenes()) { + // fill gene symbol, gene id, and gene alias maps + hugoGeneSymbolMap.put(gene.getHugoGeneSymbol(), gene); + entrezGeneIdMap.put(gene.getEntrezGeneId(), gene); + + if (gene.getAliases() != null) { + gene.getAliases().stream().forEach((alias) -> { + geneAliasMap.getOrDefault(alias, new ArrayList()).add(gene); + }); + } + } + } + + // param to hold disambiguous genes + public static final Map disambiguousGenes = new HashMap<>(); + @Autowired + private void loadDisambiguousGenes() throws Exception { + // init tab-delim tokenizer with column names + DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer(DelimitedLineTokenizer.DELIMITER_TAB); + tokenizer.setNames(new String[]{"Gene_Symbol", "Entrez_Gene_Id"}); + + // init line mapper for disambiguous gene file resource + DefaultLineMapper> lineMapper = new DefaultLineMapper(); + lineMapper.setLineTokenizer(tokenizer); + lineMapper.setFieldSetMapper((FieldSet fs) -> { + HashMap map = new HashMap<>(); + map.put("alias", fs.readString("Gene_Symbol")); + map.put("entrezGeneId", fs.readInt("Entrez_Gene_Id")); + return map; + }); + + // set up file reader context + FlatFileItemReader> reader = new FlatFileItemReader(); + reader.setResource(new ClassPathResource(DISAMBIGUOUS_GENES_RESOURCE)); + reader.setLineMapper(lineMapper); + reader.setLinesToSkip(1); + reader.open(new ExecutionContext()); + + // read through each line of disambiguous genes file + Map record = reader.read(); + while (record != null) { + String alias = (String) record.get("alias"); + Gene gene = geneJdbcDaoImpl.getGene((Integer) record.get("entrezGeneId")); + if (gene == null) { + LOG.warn("Could not resolve disambiguous gene alias " + alias + " from " + DISAMBIGUOUS_GENES_RESOURCE); + } + else { + disambiguousGenes.put(alias, gene); + } + record = reader.read(); + } + reader.close(); + } + + // param to hold valid 
chromosomes + public static final Map validChromosomeValues = new HashMap<>(); + @Autowired + private void setValidChromosomeValues() { + // init valid chromosome values map + for (int chr=1; chr<=24; chr++) { + validChromosomeValues.put(String.valueOf(chr), String.valueOf(chr)); + validChromosomeValues.put("CHR"+String.valueOf(chr), String.valueOf(chr)); + } + // add other valid chromosome values to map + validChromosomeValues.put("X", "23"); + validChromosomeValues.put("CHRX", "23"); + validChromosomeValues.put("Y", "24"); + validChromosomeValues.put("CHRY", "24"); + validChromosomeValues.put("NA", "NA"); + validChromosomeValues.put("MT", "MT"); + } + + /** + * Given an entrez gene id, hugo gene symbol, and chromosome, returns a Gene instance. + * If gene is not found, returns null + * + * @param entrezGeneId + * @param hugoGeneSymbol + * @param chr + * @return Gene + */ + public Gene resolveGeneFromRecordData(Integer entrezGeneId, String hugoGeneSymbol, String chr) { + Gene gene; + + // first try to get gene by entrez gene id + if (entrezGeneIdMap.containsKey(entrezGeneId)) { + gene = entrezGeneIdMap.get(entrezGeneId); + } + else if (hugoGeneSymbolMap.containsKey(hugoGeneSymbol)) { + gene = hugoGeneSymbolMap.get(hugoGeneSymbol); + } + // if gene does not exist by given entrez gene id or hugo gene symbol then guess gene + else { + gene = guessGene(hugoGeneSymbol, chr); + } + + return gene; + } + + /** + * Given an hugo gene symbol and chromosome, returns a Gene instance. 
+ * If gene is not found, returns null + * + * @param hugoGeneSymbol + * @param chr + * @return Gene + */ + public Gene resolveGeneFromRecordData(String hugoGeneSymbol, String chr) { + Gene gene; + + // first try to get gene by hugo gene symbol + if (hugoGeneSymbolMap.containsKey(hugoGeneSymbol)) { + gene = hugoGeneSymbolMap.get(hugoGeneSymbol); + } + // if gene does not exist by given entrez gene id or hugo gene symbol then guess gene + else { + gene = guessGene(hugoGeneSymbol, chr); + } + + return gene; + } + + /** + * Given an array of gene symbols, returns a list of Gene instances. + * If all gene symbols cannot be found, returns empty list + * + * @param geneSymbols + * @return List + */ + public List resolveGeneFromCompositeElementRef(String[] geneSymbols) { + List genes = new ArrayList(); + for (String geneSymbol : geneSymbols) { + Gene gene = guessGene(geneSymbol, null); + if (gene != null) { + genes.add(gene); + } + else { + LOG.warn("Could not resolve gene from symbol: " + geneSymbol); + } + } + + return genes; + } + + /** + * Guesses gene by given hugo gene symbol and/or chromosome value. 
+ * + * @param hugoGeneSymbol + * @param chr + * @return Gene + */ + public Gene guessGene(String hugoGeneSymbol, String chr) { + Gene gene = null; // default value + + if (!Strings.isNullOrEmpty(hugoGeneSymbol)) { + // hugo gene symbol possibly entrez gene id instead + if (hugoGeneSymbol.matches("[0-9]+") && entrezGeneIdMap.containsKey(Integer.valueOf(hugoGeneSymbol))) { + gene = entrezGeneIdMap.get(Integer.valueOf(hugoGeneSymbol)); + } + // try finding hugo gene symbol in map + else if (hugoGeneSymbolMap.containsKey(hugoGeneSymbol.toUpperCase())) { + gene = hugoGeneSymbolMap.get(hugoGeneSymbol.toUpperCase()); + } + // try finding gene by disambiguous gene symbol + else if (disambiguousGenes.containsKey(hugoGeneSymbol.toUpperCase())) { + gene = disambiguousGenes.get(hugoGeneSymbol.toUpperCase()); + } + // try finding gene by alias and normalized chromosome value + else if (geneAliasMap.containsKey(hugoGeneSymbol)) { + + // get normalized chromosome value + String normalizedChr = getNormalizedChromosome(chr); + if (!Strings.isNullOrEmpty(normalizedChr)) { + + // add gene aliases with matching chromosome values to list + List matchesByChrValue = new ArrayList(); + for (Gene alias : geneAliasMap.get(hugoGeneSymbol)) { + // get normalized chromosome value for alias + String aliasChrValue = getChromosomeFromCytoband(alias.getCytoband()); + if (aliasChrValue.equals(normalizedChr)) { + matchesByChrValue.add(alias); + } + } + // if list not empty then select first in list by default + if (!matchesByChrValue.isEmpty()) { + gene = matchesByChrValue.get(0); + } + } + } + } + + return gene; + } + + /** + * Returns a phospho gene with aliases. 
+ * + * @param gene + * @param residue + * @return Gene + */ + public Gene createPhosphoGene(Gene gene, String residue) { + Gene phosphoGene = new Gene(); + phosphoGene.setHugoGeneSymbol(gene.getHugoGeneSymbol().toUpperCase() + "_" + residue); + phosphoGene.setType(PHOSPHOPROTEIN_TYPE); + phosphoGene.setCytoband(gene.getCytoband()); + phosphoGene.setAliases(Arrays.asList(new String[]{"rppa-phospho", + "phosphoprotein", "phospho"+gene.getHugoGeneSymbol().toUpperCase()})); + + return phosphoGene; + } + + /** + * Return normalized chromosome value. + * If not found, returns null + * + * @param chr + * @return String + */ + public String getNormalizedChromosome(String chr) { + if (!Strings.isNullOrEmpty(chr) && !DataFileUtils.isNullOrEmptyValue(chr)) { + return validChromosomeValues.get(chr.toUpperCase()); + } + return null; + } + + /** + * Returns normalized chromosome value from cytoband. + * If not found, returns null + * + * @param cytoband + * @return String + */ + public String getChromosomeFromCytoband(String cytoband) { + String chromosome = null; + if (!Strings.isNullOrEmpty(cytoband)) { + if (cytoband.startsWith("X") || cytoband.startsWith("Y")) { + chromosome = cytoband.substring(0, 1); + } + else { + Pattern p = Pattern.compile("([0-9]+).*"); + Matcher m = p.matcher(cytoband); + if (m.find()) { + chromosome = m.group(1); + } + } + } + return getNormalizedChromosome(chromosome); + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/util/MutationDataUtils.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/util/MutationDataUtils.java new file mode 100644 index 0000000..53c1023 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/util/MutationDataUtils.java @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. 
 + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +package org.cbio.portal.pipelines.importer.util; + +import org.mskcc.cbio.model.Mutation; +import org.cbio.portal.pipelines.importer.model.MafRecord; + +import java.util.*; +import java.util.regex.*; +import com.google.common.base.Strings; +import com.google.common.collect.Sets; +import org.apache.commons.lang.StringUtils; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Repository; + +/** + * Utils class for resolving mutation data + * + * @author ochoaa + */ +@Repository +public class MutationDataUtils { + + public static final boolean CANONICAL_TRANSCRIPT = true; + + private static final String[] ignoredMutationTypes = {"silent", "loh", "wildtype", + "3'utr", "5'utr", "5'flank", "igr", "rna"}; + + private static final HashMap transformedOmaScoreMap = new HashMap<>(); + @Autowired + private void setTransformedOmaScoreMap() { + transformedOmaScoreMap.put("H", "H"); + transformedOmaScoreMap.put("high", "H"); + transformedOmaScoreMap.put("M", "M"); + transformedOmaScoreMap.put("medium", "M"); + transformedOmaScoreMap.put("L", "L"); + transformedOmaScoreMap.put("low", "L"); + transformedOmaScoreMap.put("N", "N"); + transformedOmaScoreMap.put("neutral", "N"); + transformedOmaScoreMap.put("[sent]", DataFileUtils.DEFAULT_NOT_AVAILABLE); + } + + /** + * Calculate the end position from the reference allele, tumor seq allele, and start position. + * + * @param referenceAllele + * @param tumorSeqAllele + * @param startPosition + * @return Long + */ + public static Long calculateEndPosition(String referenceAllele, String tumorSeqAllele, Long startPosition) { + long endPosition = 0; + + if (referenceAllele.equals("-")) { + endPosition = startPosition + 1; + } + else { + endPosition = startPosition + tumorSeqAllele.length() - 1; + } + + return endPosition; + } + + /** + * Resolve the tumor seq allele given a reference allele, tumor seq allele 1, and tumor seq allele 2. 
+ * + * @param referenceAllele + * @param tumorSeqAllele1 + * @param tumorSeqAllele2 + * @return String + */ + public static String resolveTumorSeqAllele(String referenceAllele, String tumorSeqAllele1, String tumorSeqAllele2) { + String tumorSeqAllele = tumorSeqAllele1; + if (!Strings.isNullOrEmpty(referenceAllele) && referenceAllele.equals(tumorSeqAllele1)) { + tumorSeqAllele = tumorSeqAllele2; + } + + return tumorSeqAllele; + } + + /** + * Determine if given mutation type is acceptable or not. + * + * @param mutationType + * @return boolean + */ + public static boolean isAcceptableMutation(String mutationType) { + boolean valid = true; + if (DataFileUtils.isNullOrEmptyValue(mutationType)) { + valid = false; + } + else { + for (String mutType : ignoredMutationTypes) { + if (mutType.equals("rna") && mutationType.equalsIgnoreCase(mutType)) { + valid = false; + } + else if (mutationType.toLowerCase().startsWith(mutType)) { + valid = false; + } + } + } + + return valid; + } + + /** + * Resolve mutation type given a variant classification and oncotator variant classification. + * + * @param variantClassification + * @param oncotatorVariantClassification + * @return String + */ + public static String resolveMutationType(String variantClassification, String oncotatorVariantClassification) { + String mutationType = variantClassification; + if (!isAcceptableMutation(variantClassification) && + isAcceptableMutation(oncotatorVariantClassification)) { + mutationType = oncotatorVariantClassification; + } + + return mutationType; + } + + /** + * Resolve protein change value given a protein change and amino acid change values. 
+ * + * @param proteinChange + * @param aminoAcidChange + * @return String + */ + public static String resolveProteinChange(String proteinChange, String aminoAcidChange, String maProteinChange) { + String resolvedProteinChange = "MUTATED"; + if (!DataFileUtils.isNullOrEmptyValue(proteinChange)) { + resolvedProteinChange = proteinChange; + } + else if (!DataFileUtils.isNullOrEmptyValue(aminoAcidChange)) { + resolvedProteinChange = aminoAcidChange; + } + else if (!DataFileUtils.isNullOrEmptyValue(maProteinChange)) { + resolvedProteinChange = maProteinChange; + } + + return getNormalizedProteinChange(resolvedProteinChange); + } + + /** + * Removes the starting 'p.' from a given protein change if found. + * + * @param proteinChange + * @return String + */ + public static String getNormalizedProteinChange(String proteinChange) { + if (proteinChange.startsWith("p.")) { + proteinChange = proteinChange.substring(2); + } + return proteinChange; + } + + /** + * Resolve the protein start position from protein position and protein change. + * + * @param proteinPosition + * @param proteinChange + * @return Integer + */ + public static Integer resolveProteinStartPosition(String proteinPosition, String proteinChange) { + // parts[0] is the protein start-end positions, parts[1] is the length + String[] positions = proteinPosition.split("/")[0].split("-"); + + Integer startPosition = DataFileUtils.DEFAULT_MISSING_INT; + try { + startPosition = Integer.valueOf(positions[0]); + } + catch (ArrayIndexOutOfBoundsException | NumberFormatException ex) {} + + // if start position is -1 then try extracting from protein change + if (startPosition.equals(DataFileUtils.DEFAULT_MISSING_INT)) { + Pattern p = Pattern.compile(".*[A-Z]([0-9]+)[^0-9]+"); + Matcher m = p.matcher(proteinChange); + if (m.find()) { + startPosition = Integer.valueOf(m.group(1)); + } + } + + + return startPosition; + } + + /** + * Resolve the protein end position from protein position. 
+ * If null or missing int then return start position + * + * @param proteinPosition + * @param proteinChange + * @return Integer + */ + public static Integer resolveProteinEndPosition(String proteinPosition, String proteinChange) { + // parts[0] is the protein start-end positions, parts[1] is the length + String[] positions = proteinPosition.split("/")[0].split("-"); + + Integer endPosition = DataFileUtils.DEFAULT_MISSING_INT; + try { + endPosition = Integer.valueOf(positions[1]); + } + catch (ArrayIndexOutOfBoundsException | NumberFormatException ex) {} + + // change value of end position if -1 + if (endPosition.equals(DataFileUtils.DEFAULT_MISSING_INT)) { + return resolveProteinStartPosition(proteinPosition, proteinChange); + } + + return endPosition; + } + + /** + * Resolve the tumor alt count from a given MAF record. + * Returns default missing int value if missing from record + * + * @param mafRecord + * @return Integer + */ + public static Integer resolveTumorAltCount(MafRecord mafRecord) { + Integer result = DataFileUtils.DEFAULT_MISSING_INT; + + try { + if (!DataFileUtils.isNullOrMissingInt(mafRecord.gettAltCount())) { + result = Integer.valueOf(mafRecord.gettAltCount()); + } + else if (!DataFileUtils.isNullOrMissingInt(mafRecord.gettVarCov())) { + result = Integer.valueOf(mafRecord.gettVarCov()); + } + else if (!DataFileUtils.isNullOrMissingInt(mafRecord.getTumorDepth()) && + !DataFileUtils.isNullOrMissingInt(mafRecord.getTumorVaf())) { + result = Math.round(Integer.valueOf(mafRecord.getTumorDepth()) * Integer.valueOf(mafRecord.getTumorVaf())); + } + } + catch (NumberFormatException ex) {} + + return result; + } + + /** + * Resolve the tumor ref count from a given MAF record. 
+ * Returns default missing int value if missing from record + * + * @param mafRecord + * @return Integer + */ + public static Integer resolveTumorRefCount(MafRecord mafRecord) { + Integer result = DataFileUtils.DEFAULT_MISSING_INT; + + try { + if (!DataFileUtils.isNullOrMissingInt(mafRecord.gettRefCount())) { + result = Integer.valueOf(mafRecord.gettRefCount()); + } + else if (!DataFileUtils.isNullOrMissingInt(mafRecord.gettVarCov())) { + result = Integer.valueOf(mafRecord.gettTotCov()) - Integer.valueOf(mafRecord.gettVarCov()); + } + else if (!DataFileUtils.isNullOrMissingInt(mafRecord.getTumorDepth()) && + !DataFileUtils.isNullOrMissingInt(mafRecord.getTumorVaf())) { + result = Integer.valueOf(mafRecord.getTumorDepth()) - Math.round(Integer.valueOf(mafRecord.getTumorDepth()) * Integer.valueOf(mafRecord.getTumorVaf())); + } + } + catch (NumberFormatException ex) {} + + return result; + } + + /** + * Resolve the normal alt count from a given MAF record. + * Returns default missing int value if missing from record + * + * @param mafRecord + * @return Integer + */ + public static Integer resolveNormalAltCount(MafRecord mafRecord) { + Integer result = DataFileUtils.DEFAULT_MISSING_INT; + + try { + if (!DataFileUtils.isNullOrMissingInt(mafRecord.getnAltCount())) { + result = Integer.valueOf(mafRecord.getnAltCount()); + } + else if (!DataFileUtils.isNullOrMissingInt(mafRecord.getnVarCov())) { + result = Integer.valueOf(mafRecord.getnVarCov()); + } + else if (!DataFileUtils.isNullOrMissingInt(mafRecord.getNormalDepth()) && + !DataFileUtils.isNullOrMissingInt(mafRecord.getNormalVaf())) { + result = Math.round(Integer.valueOf(mafRecord.getNormalDepth()) * Integer.valueOf(mafRecord.getNormalVaf())); + } + } + catch (NumberFormatException ex) {} + + return result; + } + + /** + * Resolve the normal ref count from a given MAF record. 
+ * Returns default missing int value if missing from record + * + * @param mafRecord + * @return Integer + */ + public static Integer resolveNormalRefCount(MafRecord mafRecord) { + Integer result = DataFileUtils.DEFAULT_MISSING_INT; + + try { + if (!DataFileUtils.isNullOrMissingInt(mafRecord.getnRefCount())) { + result = Integer.valueOf(mafRecord.getnRefCount()); + } + else if (!DataFileUtils.isNullOrMissingInt(mafRecord.getnVarCov()) && + !DataFileUtils.isNullOrMissingInt(mafRecord.getnTotCov())) { + result = Integer.valueOf(mafRecord.getnTotCov()) - Integer.valueOf(mafRecord.getnVarCov()); + } + else if (!DataFileUtils.isNullOrMissingInt(mafRecord.getNormalDepth()) && + !DataFileUtils.isNullOrMissingInt(mafRecord.getNormalVaf())) { + result = Integer.valueOf(mafRecord.getNormalDepth()) - Math.round(Integer.valueOf(mafRecord.getNormalDepth()) * Integer.valueOf(mafRecord.getNormalVaf())); + } + } + catch (NumberFormatException ex) {} + + return result; + } + + /** + * Transform the OMA score. + * + * @param omaScore + * @return String + */ + public static String transformOmaScore(String omaScore) { + if (transformedOmaScoreMap.containsKey(omaScore.toUpperCase())) { + return transformedOmaScoreMap.get(omaScore.toUpperCase()); + } + else if (transformedOmaScoreMap.containsKey(omaScore.toLowerCase())) { + return transformedOmaScoreMap.get(omaScore.toLowerCase()); + } + else { + return omaScore; + } + } + + /** + * Merge given mutations. 
+ * + * @param mut1 + * @param mut2 + * @return Mutation + */ + public static Mutation mergeMutationData(Mutation mut1, Mutation mut2) { + Mutation mergedMutation = mut1; + if (!mut1.getMatchedNormSampleBarcode().equalsIgnoreCase(mut2.getMatchedNormSampleBarcode()) && + DataFileUtils.isNormalSample(mut2.getMatchedNormSampleBarcode())) { + mergedMutation = mut2; + } + else if (!mut1.getValidationStatus().equalsIgnoreCase(mut2.getValidationStatus()) && + (mut2.getValidationStatus().equalsIgnoreCase("Valid") || mut2.getValidationStatus().equalsIgnoreCase("VALIDATED"))) { + mergedMutation = mut2; + } + else if (!mut1.getMutationStatus().equalsIgnoreCase(mut2.getMutationStatus())) { + if (mut2.getMutationStatus().equalsIgnoreCase("Germline")) { + mergedMutation = mut2; + } + else if (mut2.getMutationStatus().equalsIgnoreCase("SOMATIC") && + !mut1.getMutationStatus().equalsIgnoreCase("Germline")) { + mergedMutation = mut2; + } + } + // merge centers for mutations + Set mut1Centers = new HashSet<>(Arrays.asList(mut1.getCenter().split(";"))); + Set mut2Centers = new HashSet<>(Arrays.asList(mut2.getCenter().split(";"))); + Set mergedCenters = Sets.union(mut1Centers, mut2Centers); + mergedCenters.remove("NA"); + mergedMutation.setCenter(StringUtils.join(mergedCenters, ";")); + mergedMutation.setMutationEventId(mut1.getMutationEventId()); + + return mergedMutation; + } + +} diff --git a/importer/src/main/java/org/cbio/portal/pipelines/importer/util/MutationFilter.java b/importer/src/main/java/org/cbio/portal/pipelines/importer/util/MutationFilter.java new file mode 100644 index 0000000..db77df9 --- /dev/null +++ b/importer/src/main/java/org/cbio/portal/pipelines/importer/util/MutationFilter.java @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2016 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. 
The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +package org.cbio.portal.pipelines.importer.util; + +import org.mskcc.cbio.model.Mutation; + +import java.util.*; +import org.springframework.beans.factory.annotation.Autowired; + +/** + * + * @author ochoaa + */ +public class MutationFilter { + + // params for filtered mutations counts + private Integer decisions; + private Integer accepts; + private Integer mutationStatusNoneRejects; + private Integer silentOrIntronRejects; + private Integer lohOrWildtypeRejects; + private Integer redactedRejects; + private Integer utrRejects; + private Integer igrRejects; + + public static final Set whiteListGenesForPromoterMutations = new HashSet<>(); + @Autowired + private void setWhiteListGeneEntrezIds() { + whiteListGenesForPromoterMutations.add(7015); + } + + public MutationFilter() { + this.decisions = 0; + this.accepts = 0; + this.mutationStatusNoneRejects = 0; + this.silentOrIntronRejects = 0; + this.lohOrWildtypeRejects = 0; + this.redactedRejects = 0; + this.utrRejects = 0; + this.igrRejects = 0; + } + + /** + * update decisions count + */ + public void updateDecisions() { + this.decisions++; + } + + /** + * update accepts count + */ + public void updateAccepts() { + this.accepts++; + } + + /** + * update mutation status none rejects count + */ + public void updateMutationStatusNoneRejects() { + this.mutationStatusNoneRejects++; + } + + /** + * update silent or intron rejects count + */ + public void updateSilentOrIntronRejects() { + this.silentOrIntronRejects++; + } + + /** + * update loh or wildtype rejects count + */ + public void updateLohOrWildtypeRejects() { + this.lohOrWildtypeRejects++; + } + + /** + * update redacted rejects count + */ + public void updateRedactedRejects() { + this.redactedRejects++; + } + + /** + * update utr rejects count + */ + public void updateUtrRejects() { + this.utrRejects++; + } + + /** + * update igr rejects count + */ + public void updateIgrRejects() { + this.igrRejects++; + } + + /** + * Determines whether mutation should be 
filtered out or not. + * + * @param mutation + * @return boolean + */ + public boolean acceptMutation(Mutation mutation) { + decisions++; + + // reject records with Mutation_Status 'None' + if (isPatternMatch(mutation.getMutationStatus(), new String[]{"None"})) { + mutationStatusNoneRejects++; + return false; + } + // reject silent and intronic mutations + if (isPatternMatch(mutation.getMutationEvent().getMutationType(), new String[]{"Silent", "Intron"})) { + silentOrIntronRejects++; + return false; + } + // reject loh and wildtype mutations + if (isPatternMatch(mutation.getMutationStatus(), new String[]{"LOH", "Wildtype"})) { + lohOrWildtypeRejects++; + return false; + } + // reject redacted muations + if (isPatternMatch(mutation.getValidationStatus(), new String[]{"Redacted"})) { + redactedRejects++; + return false; + } + // reject 3'utr and 5'utr mutations + if (isPatternMatch(mutation.getMutationEvent().getMutationType(), new String[]{"3'UTR", "3'Flank", "5'UTR"})) { + utrRejects++; + return false; + } + // accept 5'Flank promoter mutations for white listed genes + if (isPatternMatch(mutation.getMutationEvent().getMutationType(), new String[]{"5'Flank"})) { + if (whiteListGenesForPromoterMutations.contains(mutation.getEntrezGeneId())) { + mutation.getMutationEvent().setMutationType("Promoter"); + } + else { + utrRejects++; + return false; + } + } + // mutations with hugo symbols that are unknown will be treated as intergenic + if (isPatternMatch(mutation.getMutationEvent().getMutationType(), new String[]{"IGR"})) { + igrRejects++; + return false; + } + + // increment accepted mutations + accepts++; + + return true; + } + + /** + * Checks if value starts with given pattern. 
+ * + * @param value + * @param patterns + * @return boolean + */ + private boolean isPatternMatch(String value, String[] patterns) { + if (DataFileUtils.isNullOrEmptyValue(value)) { + return true; + } + + boolean hasPattern = false; + for (String pattern : patterns) { + if (value.toLowerCase().startsWith(pattern.toLowerCase())) { + hasPattern = true; + } + } + + return hasPattern; + } + + /** + * Prints the MutationFilter summary statistics. + */ + public void printSummaryStatistics(){ + String summary = "Mutation filter decisions: " + decisions + + "\nRejects: " + (decisions-accepts) + + "\nMutation Status 'None' Rejects: " + mutationStatusNoneRejects + + "\nSilent or Intron Rejects: " + silentOrIntronRejects + + "\nUTR Rejects: " + utrRejects + + "\nIGR Rejects: " + igrRejects + + "\nLOH or Wild Type Rejects: " + lohOrWildtypeRejects; + System.out.println(summary); + } + +} diff --git a/importer/src/main/resources/gene_symbol_disambiguation.txt b/importer/src/main/resources/gene_symbol_disambiguation.txt new file mode 100644 index 0000000..11ede08 --- /dev/null +++ b/importer/src/main/resources/gene_symbol_disambiguation.txt @@ -0,0 +1,3 @@ +Gene_Symbol Entrez_Gene_ID +MLL2 8085 +CDC2 983 diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..bd1d41e --- /dev/null +++ b/pom.xml @@ -0,0 +1,45 @@ + + + 4.0.0 + org.cbio.portal.pipelines + master + pom + cBioPortal Importer Master + 0.1.0 + master maven module + https://github.com/cBioPortal/importer/ + + + importer + + + + + org.springframework.boot + spring-boot-starter-parent + 1.2.7.RELEASE + + + + 1.8 + 1.3.0 + + + + + org.springframework.boot + spring-boot-starter-batch + + + + + + + + org.springframework.boot + spring-boot-maven-plugin + + + + +