diff --git a/.github/workflows/check-bioc.yaml b/.github/workflows/check-bioc.yaml index 2be10d9be..2439140e4 100644 --- a/.github/workflows/check-bioc.yaml +++ b/.github/workflows/check-bioc.yaml @@ -51,10 +51,11 @@ jobs: fail-fast: false matrix: config: - - { os: ubuntu-latest, r: '4.2.2', bioc: '3.16', cont: "bioconductor/bioconductor_docker:RELEASE_3_16", rspm: "https://packagemanager.posit.co" } - ## - { os: ubuntu-latest, r: '4.3', bioc: '3.17', cont: "bioconductor/bioconductor_docker:RELEASE_3_17", rspm: "https://packagemanager.posit.co/cran/__linux__/jammy/latest" } - - { os: macOS-latest, r: '4.2.2', bioc: '3.16'} - ## - { os: windows-latest, r: '4.2.2', bioc: '3.16'} + - { os: ubuntu-latest, r: 'devel', bioc: '3.22', cont: "bioconductor/bioconductor_docker:RELEASE_3_22", rspm: "https://packagemanager.posit.co/cran/__linux__/jammy/latest" } + - { os: ubuntu-latest, r: '4.5.1', bioc: '3.21'} + - { os: macOS-latest, r: '4.5.1', bioc: '3.21'} + - { os: windows-latest, r: '4.5.1', bioc: '3.21'} + env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true RSPM: ${{ matrix.config.rspm }} @@ -77,7 +78,7 @@ jobs: ## https://github.com/r-lib/actions/blob/master/examples/check-standard.yaml ## If they update their steps, we will also need to update ours. - name: Checkout Repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 ## R is already included in the Bioconductor docker images - name: Setup R from r-lib @@ -100,7 +101,7 @@ jobs: - name: Cache R packages if: "!contains(github.event.head_commit.message, '/nocache') && runner.os != 'Linux'" - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ${{ env.R_LIBS_USER }} key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_$${{ matrix.config.bioc }}-$${{ matrix.config.r }}-${{ hashFiles('.github/depends.Rds') }} @@ -108,7 +109,7 @@ jobs: - name: Cache R packages on Linux if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' " - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: /home/runner/work/_temp/Library key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_$${{ matrix.config.bioc }}-r-${{ matrix.config.r }}--${{ hashFiles('.github/depends.Rds') }} diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 5f31712af..1b374b251 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -2,9 +2,7 @@ # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: main pull_request: - branches: main name: test-coverage @@ -15,7 +13,7 @@ jobs: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-r@v2 with: @@ -29,3 +27,5 @@ jobs: - name: Test coverage run: covr::codecov(quiet = FALSE) shell: Rscript {0} + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/DESCRIPTION b/DESCRIPTION index 575cab2fd..969b5a478 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: RAIDS Type: Package -Title: Accurate Inference of Genetic Ancestry from Cancer Sequences +Title: Robust Ancestry Inference using Data Synthesis Description: This package implements specialized algorithms that enable genetic ancestry inference from various cancer sequences sources (RNA, Exome and Whole-Genome sequences). This package also implements a @@ -9,11 +9,14 @@ Description: This package implements specialized algorithms that enable following publication: Belleau, P et al. Genetic Ancestry Inference from Cancer-Derived Molecular Data across Genomic and Transcriptomic Platforms. Cancer Res 1 January 2023; 83 (1): 49–58. -Version: 0.99.4 +Version: 1.7.1 Authors@R: c(person("Pascal", "Belleau", email="pascal_belleau@hotmail.com", role=c("cre", "aut"), comment = c(ORCID = "0000-0002-0802-1071")), person("Astrid", "Deschênes", email="adeschen@hotmail.com", role=c("aut"), comment = c(ORCID = "0000-0001-7846-6749")), + person(given="David A.", family="Tuveson", + email="dtuveson@cshl.edu", + role=c("aut"), comment=c(ORCID="0000-0002-8017-2712")), person("Alexander", "Krasnitz", email="krasnitz@cshl.edu", role=c("aut"))) License: Apache License (>= 2) @@ -25,7 +28,9 @@ Depends: R (>= 4.2.0), SNPRelate, stats, utils, - GENESIS + GENESIS, + dplyr, + Rsamtools Imports: S4Vectors, GenomicRanges, ensembldb, @@ -36,15 +41,22 @@ Imports: S4Vectors, pROC, IRanges, AnnotationFilter, - rlang + rlang, + VariantAnnotation, + MatrixGenerics, + ggplot2, + stringr Suggests: testthat, knitr, rmarkdown, BiocStyle, withr, - BSgenome.Hsapiens.UCSC.hg38 + GenomeInfoDb, + BSgenome.Hsapiens.UCSC.hg38, + EnsDb.Hsapiens.v86 BugReports: https://github.com/KrasnitzLab/RAIDS/issues URL: https://krasnitzlab.github.io/RAIDS/ biocViews: Genetics, Software, Sequencing, WholeGenome, PrincipalComponent, - GeneticVariability, DimensionReduction -RoxygenNote: 7.2.3 + GeneticVariability, DimensionReduction, BiocViews +Roxygen: list(markdown = TRUE) +RoxygenNote: 7.3.2 diff --git a/NAMESPACE b/NAMESPACE index a13fc8dd1..c062f7143 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,37 +1,42 @@ # Generated by roxygen2: do not edit by hand export(add1KG2SampleGDS) -export(addBlockFromPlink2GDS) +export(addBlockFromDetFile) export(addGeneBlockGDSRefAnnot) -export(addPhase1KG2SampleGDSFromFile) -export(addPhase1KG2SampleGDSFromGDS) +export(addGeneBlockRefAnnot) export(addRef2GDS1KG) export(addStudy1Kg) -export(appendStudy2GDS1KG) -export(basePCASample) +export(computeAncestryFromSynthetic) export(computeAncestryFromSyntheticFile) export(computeKNNRefSample) export(computeKNNRefSynthetic) export(computePCAMultiSynthetic) export(computePCARefSample) -export(computePoolSyntheticAncestry) export(computePoolSyntheticAncestryGr) -export(computePrunedPCARef) export(computeSyntheticROC) +export(createAUROCGraph) +export(createAccuracyGraph) export(createStudy2GDS1KG) export(estimateAllelicFraction) export(generateGDS1KG) export(generateMapSnvSel) export(generatePhase1KG2GDS) +export(generatePhaseRef) export(getRef1KGPop) +export(getRefSuperPop) export(groupChr1KGSNV) export(identifyRelative) +export(identifyRelativeRef) +export(inferAncestry) +export(inferAncestryDNA) +export(inferAncestryGeneAware) export(prepPed1KG) export(prepSynthetic) -export(projectSample2PCA) export(pruningSample) export(runExomeAncestry) +export(runRNAAncestry) export(select1KGPop) +export(select1KGPopForSynthetic) export(snvListVCF) export(splitSelectByPop) export(syntheticGeno) @@ -41,7 +46,15 @@ importFrom(BSgenome,strand) importFrom(GENESIS,pcairPartition) importFrom(GenomicRanges,GRanges) importFrom(GenomicRanges,reduce) +importFrom(GenomicRanges,seqnames) +importFrom(GenomicRanges,start) +importFrom(GenomicRanges,width) importFrom(IRanges,IRanges) +importFrom(MatrixGenerics,rowRanges) +importFrom(Rsamtools,BamFile) +importFrom(Rsamtools,PileupParam) +importFrom(Rsamtools,ScanBamParam) +importFrom(Rsamtools,pileup) importFrom(S4Vectors,Rle) importFrom(S4Vectors,aggregate) importFrom(S4Vectors,isSingleNumber) @@ -52,7 +65,12 @@ importFrom(SNPRelate,snpgdsOpen) importFrom(SNPRelate,snpgdsPCA) importFrom(SNPRelate,snpgdsPCASNPLoading) importFrom(SNPRelate,snpgdsPCASampLoading) +importFrom(VariantAnnotation,geno) +importFrom(VariantAnnotation,readVcf) importFrom(class,knn) +importFrom(dplyr,"%>%") +importFrom(dplyr,group_by) +importFrom(dplyr,summarize) importFrom(ensembldb,exonsBy) importFrom(ensembldb,genes) importFrom(ensembldb,toSAF) @@ -71,9 +89,23 @@ importFrom(gdsfmt,read.gdsn) importFrom(gdsfmt,readmode.gdsn) importFrom(gdsfmt,sync.gds) importFrom(gdsfmt,write.gdsn) +importFrom(ggplot2,aes) +importFrom(ggplot2,element_rect) +importFrom(ggplot2,element_text) +importFrom(ggplot2,facet_grid) +importFrom(ggplot2,geom_line) +importFrom(ggplot2,geom_ribbon) +importFrom(ggplot2,ggplot) +importFrom(ggplot2,ggtitle) +importFrom(ggplot2,scale_colour_manual) +importFrom(ggplot2,theme) +importFrom(ggplot2,theme_classic) +importFrom(ggplot2,ylab) +importFrom(ggplot2,ylim) importFrom(methods,is) importFrom(pROC,multiclass.roc) importFrom(pROC,roc) +importFrom(rlang,.data) importFrom(rlang,arg_match) importFrom(stats,dbinom) importFrom(stats,mad) @@ -82,6 +114,7 @@ importFrom(stats,pbinom) importFrom(stats,qbinom) importFrom(stats,quantile) importFrom(stats,rmultinom) +importFrom(stringr,str_detect) importFrom(utils,read.csv) importFrom(utils,read.csv2) importFrom(utils,read.delim) diff --git a/R/RAIDS.R b/R/RAIDS.R index b7741e1c3..4a04dcf18 100644 --- a/R/RAIDS.R +++ b/R/RAIDS.R @@ -25,8 +25,6 @@ #' https://doi.org/10.1158/0008-5472.CAN-22-0682 #' #' -#' @docType package -#' #' @name RAIDS-package #' #' @aliases RAIDS-package RAIDS @@ -38,37 +36,683 @@ #' Pascal Belleau #' #' @seealso -#' \itemize{ -#' \item \code{\link{prepPed1KG}} {This function extracts the -#' needed information from the 1000 Genomes pedigree file and formats it -#' into a \code{data.frame} so in can be used in following steps -#' of the ancestry inference process.} -#' \item \code{\link{generateMapSnvSel}} {The function applies a cut-off -#' filter to the SNP information file to retain only the SNP that have a -#' frequency superior or equal to the specified cut-off in at least one -#' super population.} -#' \item \code{\link{generateGDS1KG}} {This function generates the GDS -#' file that will contain the information from 1KG. } -#' \item \code{\link{identifyRelative}} {The function identify patients -#' that are genetically related in the 1KG GDS file. } -#' \item \code{\link{addRef2GDS1KG}} { This function adds the information -#' about the unrelated patients to the 1KG GDS file. } -#' \item \code{\link{add1KG2SampleGDS}} { This function adds the genotype -#' information for the list of pruned SNVs into the Profile GDS file } -#' \item \code{\link{appendStudy2GDS1KG}} { This function creates the -#' Sample GDS file(s) for one or multiple specific samples -#' using the information from a Sample RDS description file and the 1KG -#' GDS file. } -#' \item \code{\link{estimateAllelicFraction}} { This function estimates -#' the allelic fraction of the pruned SNVs for a specific sample and add -#' the information to the associated GDS Sample file. The allelic fraction -#' estimation method is adapted to the type of study (DNA or RNA). } -#' \item \code{\link{computeSyntheticROC}} { This function calculate the -#' AUROC of the inferences for specific values of D and K using the -#' inferred ancestry results from the synthetic profiles.} +#' \describe{ +#' \item{\code{\link{runExomeAncestry}}}{This function runs most steps +#' leading to the ancestry inference call on a specific exome profile.} +#' \item{\code{\link{runExomeAncestry}}}{This function runs most steps +#' leading to the ancestry inference call on a specific RNA profile.} +#' \item{\code{\link{createAccuracyGraph}}}{The function extracts the +#' required information from an output generated by RAIDS to create a +#' graphic representation of the accuracy for different values of +#' PCA dimensions and K-neighbors through all tested ancestries.} #' } #' #' @return RAIDS #' @encoding UTF-8 #' @keywords package +#' @keywords internal +"_PACKAGE" + + +#' A small \code{data.frame} containing the information related to +#' synthetic profiles. The ancestry of the profiles used to generate the +#' synthetic profiles must be present. +#' +#' The object is a \code{data.frame} with 7 columns. The row names of +#' the \code{data.frame} must be the profile unique identifiers. +#' +#' This dataset can be +#' used to test the \code{\link{computeSyntheticROC}} function. +#' +#' @name pedSynthetic +#' +#' @docType data +#' +#' @aliases pedSynthetic +#' +#' @format The \code{data.frame} containing the information about the +#' synthetic profiles. The row names of +#' the \code{data.frame} correspond to the profile unique identifiers. +#' The \code{data.frame} contains 7 columns: +#' \describe{ +#' \item{\code{data.id}}{ a \code{character} string representing the unique +#' synthetic profile identifier.} +#' \item{\code{case.id}}{ a \code{character} string representing the unique +#' profile identifier that was used to generate the synthetic profile.} +#' \item{\code{sample.type}}{ a \code{character} string representing the type +#' of profile. } +#' \item{\code{diagnosis}}{ a \code{character} string representing the +#' diagnosis of profile that was used to generate the synthetic profile. } +#' \item{\code{source}}{ a \code{character} string representing the +#' source of the synthetic profile. } +#' \item{\code{study.id}}{ a \code{character} string representing the +#' name of the study to which the synthetic profile is associated. } +#' \item{\code{superPop}}{ a \code{character} string representing the +#' super population of the profile that was used to generate the synthetic +#' profile. } +#' } +#' +#' @return The \code{data.frame} containing the information about the +#' synthetic profiles. The row names of +#' the \code{data.frame} correspond to the profile unique identifiers. +#' The \code{data.frame} contains 7 columns: +#' \describe{ +#' \item{\code{data.id}}{ a \code{character} string representing the unique +#' synthetic profile identifier.} +#' \item{\code{case.id}}{ a \code{character} string representing the unique +#' profile identifier that was used to generate the synthetic profile.} +#' \item{\code{sample.type}}{ a \code{character} string representing the type +#' of profile.} +#' \item{\code{diagnosis}}{ a \code{character} string representing the +#' diagnosis of profile that was used to generate the synthetic profile. } +#' \item{\code{source}}{ a \code{character} string representing the +#' source of the synthetic profile. } +#' \item{\code{study.id}}{ a \code{character} string representing the +#' name of the study to which the synthetic profile is associated. } +#' \item{\code{superPop}}{ a \code{character} string representing the +#' super population of the profile that was used to generate the synthetic +#' profile. } +#' } +#' +#' @seealso +#' \describe{ +#' \item{\code{\link{computeSyntheticROC}}}{ for calculating the AUROC of +#' the inferences for specific values of D and K using the inferred +#' ancestry results from the synthetic profiles} +#' } +#' +#' @usage data(pedSynthetic) +#' +#' @keywords datasets +#' +#' @examples +#' +#' ## Loading demo dataset containing pedigree information for synthetic +#' ## profiles +#' data(pedSynthetic) +#' +#' ## Loading demo dataset containing the inferred ancestry results +#' ## for the synthetic data +#' data(matKNNSynthetic) +#' +#' ## Retain one K and one D value +#' matKNN <- matKNNSynthetic[matKNNSynthetic$D == 5 & matKNNSynthetic$K == 4, ] +#' +#' ## Compile statistics from the +#' ## synthetic profiles for fixed values of D and K +#' results <- RAIDS:::computeSyntheticROC(matKNN=matKNN, +#' matKNNAncestryColumn="SuperPop", +#' pedCall=pedSynthetic, pedCallAncestryColumn="superPop", +#' listCall=c("EAS", "EUR", "AFR", "AMR", "SAS")) +#' +#' results$matAUROC.All +#' results$matAUROC.Call +#' results$listROC.Call +#' +#' +NULL + + +#' A small \code{data.frame} containing the +#' inferred ancestry on the synthetic profiles. +#' +#' The object is a \code{data.frame} with 4 columns. +#' +#' This dataset can be +#' used to test the \code{\link{computeSyntheticROC}} function. +#' +#' @name matKNNSynthetic +#' +#' @docType data +#' +#' @aliases matKNNSynthetic +#' +#' @format The \code{data.frame} containing the information about the +#' synthetic profiles. The \code{data.frame} contains 4 columns: +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' synthetic profile identifier.} +#' \item{\code{D}}{ a \code{numeric} representing the number of dimensions used +#' to infer the ancestry of the synthetic profile.} +#' \item{\code{K}}{ a \code{numeric} representing the number of neighbors used +#' to infer the ancestry of the synthetic profile.} +#' \item{\code{SuperPop}}{ a \code{character} string representing the +#' inferred ancestry of the synthetic profile for the specific D and K values.} +#' } +#' +#' @return The \code{data.frame} containing the information about the +#' synthetic profiles. The \code{data.frame} contains 4 columns: +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' synthetic profile identifier.} +#' \item{\code{D}}{ a \code{numeric} representing the number of dimensions used +#' to infer the ancestry of the synthetic profile.} +#' \item{\code{K}}{ a \code{numeric} representing the number of neighbors used +#' to infer the ancestry of the synthetic profile.} +#' \item{\code{SuperPop}}{ a \code{character} string representing the +#' inferred ancestry of the synthetic profile for the specific D and K values.} +#' } +#' +#' @seealso +#' \describe{ +#' \item{\code{\link{computeSyntheticROC}}}{ for calculating the AUROC of +#' the inferences for specific values of D and K using the inferred +#' ancestry results from the synthetic profiles} +#' } +#' +#' @usage data(matKNNSynthetic) +#' +#' @keywords datasets +#' +#' @examples +#' +#' ## Loading demo dataset containing pedigree information for synthetic +#' ## profiles +#' data(pedSynthetic) +#' +#' ## Loading demo dataset containing the inferred ancestry results +#' ## for the synthetic data +#' data(matKNNSynthetic) +#' +#' ## Retain one K and one D value +#' matKNN <- matKNNSynthetic[matKNNSynthetic$D == 5 & matKNNSynthetic$K == 4, ] +#' +#' ## Compile statistics from the +#' ## synthetic profiles for fixed values of D and K +#' results <- RAIDS:::computeSyntheticROC(matKNN=matKNN, +#' matKNNAncestryColumn="SuperPop", +#' pedCall=pedSynthetic, pedCallAncestryColumn="superPop", +#' listCall=c("EAS", "EUR", "AFR", "AMR", "SAS")) +#' +#' results$matAUROC.All +#' results$matAUROC.Call +#' results$listROC.Call +#' +#' +NULL + + +#' A small \code{data.frame} containing the +#' SNV information. +#' +#' The object is a \code{data.frame} with 17 columns. +#' +#' This dataset can be +#' used to test the \code{\link{calcAFMLRNA}} and \code{\link{tableBlockAF}} +#' internal functions. +#' +#' @name snpPositionDemo +#' +#' @docType data +#' +#' @aliases snpPositionDemo +#' +#' @format The \code{data.frame} containing the information about the +#' synthetic profiles. The \code{data.frame} contains 4 columns: +#' \describe{ +#' \item{\code{cnt.tot}}{ a \code{integer} representing the number of reads at +#' the SNV position.} +#' \item{\code{cnt.ref}}{ a \code{integer} representing the number of reads +#' corresponding to the reference at the SNV position.} +#' \item{\code{cnt.alt}}{ a \code{integer} representing the number of reads +#' different than the reference at the SNV position.} +#' \item{\code{snp.pos}}{ a \code{integer} representing the position of the +#' SNV on the chromosome.} +#' \item{\code{snp.chr}}{ a \code{integer} representing the chromosome on which +#' the SNV is located.} +#' \item{\code{normal.geno}}{ a \code{integer} representing the genotype +#' (0=wild-type reference; 1=heterozygote; 2=homozygote alternative; +#' 3=unkown).} +#' \item{\code{pruned}}{ a \code{logical} indicated if the SNV is pruned.} +#' \item{\code{snp.index}}{ a \code{integer} representing the index of the +#' SNV in the reference SNV GDS file.} +#' \item{\code{keep}}{ a \code{logical} indicated if the genotype +#' exists for the SNV.} +#' \item{\code{hetero}}{ a \code{logical} indicated if the SNV is +#' heterozygote.} +#' \item{\code{homo}}{ a \code{logical} indicated if the SNV is homozygote.} +#' \item{\code{block.id}}{ a \code{integer} representing the block identifier +#' associated to the current SNV.} +#' \item{\code{phase}}{ a \code{integer} representing the block identifier +#' associated to the current SNV.} +#' \item{\code{lap}}{ a \code{numeric} representing the lower allelic +#' fraction.} +#' \item{\code{LOH}}{ a \code{integer} indicating if the SNV is in an LOH +#' region (0=not LOH, 1=in LOH).} +#' \item{\code{imbAR}}{ a \code{integer} indicating if the SNV is in an +#' imbalanced region (-1=not classified as imbalanced or LOH, 0=in LOH; +#' 1=tested positive for imbalance in at least 1 window).} +#' \item{\code{freq}}{ a \code{numeric} representing the frequency of the +#' variant in the the reference.} +#' } +#' +#' @return The \code{data.frame} containing the information about the +#' synthetic profiles. The \code{data.frame} contains 4 columns: +#' \describe{ +#' \item{\code{cnt.tot}}{ a \code{integer} representing the number of reads at +#' the SNV position.} +#' \item{\code{cnt.ref}}{ a \code{integer} representing the number of reads +#' corresponding to the reference at the SNV position.} +#' \item{\code{cnt.alt}}{ a \code{integer} representing the number of reads +#' different than the reference at the SNV position.} +#' \item{\code{snp.pos}}{ a \code{integer} representing the position of the +#' SNV on the chromosome.} +#' \item{\code{snp.chr}}{ a \code{integer} representing the chromosome on which +#' the SNV is located.} +#' \item{\code{normal.geno}}{ a \code{integer} representing the genotype +#' (0=wild-type reference; 1=heterozygote; 2=homozygote alternative; 3=unkown).} +#' \item{\code{pruned}}{ a \code{logical} indicated if the SNV is pruned.} +#' \item{\code{snp.index}}{ a \code{integer} representing the index of the +#' SNV in the reference SNV GDS file.} +#' \item{\code{keep}}{ a \code{logical} indicated if the genotype +#' exists for the SNV.} +#' \item{\code{hetero}}{ a \code{logical} indicated if the SNV is heterozygote.} +#' \item{\code{homo}}{ a \code{logical} indicated if the SNV is homozygote.} +#' \item{\code{block.id}}{ a \code{integer} representing the block identifier +#' associated to the current SNV.} +#' \item{\code{phase}}{ a \code{integer} representing the block identifier +#' associated to the current SNV.} +#' \item{\code{lap}}{ a \code{numeric} representing the lower allelic fraction.} +#' \item{\code{LOH}}{ a \code{integer} indicating if the SNV is in an LOH region +#' (0=not LOH, 1=in LOH).} +#' \item{\code{imbAR}}{ a \code{integer} indicating if the SNV is in an +#' imbalanced region (-1=not classified as imbalanced or LOH, 0=in LOH; +#' 1=tested positive for imbalance in at least 1 window).} +#' \item{\code{freq}}{ a \code{numeric} representing the frequency of the +#' variant in the the reference.} +#' } +#' +#' @usage data(snpPositionDemo) +#' +#' @keywords datasets +#' +#' @examples +#' +#' ## Loading demo dataset containing SNV information +#' data(snpPositionDemo) +#' +#' ## Only use a subset of heterozygote SNVs related to one block +#' subset <- snpPositionDemo[which(snpPositionDemo$block.id == 2750 & +#' snpPositionDemo$hetero), c("cnt.ref", "cnt.alt", "phase")] +#' +#' ## Compute the log likelihood ratio based on the coverage of +#' ## each allele in a specific block +#' result <- RAIDS:::calcAFMLRNA(subset) +#' head(result) +#' +#' +NULL + + +#' The PCA results of the demo 1KG reference dataset for demonstration purpose. +#' Beware that the PCA has been run on a very small subset of the +#' 1KG reference dataset +#' and should not be used to call ancestry inference on a real profile. +#' +#' The object is a \code{list}. +#' +#' This object can be +#' used to test the \code{\link{computePCAMultiSynthetic}} function. +#' +#' @name demoPCA1KG +#' +#' @docType data +#' +#' @aliases demoPCA1KG +#' +#' @format The \code{list} containing the PCA results for a small subset of +#' the reference 1KG dataset. The \code{list} contains 2 entries: +#' \describe{ +#' \item{pruned}{ a \code{vector} of SNV identifiers specifying selected SNVs +#' for the PCA analysis.} +#' \item{pca.unrel}{ a \code{snpgdsPCAClass} object containing the eigenvalues +#' as generated by \link[SNPRelate]{snpgdsPCA} function.} +#' } +#' +#' @return The \code{list} containing the PCA results for a small subset of +#' the reference 1KG dataset. The \code{list} contains 2 entries: +#' \describe{ +#' \item{pruned}{ a \code{vector} of SNV identifiers specifying selected SNVs +#' for the PCA analysis.} +#' \item{pca.unrel}{ a \code{snpgdsPCAClass} object containing the eigenvalues +#' as generated by \link[SNPRelate]{snpgdsPCA} function.} +#' } +#' +#' @usage data(demoPCA1KG) +#' +#' @keywords datasets +#' +#' @examples +#' +#' ## Required library +#' library(gdsfmt) +#' +#' ## Loading demo PCA on subset of 1KG reference dataset +#' data(demoPCA1KG) +#' +#' ## Path to the demo Profile GDS file is located in this package +#' dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") +#' +#' # The name of the synthetic study +#' studyID <- "MYDATA.Synthetic" +#' +#' samplesRM <- c("HG00246", "HG00325", "HG00611", "HG01173", "HG02165", +#' "HG01112", "HG01615", "HG01968", "HG02658", "HG01850", "HG02013", +#' "HG02465", "HG02974", "HG03814", "HG03445", "HG03689", "HG03789", +#' "NA12751", "NA19107", "NA18548", "NA19075", "NA19475", "NA19712", +#' "NA19731", "NA20528", "NA20908") +#' names(samplesRM) <- c("GBR", "FIN", "CHS","PUR", "CDX", "CLM", "IBS", +#' "PEL", "PJL", "KHV", "ACB", "GWD", "ESN", "BEB", "MSL", "STU", "ITU", +#' "CEU", "YRI", "CHB", "JPT", "LWK", "ASW", "MXL", "TSI", "GIH") +#' +#' ## Open the Profile GDS file +#' gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) +#' +#' ## Projects synthetic profiles on demo 1KG PCA +#' results <- computePCAMultiSynthetic(gdsProfile=gdsProfile, +#' listPCA=demoPCA1KG, sampleRef=samplesRM, studyIDSyn=studyID, +#' verbose=FALSE) +#' +#' ## The eigenvectors for the synthetic profiles +#' head(results$eigenvector) +#' +#' ## Close Profile GDS file (important) +#' closefn.gds(gdsProfile) +#' +NULL + + +#' The PCA result of demo synthetic profiles projected on the demo subset +#' 1KG reference PCA. +#' +#' The object is a \code{list}. +#' +#' This object can be +#' used to test the \code{\link{computeKNNRefSynthetic}} function. +#' +#' @name demoPCASyntheticProfiles +#' +#' @docType data +#' +#' @aliases demoPCASyntheticProfiles +#' +#' @format The \code{list} containing the PCA result of demo synthetic +#' profiles projected on the demo subset 1KG reference PCA. +#' The \code{list} contains 3 entries: +#' \describe{ +#' \item{sample.id}{ a \code{character} string representing the unique +#' identifier of the synthetic profiles.} +#' \item{eigenvector.ref}{ a \code{matrix} of \code{numeric} containing +#' the eigenvectors for the reference profiles.} +#' \item{eigenvector}{ a \code{matrix} of \code{numeric} containing the +#' eigenvectors for the current synthetic profiles projected on the demo +#' PCA 1KG reference profiles.} +#' } +#' +#' @return The \code{list} containing the PCA result of demo synthetic +#' profiles projected on the demo subset 1KG reference PCA. +#' The \code{list} contains 3 entries: +#' \describe{ +#' \item{sample.id}{ a \code{character} string representing the unique +#' identifier of the synthetic profiles.} +#' \item{eigenvector.ref}{ a \code{matrix} of \code{numeric} containing +#' the eigenvectors for the reference profiles.} +#' \item{eigenvector}{ a \code{matrix} of \code{numeric} containing the +#' eigenvectors for the current synthetic profiles projected on the demo +#' PCA 1KG reference profiles.} +#' } +#' +#' @seealso +#' \describe{ +#' \item{\code{\link{computeKNNRefSynthetic}}}{ for running a k-nearest +#' neighbors analysis on a subset of the synthetic data set.} +#' } +#' +#' @usage data(demoPCASyntheticProfiles) +#' +#' @keywords datasets +#' +#' @examples +#' +#' ## Required library +#' library(gdsfmt) +#' +#' ## Load the demo PCA on the synthetic profiles projected on the +#' ## demo 1KG reference PCA +#' data(demoPCASyntheticProfiles) +#' +#' ## Load the known ancestry for the demo 1KG reference profiles +#' data(demoKnownSuperPop1KG) +#' +#' ## Path to the demo Profile GDS file is located in this package +#' dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") +#' +#' ## Open the Profile GDS file +#' gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) +#' +#' # The name of the synthetic study +#' studyID <- "MYDATA.Synthetic" +#' +#' ## Projects synthetic profiles on 1KG PCA +#' results <- computeKNNRefSynthetic(gdsProfile=gdsProfile, +#' listEigenvector=demoPCASyntheticProfiles, +#' listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), studyIDSyn=studyID, +#' spRef=demoKnownSuperPop1KG) +#' +#' ## The inferred ancestry for the synthetic profiles for different values +#' ## of D and K +#' head(results$matKNN) +#' +#' ## Close Profile GDS file (important) +#' closefn.gds(gdsProfile) +#' +NULL + + +#' The known super population ancestry of the demo 1KG reference profiles. +#' +#' The object is a \code{vector}. +#' +#' This object can be +#' used to test the \code{\link{computeKNNRefSynthetic}} and +#' \code{\link{computePoolSyntheticAncestryGr}} functions. +#' +#' @name demoKnownSuperPop1KG +#' +#' @docType data +#' +#' @aliases demoKnownSuperPop1KG +#' +#' @format The \code{vector} containing the know super population ancestry +#' for the demo 1KG reference profiles. +#' +#' @return The \code{vector} containing the know super population ancestry +#' for the demo 1KG reference profiles. +#' +#' @seealso +#' \describe{ +#' \item{\code{\link{computeKNNRefSynthetic}}}{ for running a k-nearest +#' neighbors analysis on a subset of the synthetic data set.} +#' \item{\code{\link{computePoolSyntheticAncestryGr}}}{ for running a +#' PCA analysis using 1 synthetic profile from each sub-continental +#' population.} +#' } +#' +#' @usage data(demoKnownSuperPop1KG) +#' +#' @keywords datasets +#' +#' @examples +#' +#' ## Required library +#' library(gdsfmt) +#' +#' ## Load the demo PCA on the synthetic profiles projected on the +#' ## demo 1KG reference PCA +#' data(demoPCASyntheticProfiles) +#' +#' ## Load the known ancestry for the demo 1KG reference profiles +#' data(demoKnownSuperPop1KG) +#' +#' ## Path to the demo Profile GDS file is located in this package +#' dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") +#' +#' ## Open the Profile GDS file +#' gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) +#' +#' # The name of the synthetic study +#' studyID <- "MYDATA.Synthetic" +#' +#' ## Projects synthetic profiles on 1KG PCA +#' results <- computeKNNRefSynthetic(gdsProfile=gdsProfile, +#' listEigenvector=demoPCASyntheticProfiles, +#' listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), studyIDSyn=studyID, +#' spRef=demoKnownSuperPop1KG) +#' +#' ## The inferred ancestry for the synthetic profiles for different values +#' ## of D and K +#' head(results$matKNN) +#' +#' ## Close Profile GDS file (important) +#' closefn.gds(gdsProfile) +#' +NULL + + +#' The pedigree information about a demo profile called 'ex1'. +#' +#' The object is a \code{data.frame}. +#' +#' This object can be +#' used to test the \code{\link{runExomeAncestry}} function. +#' +#' @name demoPedigreeEx1 +#' +#' @docType data +#' +#' @aliases demoPedigreeEx1 +#' +#' @format The \code{data.frame} containing the information about a demo +#' profile called 'ex1'. the \code{data.frame} has 5 columns: +#' \describe{ +#' \item{Name.ID}{ a \code{character} string representing the unique +#' identifier of the profile.} +#' \item{Case.ID}{ a \code{character} string representing the unique +#' identifier of the case associated to the profile.} +#' \item{Sample.Type}{ a \code{character} string describing the type of +#' profile.} +#' \item{Diagnosis}{ a \code{character} string describing the diagnosis of the +#' profile.} +#' \item{Source}{ a \code{character} string describing the source of the +#' profile.} +#' } +#' +#' +#' @return The \code{data.frame} containing the information about a demo +#' profile called 'ex1'. the \code{data.frame} has 5 columns: +#' \describe{ +#' \item{Name.ID}{ a \code{character} string representing the unique +#' identifier of the profile.} +#' \item{Case.ID}{ a \code{character} string representing the unique +#' identifier of the case associated to the profile.} +#' \item{Sample.Type}{ a \code{character} string describing the type of +#' profile.} +#' \item{Diagnosis}{ a \code{character} string describing the diagnosis of the +#' profile.} +#' \item{Source}{ a \code{character} string describing the source of the +#' profile.} +#' } +#' +#' @seealso +#' \describe{ +#' \item{\code{\link{runExomeAncestry}}}{ for running runs most +#' steps leading to the ancestry inference call on a specific exome +#' profile.} +#' } +#' +#' @usage data(demoPedigreeEx1) +#' +#' @keywords datasets +#' +#' @examples +#' +#' +#' ## Required library for GDS +#' library(SNPRelate) +#' +#' ## Path to the demo 1KG GDS file is located in this package +#' dataDir <- system.file("extdata", package="RAIDS") +#' +#' ################################################################# +#' ## Load the information about the profile +#' ################################################################# +#' data(demoPedigreeEx1) +#' head(demoPedigreeEx1) +#' +#' ################################################################# +#' ## The 1KG GDS file and the 1KG SNV Annotation GDS file +#' ## need to be located in the same directory +#' ## Note that the 1KG GDS file used for this example is a +#' ## simplified version and CANNOT be used for any real analysis +#' ################################################################# +#' path1KG <- file.path(dataDir, "tests") +#' +#' fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") +#' fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") +#' +#' ################################################################# +#' ## The Sample SNP pileup files (one per sample) need +#' ## to be located in the same directory. +#' ################################################################# +#' pathGeno <- file.path(dataDir, "example", "snpPileup") +#' +#' ################################################################# +#' ## The path where the Profile GDS Files (one per sample) +#' ## will be created need to be specified. +#' ################################################################# +#' pathProfileGDS <- file.path(tempdir(), "out.tmp") +#' +#' pathOut <- file.path(tempdir(), "res.out") +#' +#' ################################################################# +#' ## A data frame containing general information about the study +#' ## is also required. The data frame must have +#' ## those 3 columns: "studyID", "study.desc", "study.platform" +#' ################################################################# +#' studyDF <- data.frame(study.id="MYDATA", +#' study.desc="Description", +#' study.platform="PLATFORM", +#' stringsAsFactors=FALSE) +#' +#' #################################################################### +#' ## Fix seed to ensure reproducible results +#' #################################################################### +#' set.seed(2043) +#' +#' gds1KG <- snpgdsOpen(fileReferenceGDS) +#' dataRef <- select1KGPop(gds1KG, nbProfiles=2L) +#' closefn.gds(gds1KG) +#' +#' ## Required library for this example to run correctly +#' if (requireNamespace("GenomeInfoDb", quietly=TRUE) && +#' requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { +#' +#' ## Chromosome length information +#' ## chr23 is chrX, chr24 is chrY and chrM is 25 +#' chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] +#' +#' \donttest{ +#' runExomeAncestry(pedStudy=demoPedigreeEx1, studyDF=studyDF, +#' pathProfileGDS=pathProfileGDS, +#' pathGeno=pathGeno, pathOut=pathOut, +#' fileReferenceGDS=fileReferenceGDS, +#' fileReferenceAnnotGDS=fileAnnotGDS, +#' chrInfo=chrInfo, syntheticRefDF=dataRef, +#' genoSource="snp-pileup") +#' +#' unlink(pathProfileGDS, recursive=TRUE, force=TRUE) +#' unlink(pathOut, recursive=TRUE, force=TRUE) +#' } +#' } +#' NULL diff --git a/R/allelicFraction.R b/R/allelicFraction.R index 640becc19..a37e1647f 100644 --- a/R/allelicFraction.R +++ b/R/allelicFraction.R @@ -1,172 +1,13 @@ -#' @title TODO -#' -#' @description TODO -#' -#' @param snp.pos a \code{data.frame} containing the genotype information for -#' a SNV dataset. TODO -#' -#' @param w a single positive \code{numeric} representing the size of the -#' window to compute the allelic fraction. -#' Default: \code{10}. -#' -#' @param cutOff a single \code{numeric} representing TODO. Default: \code{-3}. -#' -#' @return a \code{matrix} of \code{numeric} with 3 columns where each -#' row represent a segment -#' of imbalanced SNVs. The first column represents the position, in -#' \code{snp.pos}, of the first -#' SNV in the segment. The second column represents the position, in the -#' \code{snp.pos}, of the last SNV in the segment. The third column represents -#' the lower allelic frequency of the segment and is \code{NA} when the value -#' cannot be calculated. The value \code{NULL} is -#' returned when none of the SNVs -#' tested positive for the imbalance. -#' -#' @examples -#' -#' ## Data frame with SNV information for the specified chromosome (chr 1) -#' snpInfo <- data.frame(cnt.tot=c(41, 17, 27, 15, 11, 37, 16, 32), -#' cnt.ref=c(40, 17, 27, 15, 4, 14, 16, 32), -#' cnt.alt=c(0, 0, 0, 0, 7, 23, 0, 0), -#' snp.pos=c(3722256, 3722328, 3767522, 3868160, 3869467, 4712655, -#' 6085318, 6213145), -#' snp.chr=c(rep(1, 8)), -#' normal.geno=c(rep(1, 8)), -#' pruned=c(TRUE, TRUE, FALSE, TRUE, FALSE, rep(TRUE, 3)), -#' snp.index=c(160, 162, 204, 256, 259, 288, 366, 465), -#' keep=rep(TRUE, 8), hetero=c(rep(FALSE, 4), TRUE, TRUE, rep(FALSE, 2)), -#' homo=c(rep(TRUE, 4), FALSE, FALSE, TRUE, TRUE), -#' lap=rep(-1, 8), LOH=rep(0, 8), imbAR=rep(-1, 8), -#' stringAsFactor=FALSE) -#' -#' ## The function returns NULL when there is not imbalanced SNVs -#' RAIDS:::computeAlleleFraction(snp.pos=snpInfo, w=10, cutOff=-3) -#' -#' -#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @importFrom stats median -#' @importFrom S4Vectors isSingleNumber -#' @encoding UTF-8 -#' @keywords internal -computeAlleleFraction <- function(snp.pos, w=10, cutOff=-3) { - - listBlockAR <- list() - j <- 1 - tmp <- as.integer(snp.pos$imbAR == 1) - z <- cbind(c(tmp[1], tmp[-1] - tmp[seq_len(length(tmp) -1)]), - c(tmp[-1] - tmp[seq_len(length(tmp) -1)], tmp[length(tmp)] * -1)) - - ## Split SNVs by segment of continuous imbalanced SNVs - ## There must be at least one segment with imbalanced SNVs to go one - if(length(which(z[,1] == 1)) > 0) { - ## Find segmentsof imbalanced SNVs - segImb <- data.frame(start=seq_len(nrow(snp.pos))[which(z[,1] > 0)], - end=seq_len(nrow(snp.pos))[which(z[,2] < 0)]) - - for(i in seq_len(nrow(segImb))) { - # index of the segment - listSeg <- (segImb$start[i]):(segImb$end[i]) - # index hetero segment - listHetero <- listSeg[snp.pos[listSeg,"hetero"] == TRUE] - # SNP hetero for the segment - snp.hetero <- snp.pos[listHetero,] - - if(nrow(snp.hetero) >= 2 * w) { - lapCur <- median(apply(snp.hetero[seq_len(w), - c("cnt.ref", "cnt.alt")], 1, min) / - (rowSums(snp.hetero[seq_len(w),c("cnt.ref", "cnt.alt")]))) - - start <- 1 - k <- w + 1 - while(k < nrow(snp.hetero)) { - # We have (k+w-1) <= nrow(snp.hetero) - # Case 1 true because (nrow(snp.hetero) >= 2 * w - # Other case nrow(snp.hetero) >= w+k - 1 - curWin <- testAlleleFractionChange(snp.hetero[k:(k+w-1), - c("cnt.ref", "cnt.alt")], cutOff, lapCur) - - if(curWin$pCut1 == 1){ # new Region the allelicFraction - # table of the index of the block with lapCur - listBlockAR[[j]] <- c(listHetero[start], - listHetero[k], lapCur) - - lapCur <- median(apply(snp.hetero[k:(k+w-1), - c("cnt.ref", "cnt.alt")], 1, min) / - (rowSums(snp.hetero[k:(k+w-1), - c("cnt.ref", "cnt.alt")]))) - - start <- k - - if(nrow(snp.hetero) - start < w) { # Close the segment - lapCur <- - median(apply(snp.hetero[start:nrow(snp.hetero), - c("cnt.ref", "cnt.alt")], 1, min) / - (rowSums(snp.hetero[start:nrow(snp.hetero), - c("cnt.ref", "cnt.alt")]))) - - listBlockAR[[j]] <- c(listHetero[start], - segImb$end[i], lapCur) - - j <- j+1 - k <- nrow(snp.hetero) - }else{ # nrow(snp.hetero) >= w+k - k<- k + 1 - j <- j + 1 - - } - }else{ # keep the same region - if((nrow(snp.hetero) - k ) < w){ # close - lapCur <- - median(apply(snp.hetero[start:nrow(snp.hetero), - c("cnt.ref", "cnt.alt")], 1, min) / - (rowSums(snp.hetero[start:nrow(snp.hetero), - c("cnt.ref", "cnt.alt")]))) - - listBlockAR[[j]] <- c(listHetero[start], - segImb$end[i], lapCur) - - j <- j + 1 - - k <- nrow(snp.hetero) - } else{ # continue nrow(snp.hetero) >= w+k - lapCur <- median(apply(snp.hetero[start:k, - c("cnt.ref", "cnt.alt")], 1, min) / - (rowSums(snp.hetero[start:k,c("cnt.ref", - "cnt.alt")]))) - - k <- k + 1 - } - } - }# End while - }else { - lapCur <- median(apply(snp.hetero[, c("cnt.ref", "cnt.alt")], - 1, min) / (rowSums(snp.hetero[,c("cnt.ref", - "cnt.alt")]))) - - listBlockAR[[j]] <- c(segImb$start[i], segImb$end[i], lapCur) - - j <- j + 1 - } - } - } - - # note NULL if length(listBlockAR) == 0 - listBlockAR <- do.call(rbind, listBlockAR) - - return(listBlockAR) -} - - #' @title Estimate the allelic fraction of the pruned SNVs for a specific #' profile #' #' @description The function estimates the allelic fraction of the -#' SNVs for a specific prfile and add the information to the associated +#' SNVs for a specific profile and add the information to the associated #' Profile GDS file. The allelic fraction estimation method is adapted to #' the type of study (DNA or RNA). #' #' @param gdsReference an object of class \code{\link[gdsfmt]{gds.class}} -#' (a GDS file), the opened 1KG GDS file. +#' (a GDS file), the opened Reference GDS file. #' #' @param gdsProfile an object of class \code{\link[gdsfmt]{gds.class}} #' (a GDS file), the opened Profile GDS file. @@ -188,7 +29,8 @@ computeAlleleFraction <- function(snp.pos, w=10, cutOff=-3) { #' @param minCov a single positive \code{integer} representing the minimum #' required coverage. Default: \code{10L}. #' -#' @param minProb a single \code{numeric} between 0 and 1 representing TODO. +#' @param minProb a single \code{numeric} between 0 and 1 representing the +#' probability that the calculated genotype call is correct. #' Default: \code{0.999}. #' #' @param eProb a single \code{numeric} between 0 and 1 representing the @@ -210,7 +52,7 @@ computeAlleleFraction <- function(snp.pos, w=10, cutOff=-3) { #' Default: \code{3}. #' #' @param gdsRefAnnot an object of class \code{\link[gdsfmt]{gds.class}} -#' (a GDS file), the1 1KG Annotation GDS file. +#' (a GDS file), the opened Reference SNV Annotation GDS file. #' This parameter is RNA specific. #' Default: \code{NULL}. #' @@ -227,22 +69,17 @@ computeAlleleFraction <- function(snp.pos, w=10, cutOff=-3) { #' #' The `chrInfo` parameter contains the length of the chromosomes. The #' length of the chromosomes can be obtain through the -#' \code{\link[BSgenome]{BSgenome-class}} +#' \code{\link[GenomeInfoDb]{seqlengths}} #' library. #' -#' As example: +#' As example, for hg38 genome: #' #' ``` #' -#' library(BSgenome.Hsapiens.UCSC.hg38) -#' -#' chrInfo <- integer(25L) -#' -#' for(i in seq_len(22L)){ chrInfo[i] <- length(Hsapiens[[paste0("chr", i)]])} -#' -#' chrInfo[23] <- length(Hsapiens[["chrX"]]) -#' chrInfo[24] <- length(Hsapiens[["chrY"]]) -#' chrInfo[25] <- length(Hsapiens[["chrM"]]) +#' if (requireNamespace("GenomeInfoDb", quietly=TRUE) && +#' requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { +#' chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] +#' } #' #' ``` #' @@ -251,61 +88,52 @@ computeAlleleFraction <- function(snp.pos, w=10, cutOff=-3) { #' ## Required library for GDS #' library(gdsfmt) #' -#' ## Path to the demo 1KG GDS file is located in this package +#' ## Path to the demo 1KG GDS file located in this package #' dataDir <- system.file("extdata/tests", package="RAIDS") -#' fileGDS <- file.path(dataDir, "ex1_good_small_1KG_GDS.gds") +#' fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") +#' +#' ## Profile GDS file for one profile +#' fileProfile <- file.path(tempdir(), "ex1.gds") #' #' ## Copy the Profile GDS file demo that has been pruned and annotated -#' ## into a test directory (deleted after the example has been run) -#' dataDirAllelicFraction <- file.path(system.file("extdata", package="RAIDS"), -#' "demoAllelicFraction") -#' dir.create(dataDirAllelicFraction, showWarnings=FALSE, -#' recursive=FALSE, mode="0777") +#' ## into current directory #' file.copy(file.path(dataDir, "ex1_demo_with_pruning_and_1KG_annot.gds"), -#' file.path(dataDirAllelicFraction, "ex1.gds")) +#' fileProfile) #' #' ## Open the reference GDS file (demo version) #' gds1KG <- snpgdsOpen(fileGDS) #' #' ## Profile GDS file for one profile -#' fileProfile <- file.path(dataDirAllelicFraction, "ex1.gds") #' profileGDS <- openfn.gds(fileProfile, readonly=FALSE) #' -#' ## Chromosome length information -#' ## chr23 is chrX, chr24 is chrY and chrM is 25 -#' chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L, -#' 181538259L, 170805979L, 159345973L, 145138636L, 138394717L, 133797422L, -#' 135086622L, 133275309L, 114364328L, 107043718L, 101991189L, 90338345L, -#' 83257441L, 80373285L, 58617616L, 64444167L, 46709983L, 50818468L, -#' 156040895L, 57227415L, 16569L) -#' -#' ## A formal way to get the chormosome length information -#' ## library(BSgenome.Hsapiens.UCSC.hg38) -#' ## chrInfo <- integer(25L) -#' ## for(i in seq_len(22L)){ chrInfo[i] <- -#' ## length(Hsapiens[[paste0("chr", i)]])} -#' ## chrInfo[23] <- length(Hsapiens[["chrX"]]) -#' ## chrInfo[24] <- length(Hsapiens[["chrY"]]) -#' ## chrInfo[25] <- length(Hsapiens[["chrM"]]) -#' -#' ## Estimate the allelic fraction of the pruned SNVs -#' estimateAllelicFraction(gdsReference=gds1KG, gdsProfile=profileGDS, -#' currentProfile="ex1", studyID="MYDATA", chrInfo=chrInfo, -#' studyType="DNA", minCov=10L, minProb=0.999, eProb=0.001, -#' cutOffLOH=-5, cutOffHomoScore=-3, wAR=9, cutOffAR=3, -#' gdsRefAnnot=NULL, blockID=NULL) -#' -#' ## The allelic fraction is saved in the 'lap' node of the Profile GDS file -#' ## The 'lap' entry should be present -#' profileGDS -#' -#' ## Close both GDS files (important) -#' closefn.gds(profileGDS) -#' closefn.gds(gds1KG) -#' -#' ## Unlink Profile GDS file (created for demo purpose) -#' unlink(file.path(dataDirAllelicFraction, "ex1.gds")) -#' unlink(dataDirAllelicFraction) +#' ## Required library for this example to run correctly +#' if (requireNamespace("GenomeInfoDb", quietly=TRUE) && +#' requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { +#' +#' ## Chromosome length information +#' ## chr23 is chrX, chr24 is chrY and chrM is 25 +#' chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] +#' +#' ## Estimate the allelic fraction of the pruned SNVs +#' estimateAllelicFraction(gdsReference=gds1KG, gdsProfile=profileGDS, +#' currentProfile="ex1", studyID="MYDATA", chrInfo=chrInfo, +#' studyType="DNA", minCov=10L, minProb=0.999, eProb=0.001, +#' cutOffLOH=-5, cutOffHomoScore=-3, wAR=9, cutOffAR=3, +#' gdsRefAnnot=NULL, blockID=NULL) +#' +#' ## The allelic fraction is saved in the 'lap' node of Profile GDS file +#' ## The 'lap' entry should be present +#' profileGDS +#' +#' ## Close both GDS files (important) +#' closefn.gds(profileGDS) +#' closefn.gds(gds1KG) +#' +#' ## Remove Profile GDS file (created for demo purpose) +#' unlink(fileProfile, force=TRUE) +#' +#' } +#' #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom rlang arg_match @@ -326,39 +154,50 @@ estimateAllelicFraction <- function(gdsReference, gdsProfile, ## Set study type studyType <- arg_match(studyType) - snp.pos <- NULL + snpPos <- NULL ## The type of study affects the allelic fraction estimation if(studyType == "DNA") { - snp.pos <- computeAllelicFractionDNA(gdsReference=gdsReference, + snpPos <- computeAllelicFractionDNA(gdsReference=gdsReference, gdsSample=gdsProfile, currentProfile=currentProfile, studyID=studyID, chrInfo=chrInfo, minCov=minCov, minProb=minProb, eProb=eProb, cutOffLOH=cutOffLOH, cutOffHomoScore=cutOffHomoScore, wAR=wAR, verbose=verbose) } else if(studyType == "RNA") { - snp.pos <- computeAllelicFractionRNA(gdsReference=gdsReference, + snpPos <- computeAllelicFractionRNA(gdsReference=gdsReference, gdsSample=gdsProfile, gdsRefAnnot=gdsRefAnnot, currentProfile=currentProfile, studyID=studyID, blockID=blockID, chrInfo=chrInfo, minCov=minCov, minProb=minProb, eProb=eProb, cutOffLOH=cutOffLOH, cutOffAR=cutOffAR, verbose=verbose) } - snp.pos$seg <- rep(0, nrow(snp.pos)) - k <- 1 - # Find segment with same lap - for(chr in seq_len(22)) { - snpChr <- snp.pos[snp.pos$snp.chr == chr, ] + + + ## Calculate the cumulative sum for each chromosome + cumSumResult <- lapply(unique(snpPos$snp.chr), function(i) { + snpChr <- snpPos[snpPos$snp.chr == i, ] tmp <- c(0, abs(snpChr[2:nrow(snpChr), "lap"] - - snpChr[seq_len(nrow(snpChr)- 1), "lap"]) > 1e-3) - snp.pos$seg[snp.pos$snp.chr == chr] <- cumsum(tmp) + k - k <- max(snp.pos$seg[snp.pos$snp.chr == chr]) + 1 + snpChr[seq_len(nrow(snpChr)- 1), "lap"]) > 1e-3) + return(cumsum(tmp)) + }) + + # Find segment with same lap + snpPos$seg <- rep(0, nrow(snpPos)) + k <- 1 + for(i in seq_len(length(unique(snpPos$snp.chr)))) { + ##snpChr <- snpPos[snpPos$snp.chr == chr, ] + ##tmp <- c(0, abs(snpChr[2:nrow(snpChr), "lap"] - + ## snpChr[seq_len(nrow(snpChr)- 1), "lap"]) > 1e-3) + chr <- unique(snpPos$snp.chr)[i] + snpPos$seg[snpPos$snp.chr == chr] <- cumSumResult[[i]] + k + k <- max(snpPos$seg[snpPos$snp.chr == chr]) + 1 } ## Save information into the "lap" node in the Profile GDS file ## Save information into the "segment" node in the Profile GDS file ## Suppose we keep only the pruned SNVs - addUpdateLap(gdsProfile, snp.pos$lap[which(snp.pos$pruned == TRUE)]) - addUpdateSegment(gdsProfile, snp.pos$seg[which(snp.pos$pruned == TRUE)]) + addUpdateLap(gdsProfile, snpPos$lap[which(snpPos$pruned == TRUE)]) + addUpdateSegment(gdsProfile, snpPos$seg[which(snpPos$pruned == TRUE)]) return(0L) } diff --git a/R/allelicFraction_internal.R b/R/allelicFraction_internal.R index ba8a780c5..5ccd480bb 100644 --- a/R/allelicFraction_internal.R +++ b/R/allelicFraction_internal.R @@ -1,13 +1,13 @@ #' @title Extract the genotype information for a SNV dataset using -#' the Profile GDS file and the 1KG GDS file +#' the Profile GDS file and the Reference GDS file #' #' @description The function generates a \code{data.frame} containing the #' genotype information from a initial list of SNVs associated to a specific -#' profile. The function uses the information present in the 1KG GDS file -#' (reference) and the Profile GDS file. +#' profile. The function uses the information present in the Reference GDS file +#' and the Profile GDS file. #' #' @param gdsReference an object of class \code{\link[gdsfmt]{gds.class}} (a -#' GDS file), the opened 1KG GDS file. +#' GDS file), the opened Reference GDS file. #' #' @param gdsSample an object of class \code{\link[gdsfmt]{gds.class}} #' (a GDS file), the opened Profile GDS file. @@ -32,25 +32,25 @@ #' when the function is running. #' #' @return a \code{data.frame} containing: -#' \itemize{ -#' \item{cnt.tot} {a single \code{integer} representing the total coverage for +#' \describe{ +#' \item{cnt.tot}{ a single \code{integer} representing the total coverage for #' the SNV.} -#' \item{cnt.ref} {a single \code{integer} representing the coverage for +#' \item{cnt.ref}{ a single \code{integer} representing the coverage for #' the reference allele.} -#' \item{cnt.alt} {a single \code{integer} representing the coverage for +#' \item{cnt.alt}{ a single \code{integer} representing the coverage for #' the alternative allele.} -#' \item{snp.pos} {a single \code{integer} representing the SNV position.} -#' \item{snp.chr} {a single \code{integer} representing the SNV chromosome.} -#' \item{normal.geno} {a single \code{numeric} indicating the genotype of the +#' \item{snpPos}{ a single \code{integer} representing the SNV position.} +#' \item{snp.chr}{ a single \code{integer} representing the SNV chromosome.} +#' \item{normal.geno}{ a single \code{numeric} indicating the genotype of the #' SNV. The possibles are: \code{0} (wild-type homozygote), \code{1} #' (heterozygote), \code{2} (altenative homozygote), \code{3} indicating that #' the normal genotype is unknown.} -#' \item{pruned} { a \code{logical}} -#' \item{snp.index} {a \code{vector} of \code{integer} representing the -#' position of the SNVs in the 1KG GDS file.} -#' \item{keep} {a \code{logical} } -#' \item{hetero} {a \code{logical} } -#' \item{homo} {a \code{logical} } +#' \item{pruned}{ a \code{logical}} +#' \item{snp.index}{ a \code{vector} of \code{integer} representing the +#' position of the SNVs in the Reference GDS file.} +#' \item{keep}{ a \code{logical} } +#' \item{hetero}{ a \code{logical} } +#' \item{homo}{ a \code{logical} } #' } #' #' @examples @@ -60,37 +60,34 @@ #' #' ## Path to the demo 1KG GDS file is located in this package #' dataDir <- system.file("extdata/tests", package="RAIDS") -#' fileGDS <- file.path(dataDir, "ex1_good_small_1KG_GDS.gds") +#' fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") +#' +#' ## Temporary Profile GDS file for one profile in temporary directory +#' fileProfile <- file.path(tempdir(), "ex1.gds") #' #' ## Copy the Profile GDS file demo that has been pruned and annotated -#' ## into a test directory (deleted after the example has been run) -#' dataDirAllelicFraction <- file.path(system.file("extdata", package="RAIDS"), -#' "demoAllelicFraction") -#' dir.create(dataDirAllelicFraction, showWarnings=FALSE, -#' recursive=FALSE, mode="0777") #' file.copy(file.path(dataDir, "ex1_demo_with_pruning_and_1KG_annot.gds"), -#' file.path(dataDirAllelicFraction, "ex1.gds")) +#' fileProfile) #' #' ## Open the reference GDS file (demo version) #' gds1KG <- snpgdsOpen(fileGDS) #' -#' ## Profile GDS file for one profile -#' fileProfile <- file.path(dataDirAllelicFraction, "ex1.gds") +#' ## Open Profile GDS file for one profile #' profileGDS <- openfn.gds(fileProfile) #' #' ## The function returns a data frame containing the SNVs information #' result <- RAIDS:::getTableSNV(gdsReference=gds1KG, gdsSample=profileGDS, -#' currentProfile="ex1", studyID="MYDATA", minCov=10L, minProb=0.999, -#' eProb=0.001, verbose=FALSE) +#' currentProfile="ex1", studyID="MYDATA", minCov=10L, minProb=0.999, +#' eProb=0.001, verbose=FALSE) #' head(result) #' #' ## Close both GDS files (important) #' closefn.gds(profileGDS) #' closefn.gds(gds1KG) #' -#' ## Unlink Profile GDS file (created for demo purpose) -#' unlink(file.path(dataDirAllelicFraction, "ex1.gds")) -#' unlink(dataDirAllelicFraction) +#' ## Remove Profile GDS file (created for demo purpose) +#' unlink(fileProfile, force=TRUE) +#' #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt index.gdsn read.gdsn @@ -115,7 +112,7 @@ getTableSNV <- function(gdsReference, gdsSample, currentProfile, studyID, listKeep <- cnt.total@i[which(cnt.total@x >= minCov)] + 1 ## Create the data.frame with the required information - snp.pos <- data.frame(cnt.tot=cnt.total[listKeep], + snpPos <- data.frame(cnt.tot=cnt.total[listKeep], cnt.ref=read.gdsn(index.gdsn(gdsSample, "Ref.count"), start=c(1, posCur), count=c(-1, 1))[listKeep], cnt.alt=read.gdsn(index.gdsn(gdsSample, "Alt.count"), @@ -131,7 +128,7 @@ getTableSNV <- function(gdsReference, gdsSample, currentProfile, studyID, snp.pruned <- read.gdsn(index.gdsn(node=gdsSample, "snp.index")) listKeepPruned <- which(listKeep %in% snp.pruned) - snp.pos$pruned[listKeepPruned] <- TRUE + snpPos$pruned[listKeepPruned] <- TRUE rm(cnt.total, snp.pruned, listKeepPruned) @@ -146,7 +143,7 @@ getTableSNV <- function(gdsReference, gdsSample, currentProfile, studyID, cnt.total <- read.gdsn(index.gdsn(gdsSample, "Total.count.o")) listKeep.o <- which(cnt.total >= minCov) - snp.pos.o <- data.frame(cnt.tot=cnt.total[listKeep.o], + snpPosO <- data.frame(cnt.tot=cnt.total[listKeep.o], cnt.ref=read.gdsn(index.gdsn(gdsSample, "Ref.count.o"))[listKeep.o], cnt.alt=read.gdsn(index.gdsn(gdsSample, "Alt.count.o"))[listKeep.o], snp.pos=read.gdsn(index.gdsn(gdsReference, @@ -156,36 +153,36 @@ getTableSNV <- function(gdsReference, gdsSample, currentProfile, studyID, normal.geno=read.gdsn(index.gdsn(gdsSample, "normal.geno"))[listKeep.o], pruned=rep(0, length(listKeep)), snp.index=rep(0, length(listKeep.o)), stringsAsFactors=FALSE) - listChr <- unique(snp.pos.o$snp.chr) + listChr <- unique(snpPosO$snp.chr) listUnion <- list() - # if snp.pos.o intersect snp.pos and normal.geno != 3 (we know - # the genotype of normal) change in snp.pos normal.geno - z <- cbind(c(snp.pos.o$snp.chr, snp.pos$snp.chr, snp.pos.o$snp.chr), - c(snp.pos.o$snp.pos, snp.pos$snp.pos, snp.pos.o$snp.pos), - c(seq_len(nrow(snp.pos.o)), 0, -1*seq_len(nrow(snp.pos.o))), - c(rep(0, nrow(snp.pos.o)), seq_len(nrow(snp.pos)), - rep(0, nrow(snp.pos.o)))) + # if snpPosO intersect snpPos and normal.geno != 3 (we know + # the genotype of normal) change in snpPos normal.geno + z <- cbind(c(snpPosO$snp.chr, snpPos$snp.chr, snpPosO$snp.chr), + c(snpPosO$snp.pos, snpPos$snp.pos, snpPosO$snp.pos), + c(seq_len(nrow(snpPosO)), 0, -1*seq_len(nrow(snpPosO))), + c(rep(0, nrow(snpPosO)), seq_len(nrow(snpPos)), + rep(0, nrow(snpPosO)))) z <- z[order(z[,1], z[,2], z[,3]), ] vCum <- cumsum(z[,3]) - snp.pos[z[ vCum < 0 & z[,3] == 0, 4], "normal.geno"] <- - snp.pos.o[vCum[vCum < 0 & z[, 3] == 0], "normal.geno"] + snpPos[z[ vCum < 0 & z[,3] == 0, 4], "normal.geno"] <- + snpPosO[vCum[vCum < 0 & z[, 3] == 0], "normal.geno"] rm(z) - # Keep the snp.pos.o not in snp.pos - z <- cbind(c(snp.pos$snp.chr, snp.pos.o$snp.chr, snp.pos$snp.chr), - c(snp.pos$snp.pos, snp.pos.o$snp.pos, snp.pos$snp.pos), - c(seq_len(nrow(snp.pos)), 0, -1*seq_len(nrow(snp.pos))), - c(rep(0, nrow(snp.pos)), seq_len(nrow(snp.pos.o)), - rep(0, nrow(snp.pos)))) + # Keep the snpPosO not in snpPos + z <- cbind(c(snpPos$snp.chr, snpPosO$snp.chr, snpPos$snp.chr), + c(snpPos$snp.pos, snpPosO$snp.pos, snpPos$snp.pos), + c(seq_len(nrow(snpPos)), 0, -1*seq_len(nrow(snpPos))), + c(rep(0, nrow(snpPos)), seq_len(nrow(snpPosO)), + rep(0, nrow(snpPos)))) z <- z[order(z[,1], z[,2], z[,3]), ] - # merge snp.pos with snp.pos.o - snp.pos <- rbind(snp.pos, - snp.pos.o[z[cumsum(z[,3] == 0 & z[,3] == 0),4],]) + # merge snpPos with snpPosO + snpPos <- rbind(snpPos, + snpPosO[z[cumsum(z[,3] == 0 & z[,3] == 0),4],]) } - listCnt <- unique(snp.pos$cnt.tot) + listCnt <- unique(snpPos$cnt.tot) listCnt <- listCnt[order(listCnt)] cutOffA <- data.frame(count=unlist(vapply(listCnt, @@ -198,32 +195,32 @@ getTableSNV <- function(gdsReference, gdsSample, currentProfile, studyID, FUN.VALUE = numeric(1), minProb=minProb, eProb=eProb))) row.names(cutOffA) <- as.character(listCnt) - snp.pos$keep <- rowSums(snp.pos[, c("cnt.ref", "cnt.alt")]) >= - snp.pos$cnt.tot - cutOffA[as.character(snp.pos$cnt.tot), "count"] + snpPos$keep <- rowSums(snpPos[, c("cnt.ref", "cnt.alt")]) >= + snpPos$cnt.tot - cutOffA[as.character(snpPos$cnt.tot), "count"] - snp.pos$hetero <- snp.pos$keep == TRUE & - rowSums(snp.pos[, c("cnt.ref", "cnt.alt")] >= - cutOffA[as.character(snp.pos$cnt.tot), "allele"]) == 2 + snpPos$hetero <- snpPos$keep == TRUE & + rowSums(snpPos[, c("cnt.ref", "cnt.alt")] >= + cutOffA[as.character(snpPos$cnt.tot), "allele"]) == 2 # We set to homo if 2th allele can be explain by error # can switch low allelic fraction to LOH which is less a problem # then reduce the allelic ratio by seq error - snp.pos$homo <- snp.pos$keep == TRUE & - rowSums(snp.pos[, c("cnt.ref", "cnt.alt")] >= - cutOffA[as.character(snp.pos$cnt.tot), "allele"]) == 1 + snpPos$homo <- snpPos$keep == TRUE & + rowSums(snpPos[, c("cnt.ref", "cnt.alt")] >= + cutOffA[as.character(snpPos$cnt.tot), "allele"]) == 1 ## If we know the normal is hetero then we call hetero ## if the cnt.alt and cnt.ref > 0 - listHeteroN <- which(snp.pos$homo == TRUE & - rowSums(snp.pos[, c("cnt.ref", "cnt.alt")] > 0) == 2 & - snp.pos$normal.geno == 1) + listHeteroN <- which(snpPos$homo == TRUE & + rowSums(snpPos[, c("cnt.ref", "cnt.alt")] > 0) == 2 & + snpPos$normal.geno == 1) if (length(listHeteroN) > 0) { - snp.pos$hetero[listHeteroN] <- TRUE - snp.pos$homo <- FALSE + snpPos$hetero[listHeteroN] <- TRUE + snpPos$homo <- FALSE } - return(snp.pos) + return(snpPos) } @@ -234,109 +231,123 @@ getTableSNV <- function(gdsReference, gdsSample, currentProfile, studyID, #' #' @param gdsReference an object of class #' \code{\link[SNPRelate:SNPGDSFileClass]{SNPRelate::SNPGDSFileClass}}, an -#' opened 1KG GDS file. +#' opened Reference GDS file. #' #' @param chrInfo a \code{vector} of \code{integer} representing the length of #' the chromosomes. As an example, the information ca be obtained from #' package 'BSgenome.Hsapiens.UCSC.hg38'. #' -#' @param snp.pos a \code{data.frame} containing the SNV information for the +#' @param snpPos a \code{data.frame} containing the SNV information for the #' chromosome specified by the \code{chr} argument. The \code{data.frame} must #' contain: -#' \itemize{ -#' \item{cnt.tot} {a single \code{integer} representing the total coverage for +#' \describe{ +#' \item{cnt.tot}{ a single \code{integer} representing the total coverage for #' the SNV.} -#' \item{cnt.ref} {a single \code{integer} representing the coverage for +#' \item{cnt.ref}{ a single \code{integer} representing the coverage for #' the reference allele.} -#' \item{cnt.alt} {a single \code{integer} representing the coverage for +#' \item{cnt.alt}{ a single \code{integer} representing the coverage for #' the alternative allele.} -#' \item{snp.pos} {a single \code{integer} representing the SNV position.} -#' \item{snp.chr} {a single \code{integer} representing the SNV chromosome.} -#' \item{normal.geno} {a single \code{numeric} indicating the genotype of the +#' \item{snp.pos}{ a single \code{integer} representing the SNV position.} +#' \item{snp.chr}{ a single \code{integer} representing the SNV chromosome.} +#' \item{normal.geno}{ a single \code{numeric} indicating the genotype of the #' SNV. The possibles are: \code{0} (wild-type homozygote), \code{1} #' (heterozygote), \code{2} (altenative homozygote), \code{3} indicating that #' the normal genotype is unknown.} -#' \item{pruned} {a \code{logical} indicating if the SNV is retained after +#' \item{pruned}{ a \code{logical} indicating if the SNV is retained after #' pruning} -#' \item{snp.index} {a \code{integer} representing the index position of the -#' SNV in the 1KG GDS file that contains all SNVs} -#' \item{keep} {a \code{logical} indicating if the genotype exists for the SNV} -#' \item{hetero} {a \code{logical} indicating if the SNV is heterozygote} -#' \item{homo} {a \code{logical} indicating if the SNV is homozygote} +#' \item{snp.index}{ a \code{integer} representing the index position of the +#' SNV in the Reference GDS file that contains all SNVs} +#' \item{keep}{ a \code{logical} indicating if the genotype exists for the SNV} +#' \item{hetero}{ a \code{logical} indicating if the SNV is heterozygote} +#' \item{homo}{ a \code{logical} indicating if the SNV is homozygote} #' } #' #' @param chr a single positive \code{integer} for the current chromosome. The #' \code{chrInfo} parameter must contain the value for the specified #' chromosome. #' -#' @param genoN a single \code{numeric} between 0 and 1 representing TODO. -#' Default: \code{0.0001}. +#' @param genoN a single \code{numeric} between 0 and 1 representing the +#' probability of sequencing error. Default: \code{0.0001}. #' #' @return a \code{data.frame} with the informations about LOH on a specific #' chromosome. The \code{data.frame} contains those columns: -#' \itemize{ -#' \item{chr} {a \code{integer} representing the current chromosome} -#' \item{start} {a \code{integer} representing the starting position on the +#' \describe{ +#' \item{chr}{ a \code{integer} representing the current chromosome} +#' \item{start}{ a \code{integer} representing the starting position on the #' box containing only homozygote SNVs (or not SNV). The first box starts at -#' position 1.} -#' \item{end} {a \code{integer} representing the end position on the +#' position \code{1}.} +#' \item{end}{ a \code{integer} representing the end position on the #' box containing only homozygote SNVs (or not SNV). The last box ends at the #' length of the chromosome.} -#' \item{logLHR} {TODO} -#' \item{LH1} {TODO} -#' \item{LM1} {TODO} -#' \item{homoScore} {a \code{numeric} representing \code{LH1} - \code{LM1}} -#' \item{nbSNV} {a \code{integer} representing th number of SNVs in +#' \item{logLHR}{ a \code{numeric} representing the LOH score basde on +#' population frequencies. It is the sum of +#' the log10 of the frequencies of the observed gegenotype minus the +#' the sum of the log10 of the higher frequent genotype. +#' (-100 when normal genotype are present)} +#' \item{LH1}{ a \code{numeric} representing the probability to be +#' heterozygote based on the coverage of each allele when normal +#' genotype is present} +#' \item{LM1}{ a \code{numeric} representing the max probability +#' for the read coverage at the position} +#' \item{homoScore}{ a \code{numeric} representing \code{LH1} - \code{LM1}} +#' \item{nbSNV}{ a \code{integer} representing th number of SNVs in #' the box} -#' \item{nbPruned} {a \code{integer} representing th number of pruned SNVs in +#' \item{nbPruned}{ a \code{integer} representing the number of pruned SNVs in #' the box} -#' \item{nbNorm} {TODO} -#' \item{LOH} {TODO} +#' \item{nbNorm}{ a \code{integer} representing of the number of +#' heterozygote genotypes for the normal SNVs in the block} +#' \item{LOH}{ a \code{integer} representing a flag, if \code{1} it means +#' the block is satisfying the criteria to be LOH. The value is not assigned +#' in this function; the value \code{0} is assigned} #' } #' #' @examples #' #' ## Required library for GDS -#' library(gdsfmt) +#' library(SNPRelate) #' -#' ## Path to the demo 1KG GDS file is located in this package +#' ## Path to the demo Reference GDS file is located in this package #' dataDir <- system.file("extdata/tests", package="RAIDS") -#' fileGDS <- file.path(dataDir, "ex1_good_small_1KG_GDS.gds") +#' fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") #' -#' ## Open the reference GDS file (demo version) +#' ## Open the Reference GDS file (demo version) #' gds1KG <- snpgdsOpen(fileGDS) #' -#' ## Chromosome length information -#' ## chr23 is chrX, chr24 is chrY and chrM is 25 -#' chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L, -#' 181538259L, 170805979L, 159345973L, 145138636L, 138394717L, 133797422L, -#' 135086622L, 133275309L, 114364328L, 107043718L, 101991189L, 90338345L, -#' 83257441L, 80373285L, 58617616L, 64444167L, 46709983L, 50818468L, -#' 156040895L, 57227415L, 16569L) +#' ## Required library for this example to run correctly +#' if (requireNamespace("GenomeInfoDb", quietly=TRUE) && +#' requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { +#' +#' ## Chromosome length information +#' ## chr23 is chrX, chr24 is chrY and chrM is 25 +#' chrInfo <- +#' GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] +#' +#' ## Data frame with SNV information for the specified chromosome (chr 1) +#' snpInfo <- data.frame(cnt.tot=c(41, 17, 27, 15, 11, 37, 16, 32), +#' cnt.ref=c(40, 17, 27, 15, 4, 14, 16, 32), +#' cnt.alt=c(0, 0, 0, 0, 7, 23, 0, 0), +#' snp.pos=c(3722256, 3722328, 3767522, 3868160, 3869467, 4712655, +#' 6085318, 6213145), +#' snp.chr=c(rep(1, 8)), +#' normal.geno=c(rep(3, 8)), pruned=c(TRUE, TRUE, FALSE, TRUE, FALSE, +#' TRUE, TRUE, TRUE), +#' pruned=c(TRUE, TRUE, FALSE, TRUE, FALSE, rep(TRUE, 3)), +#' snp.index=c(160, 162, 204, 256, 259, 288, 366, 465), +#' keep=rep(TRUE, 8), hetero=c(rep(FALSE, 4), TRUE, +#' TRUE, rep(FALSE, 2)), +#' homo=c(rep(TRUE, 4), FALSE, FALSE, TRUE, TRUE), +#' stringAsFactor=FALSE) +#' +#' ## The function returns a data frame containing the information about +#' ## the LOH regions in the specified chromosome +#' result <- RAIDS:::computeLOHBlocksDNAChr(gdsReference=gds1KG, +#' chrInfo=chrInfo, snpPos=snpInfo, chr=1L, genoN=0.0001) +#' head(result) +#' +#' ## Close Reference GDS file (important) +#' closefn.gds(gds1KG) #' -#' ## Data frame with SNV information for the specified chromosome (chr 1) -#' snpInfo <- data.frame(cnt.tot=c(41, 17, 27, 15, 11, 37, 16, 32), -#' cnt.ref=c(40, 17, 27, 15, 4, 14, 16, 32), -#' cnt.alt=c(0, 0, 0, 0, 7, 23, 0, 0), -#' snp.pos=c(3722256, 3722328, 3767522, 3868160, 3869467, 4712655, -#' 6085318, 6213145), -#' snp.chr=c(rep(1, 8)), -#' normal.geno=c(rep(3, 8)), pruned=c(TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, -#' TRUE, TRUE), -#' pruned=c(TRUE, TRUE, FALSE, TRUE, FALSE, rep(TRUE, 3)), -#' snp.index=c(160, 162, 204, 256, 259, 288, 366, 465), -#' keep=rep(TRUE, 8), hetero=c(rep(FALSE, 4), TRUE, TRUE, rep(FALSE, 2)), -#' homo=c(rep(TRUE, 4), FALSE, FALSE, TRUE, TRUE), -#' stringAsFactor=FALSE) -#' -#' ## The function returns a data frame containing the information about the -#' ## LOH regions in the specified chromosome -#' result <- RAIDS:::computeLOHBlocksDNAChr(gdsReference=gds1KG, -#' chrInfo=chrInfo, snp.pos=snpInfo, chr=1L, genoN=0.0001) -#' head(result) -#' -#' ## Close GDS file (important) -#' closefn.gds(gds1KG) +#' } #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt index.gdsn read.gdsn @@ -344,24 +355,24 @@ getTableSNV <- function(gdsReference, gdsSample, currentProfile, studyID, #' @importFrom S4Vectors isSingleNumber #' @encoding UTF-8 #' @keywords internal -computeLOHBlocksDNAChr <- function(gdsReference, chrInfo, snp.pos, chr, +computeLOHBlocksDNAChr <- function(gdsReference, chrInfo, snpPos, chr, genoN=0.0001) { genoN1 <- 1 - 2 * genoN chrEnd <- chrInfo[chr] - listHetero <- snp.pos[snp.pos$hetero == TRUE, "snp.pos"] + listHetero <- snpPos[snpPos$hetero == TRUE, "snp.pos"] homoBlock <- data.frame(chr=rep(chr, length(listHetero) + 1), start=c(1, listHetero + 1), end=c(listHetero, chrEnd)) z <- cbind(c(homoBlock$start, homoBlock$end, - snp.pos$snp.pos[which(snp.pos$homo == TRUE)]), + snpPos$snp.pos[which(snpPos$homo == TRUE)]), c(seq_len(length(homoBlock$start)), -1*seq_len(length(homoBlock$start)), - rep(0, length(which(snp.pos$homo == TRUE)))), + rep(0, length(which(snpPos$homo == TRUE)))), c(rep(0, length(homoBlock$start)), rep(0, length(homoBlock$start)), - seq_len(length(which(snp.pos$homo == TRUE))))) + seq_len(length(which(snpPos$homo == TRUE))))) z <- z[order(z[, 1]), ] @@ -382,63 +393,242 @@ computeLOHBlocksDNAChr <- function(gdsReference, chrInfo, snp.pos, chr, # Include a field for LOH but will be fill elsewhere homoBlock$LOH <- rep(0, nrow(homoBlock)) - for (i in seq_len(nrow(homoBlock))) { - blcCur <- blcSNV[blcSNV$block == i, ] - snvH <- snp.pos[blcCur$snv, ] - lH1 <- 0 - lM1 <- 0 - logLHR <- 0 - homoBlock$nbSNV[i] <- nrow(blcCur) - homoBlock$nbPruned[i] <- length(which(snvH$pruned)) - if (length(which(snvH$normal.geno != 3)) > 0) { - listCount <- snvH$cnt.tot[which(snvH$normal.geno == 1)] - homoBlock$nbNorm[i] <- length(listCount) - - lH1 <-sum(log10(apply(snvH[which(snvH$normal.geno == 1), - c("cnt.ref", "cnt.tot"), drop=FALSE], - 1, FUN=function(x){ - return(dbinom(x[1], x[2], 0.5)) - # genoN1 * dbinom(x[1], x[2], 0.5) + genoN - }))) - - lM1 <- sum(log10(apply(snvH[which(snvH$normal.geno == 1), - c("cnt.ref", "cnt.tot"), drop=FALSE], - 1, FUN=function(x){ - return(dbinom((x[2] + x[2]%%2)/2, x[2], 0.5)) - #genoN1 *dbinom((x[2] + x[2]%%2)/2, x[2], 0.5) + genoN - }))) - logLHR <- -100 - - } else if (length(which(snvH$pruned)) > 2) { - afSNV <- listAF[snvH$snp.index[which(snvH$pruned)]] - afSNV <- apply(X=matrix(afSNV, ncol=1), MARGIN=1, - FUN=function(x){max(x, 0.01)}) - snvR <- snvH$cnt.ref[which(snvH$pruned)] > - snvH$cnt.alt[which(snvH$pruned)] - - # Check if it is unlikely the genotype are homo by error - lH1 <- -100 - # Freq of the more likely geno - - tmp <- apply(matrix(afSNV, ncol=1), 1, - FUN=function(x){max(max(x, 1-x)^2, 2* x *(1-x)) }) - # log10 (prod(FreqAllele^2) / prod(freq of more likely genotype)) - # snvR * 1 + (-1)^snvR * afSNV freq of the genotype - # (snvR = 1 homo ref - # and 0 if homo alt) - logLHR <- sum(2 * log10(snvR * 1 + (-1)^snvR * afSNV)) - - sum(log10(tmp)) - } + homoBlock <- lapply(seq_len(nrow(homoBlock)), + FUN=function(i, homoBlock, blcSNV, listAF, snpPos){ + blcCur <- blcSNV[blcSNV$block == i, ] + snvH <- snpPos[blcCur$snv, ] + lH1 <- 0 + lM1 <- 0 + logLHR <- 0 + homoBlock$nbSNV[i] <- nrow(blcCur) + homoBlock$nbPruned[i] <- length(which(snvH$pruned)) + if (length(which(snvH$normal.geno != 3)) > 0) { + listCount <- snvH$cnt.tot[which(snvH$normal.geno + == 1)] + homoBlock$nbNorm[i] <- length(listCount) + if(homoBlock$nbNorm[i] > 0){ + lH1 <-sum(log10( + apply(snvH[which(snvH$normal.geno == 1), + c("cnt.ref", "cnt.tot"), drop=FALSE], + 1, FUN=function(x){ + return(dbinom(x[1], x[2], 0.5)) + ## genoN1 * dbinom(x[1], x[2], 0.5) + genoN + }))) - homoBlock$logLHR[i] <- max(logLHR, -100) - homoBlock$LH1[i] <- lH1 - homoBlock$LM1[i] <- lM1 - homoBlock$homoScore[i] <- lH1 - lM1 - } # end for each block + lM1 <- sum(log10(apply(snvH[which(snvH$normal.geno == 1), + c("cnt.ref", "cnt.tot"), drop=FALSE], + 1, FUN=function(x){ + return(dbinom((x[2] + x[2]%%2)/2, + x[2], 0.5)) + ## genoN1 *dbinom((x[2] + x[2]%%2)/2, x[2], 0.5) + genoN + }))) + logLHR <- -100 + } + + } else if (length(which(snvH$pruned)) > 2) { + afSNV <- listAF[snvH$snp.index[which(snvH$pruned)]] + afSNV <- apply(X=matrix(afSNV, ncol=1), MARGIN=1, + FUN=function(x){max(x, 0.01)}) + snvR <- snvH$cnt.ref[which(snvH$pruned)] > + snvH$cnt.alt[which(snvH$pruned)] + + ## Check if it is unlikely the genotype are + ## homo by error + lH1 <- -100 + # Freq of the more likely geno + + tmp <- apply(matrix(afSNV, ncol=1), 1, + FUN=function(x){max(max(x, 1-x)^2, + 2* x *(1-x)) }) + # log10 (prod(FreqAllele^2) / prod(freq of more + # likely genotype)) + # snvR * 1 + (-1)^snvR * afSNV freq of the genotype + # (snvR = 1 homo ref + # and 0 if homo alt) + logLHR <- sum(2 * log10(snvR * 1 + + (-1)^snvR * afSNV)) - sum(log10(tmp)) + } + + homoBlock$logLHR[i] <- max(logLHR, -100) + homoBlock$LH1[i] <- lH1 + homoBlock$LM1[i] <- lM1 + homoBlock$homoScore[i] <- lH1 - lM1 + return(homoBlock[i,]) + }, homoBlock=homoBlock, + blcSNV=blcSNV, listAF=listAF, snpPos=snpPos) + homoBlock <- do.call(rbind,homoBlock) return(homoBlock) } +#' @title Compute the allelic fraction for +#' each imbalanced segment +#' +#' @description This function computes the allelic fraction for each segment +#' different than 0.5. The allelic fraction of the segment can be decomposed in +#' sub-segments. +#' +#' @param snpPos a \code{data.frame} containing the genotype information for +#' a SNV dataset. +#' +#' @param w a single positive \code{numeric} representing the size of the +#' window to compute the allelic fraction. +#' Default: \code{10}. +#' +#' @param cutOff a \code{numeric} representing the cut-off for considering +#' a region imbalanced when comparing likelihood to gave allelic fraction +#' change and likelihood not to have allelic fraction change. +#' Default: \code{-3}. +#' +#' @return a \code{matrix} of \code{numeric} with 3 columns where each +#' row represent a segment +#' of imbalanced SNVs. The first column represents the position, in +#' \code{snpPos}, of the first +#' SNV in the segment. The second column represents the position, in the +#' \code{snpPos}, of the last SNV in the segment. The third column represents +#' the lower allelic frequency of the segment and is \code{NA} when the value +#' cannot be calculated. The value \code{NULL} is +#' returned when none of the SNVs +#' tested positive for the imbalance. +#' +#' @examples +#' +#' ## Data frame with SNV information for the specified chromosome (chr 1) +#' snpInfo <- data.frame(cnt.tot=c(41, 17, 27, 15, 11, 37, 16, 32), +#' cnt.ref=c(40, 17, 27, 15, 4, 14, 16, 32), +#' cnt.alt=c(0, 0, 0, 0, 7, 23, 0, 0), +#' snp.pos=c(3722256, 3722328, 3767522, 3868160, 3869467, 4712655, +#' 6085318, 6213145), +#' snp.chr=c(rep(1, 8)), +#' normal.geno=c(rep(1, 8)), +#' pruned=c(TRUE, TRUE, FALSE, TRUE, FALSE, rep(TRUE, 3)), +#' snp.index=c(160, 162, 204, 256, 259, 288, 366, 465), +#' keep=rep(TRUE, 8), hetero=c(rep(FALSE, 4), TRUE, TRUE, rep(FALSE, 2)), +#' homo=c(rep(TRUE, 4), FALSE, FALSE, TRUE, TRUE), +#' lap=rep(-1, 8), LOH=rep(0, 8), imbAR=rep(-1, 8), +#' stringAsFactor=FALSE) +#' +#' ## The function returns NULL when there is not imbalanced SNVs +#' RAIDS:::computeAlleleFraction(snpPos=snpInfo, w=10, cutOff=-3) +#' +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' @importFrom stats median +#' @importFrom S4Vectors isSingleNumber +#' @encoding UTF-8 +#' @keywords internal +computeAlleleFraction <- function(snpPos, w=10, cutOff=-3) { + + listBlockAR <- list() + + tmp <- as.integer(snpPos$imbAR == 1) + z <- cbind(c(tmp[1], tmp[-1] - tmp[seq_len(length(tmp) -1)]), + c(tmp[-1] - tmp[seq_len(length(tmp) -1)], tmp[length(tmp)] * -1)) + + ## Split SNVs by segment of continuous imbalanced SNVs + ## There must be at least one segment with imbalanced SNVs to go one + if(length(which(z[,1] == 1)) > 0) { + ## Find segmentsof imbalanced SNVs + segImb <- data.frame(start=seq_len(nrow(snpPos))[which(z[,1] > 0)], + end=seq_len(nrow(snpPos))[which(z[,2] < 0)]) + + listBlockAR <- lapply(seq_len(nrow(segImb)), + FUN=function(i, segImb, snpPos, w, cutOff){ + listBlockAR <- list() + j <- 1 + listSeg <- (segImb$start[i]):(segImb$end[i]) + # index hetero segment + listHetero <- listSeg[snpPos[listSeg,"hetero"] == TRUE] + # SNP hetero for the segment + snp.hetero <- snpPos[listHetero,] + + if(nrow(snp.hetero) >= 2 * w) { + lapCur <- median(apply(snp.hetero[seq_len(w), + c("cnt.ref", "cnt.alt")], 1, min) / + (rowSums(snp.hetero[seq_len(w), + c("cnt.ref", "cnt.alt")]))) + + start <- 1 + k <- w + 1 + while(k < nrow(snp.hetero)) { + # We have (k+w-1) <= nrow(snp.hetero) + # Case 1 true because (nrow(snp.hetero) >= 2 * w + # Other case nrow(snp.hetero) >= w+k - 1 + curWin <- testAlleleFractionChange(snp.hetero[k:(k+w-1), + c("cnt.ref", "cnt.alt")], cutOff, lapCur) + + if(curWin$pCut1 == 1){ # new Region the allelicFraction + # table of the index of the block with lapCur + listBlockAR[[j]] <- c(listHetero[start], + listHetero[k], lapCur) + + lapCur <- median(apply(snp.hetero[k:(k+w-1), + c("cnt.ref", "cnt.alt")], 1, min) / + (rowSums(snp.hetero[k:(k+w-1), + c("cnt.ref", "cnt.alt")]))) + + start <- k + + if(nrow(snp.hetero) - start < w) { # Close the segment + lapCur <- median(apply(snp.hetero[start:nrow(snp.hetero), + c("cnt.ref", "cnt.alt")], 1, min) / + (rowSums(snp.hetero[start:nrow(snp.hetero), + c("cnt.ref", "cnt.alt")]))) + + listBlockAR[[j]] <- c(listHetero[start], + segImb$end[i], lapCur) + + j <- j + 1 + k <- nrow(snp.hetero) + } else { # nrow(snp.hetero) >= w+k + k <- k + 1 + j <- j + 1 + } + } else { # keep the same region + if((nrow(snp.hetero) - k ) < w){ # close + lapCur <- median(apply(snp.hetero[start:nrow(snp.hetero), + c("cnt.ref", "cnt.alt")], 1, min) / + (rowSums(snp.hetero[start:nrow(snp.hetero), + c("cnt.ref", "cnt.alt")]))) + + listBlockAR[[j]] <- c(listHetero[start], + segImb$end[i], lapCur) + + j <- j + 1 + + k <- nrow(snp.hetero) + } else { # continue nrow(snp.hetero) >= w+k + lapCur <- median(apply(snp.hetero[start:k, + c("cnt.ref", "cnt.alt")], 1, min) / + (rowSums(snp.hetero[start:k,c("cnt.ref", + "cnt.alt")]))) + + k <- k + 1 + } + } + }# End while + } else { + lapCur <- + median(apply(snp.hetero[, c("cnt.ref", "cnt.alt")], + 1, min) / (rowSums(snp.hetero[,c("cnt.ref", + "cnt.alt")]))) + + listBlockAR[[j]] <- c(segImb$start[i], + segImb$end[i], lapCur) + + j <- j + 1 + } + listBlockAR <- do.call(rbind, listBlockAR) + return(listBlockAR) + }, segImb=segImb, snpPos=snpPos, w=w, cutOff=cutOff) + } + + # note NULL if length(listBlockAR) == 0 + listBlockAR <- do.call(rbind, listBlockAR) + # print(all.equal(listBlockAR, listBlockAR1)) + return(listBlockAR) +} + #' @title Estimate the allelic fraction of the pruned SNVs for a specific #' DNA-seq profile @@ -447,7 +637,7 @@ computeLOHBlocksDNAChr <- function(gdsReference, chrInfo, snp.pos, chr, #' allelic fraction for the pruned SNV dataset specific to a DNA-seq profile #' #' @param gdsReference an object of class \code{\link[gdsfmt]{gds.class}} -#' (a GDS file), the opened 1KG GDS file. +#' (a GDS file), the opened Reference GDS file. #' #' @param gdsSample an object of class \code{\link[gdsfmt]{gds.class}} #' (a GDS file), the opened Profile GDS file. @@ -489,74 +679,75 @@ computeLOHBlocksDNAChr <- function(gdsReference, chrInfo, snp.pos, chr, #' @return a \code{data.frame} containing the allelic information for the #' pruned SNV dataset with coverage > \code{minCov}. The \code{data.frame} #' contains those columns: -#' \itemize{ -#' \item{cnt.tot} {a \code{integer} representing the total allele count} -#' \item{cnt.ref} {a \code{integer} representing the reference allele count} -#' \item{cnt.alt} {a \code{integer} representing the alternative allele count} -#' \item{snp.pos} {a \code{integer} representing the position on the chromosome} -#' \item{snp.chr} {a \code{integer} representing the chromosome} -#' \item{normal.geno} {a \code{integer} representing the genotype +#' \describe{ +#' \item{cnt.tot}{ a \code{integer} representing the total allele count} +#' \item{cnt.ref}{ a \code{integer} representing the reference allele count} +#' \item{cnt.alt}{ a \code{integer} representing the alternative allele count} +#' \item{snp.pos}{ a \code{integer} representing the position on the chromosome} +#' \item{snp.chr}{ a \code{integer} representing the chromosome} +#' \item{normal.geno}{ a \code{integer} representing the genotype #' (0=wild-type reference; 1=heterozygote; 2=homozygote alternative; 3=unkown)} -#' \item{pruned} {a \code{logical} indicating if the SNV is retained after +#' \item{pruned}{ a \code{logical} indicating if the SNV is retained after #' pruning} -#' \item{snp.index} {a \code{integer} representing the index position of the -#' SNV in the 1KG GDS file that contains all SNVs} -#' \item{keep} {a \code{logical} indicating if the genotype exists for the SNV} -#' \item{hetero} {a \code{logical} indicating if the SNV is heterozygote} -#' \item{homo} {a \code{logical} indicating if the SNV is homozygote} -#' \item{lap} {a \code{numeric} indicating lower allelic fraction} -#' \item{LOH} {a \code{integer} indicating if the SNV is in an LOH region +#' \item{snp.index}{ a \code{integer} representing the index position of the +#' SNV in the Reference GDS file that contains all SNVs} +#' \item{keep}{ a \code{logical} indicating if the genotype exists for the SNV} +#' \item{hetero}{ a \code{logical} indicating if the SNV is heterozygote} +#' \item{homo}{ a \code{logical} indicating if the SNV is homozygote} +#' \item{lap}{ a \code{numeric} representing the lower allelic fraction} +#' \item{LOH}{ a \code{integer} indicating if the SNV is in an LOH region #' (0=not LOH, 1=in LOH)} -#' \item{imbAR} {a \code{integer} indicating if the SNV is in an imbalanced +#' \item{imbAR}{ a \code{integer} indicating if the SNV is in an imbalanced #' region (-1=not classified as imbalanced or LOH, 0=in LOH; 1=tested #' positive for imbalance in at least 1 window)} #' } #' #' @examples #' +#' ## Required library for GDS +#' library(SNPRelate) +#' #' ## Path to the demo 1KG GDS file is located in this package #' dataDir <- system.file("extdata/tests", package="RAIDS") -#' fileGDS <- file.path(dataDir, "ex1_good_small_1KG_GDS.gds") +#' fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") +#' +#' ## Temporary Profile GDS file for one profile in temporary directory +#' fileProfile <- file.path(tempdir(), "ex1.gds") #' #' ## Copy the Profile GDS file demo that has been pruned and annotated -#' ## into a test directory (deleted after the example has been run) -#' dataDirAllelicFraction <- file.path(system.file("extdata", package="RAIDS"), -#' "demoAllelicFraction") -#' dir.create(dataDirAllelicFraction, showWarnings=FALSE, -#' recursive=FALSE, mode="0777") #' file.copy(file.path(dataDir, "ex1_demo_with_pruning_and_1KG_annot.gds"), -#' file.path(dataDirAllelicFraction, "ex1.gds")) +#' fileProfile) #' #' ## Open the reference GDS file (demo version) #' gds1KG <- snpgdsOpen(fileGDS) #' -#' ## Profile GDS file for one profile -#' fileProfile <- file.path(dataDirAllelicFraction, "ex1.gds") +#' ## Open Profile GDS file for one profile #' profileGDS <- openfn.gds(fileProfile) #' -#' ## Chromosome length information -#' ## chr23 is chrX, chr24 is chrY and chrM is 25 -#' chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L, -#' 181538259L, 170805979L, 159345973L, 145138636L, 138394717L, 133797422L, -#' 135086622L, 133275309L, 114364328L, 107043718L, 101991189L, 90338345L, -#' 83257441L, 80373285L, 58617616L, 64444167L, 46709983L, 50818468L, -#' 156040895L, 57227415L, 16569L) -#' -#' ## The function returns a data frame containing the allelic fraction info -#' result <- RAIDS:::computeAllelicFractionDNA(gdsReference=gds1KG, -#' gdsSample=profileGDS, -#' currentProfile="ex1", studyID="MYDATA", chrInfo=chrInfo, minCov=10L, -#' minProb=0.999, eProb=0.001, cutOffLOH=-5, -#' cutOffHomoScore=-3, wAR=9L, verbose=FALSE) -#' head(result) +#' ## Required library for this example to run correctly +#' if (requireNamespace("GenomeInfoDb", quietly=TRUE) && +#' requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { #' -#' ## Close both GDS files (important) -#' closefn.gds(profileGDS) -#' closefn.gds(gds1KG) +#' ## Chromosome length information +#' ## chr23 is chrX, chr24 is chrY and chrM is 25 +#' chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] #' -#' ## Unlink Profile GDS file (created for demo purpose) -#' unlink(file.path(dataDirAllelicFraction, "ex1.gds")) -#' unlink(dataDirAllelicFraction) +#' ## The function returns a data frame containing the allelic fraction info +#' result <- RAIDS:::computeAllelicFractionDNA(gdsReference=gds1KG, +#' gdsSample=profileGDS, currentProfile="ex1", studyID="MYDATA", +#' chrInfo=chrInfo, minCov=10L, +#' minProb=0.999, eProb=0.001, cutOffLOH=-5, +#' cutOffHomoScore=-3, wAR=9L, verbose=FALSE) +#' head(result) +#' +#' ## Close both GDS files (important) +#' closefn.gds(profileGDS) +#' closefn.gds(gds1KG) +#' +#' ## Remove Profile GDS file (created for demo purpose) +#' unlink(fileProfile, force=TRUE) +#' +#' } #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt index.gdsn read.gdsn @@ -569,75 +760,74 @@ computeAllelicFractionDNA <- function(gdsReference, gdsSample, currentProfile, wAR=9L, verbose) { ## Extract the genotype information for a SNV dataset using - ## the Profile GDS file and the 1KG GDS file - snp.pos <- getTableSNV(gdsReference=gdsReference, gdsSample=gdsSample, + ## the Profile GDS file and the Reference GDS file + snpPos <- getTableSNV(gdsReference=gdsReference, gdsSample=gdsSample, currentProfile=currentProfile, studyID=studyID, minCov=minCov, minProb=minProb, eProb=eProb, verbose=verbose) - snp.pos$lap <- rep(-1, nrow(snp.pos)) - snp.pos$LOH <- rep(0, nrow(snp.pos)) - snp.pos$imbAR <- rep(-1, nrow(snp.pos)) - - homoBlock <- list() - - for(chr in unique(snp.pos$snp.chr)) { + snpPos$lap <- rep(-1, nrow(snpPos)) + snpPos$LOH <- rep(0, nrow(snpPos)) + snpPos$imbAR <- rep(-1, nrow(snpPos)) - if (verbose) { - message("chr ", chr) - message("Step 1 ", Sys.time()) - } - - listChr <- which(snp.pos$snp.chr == chr) - - ## Identify LOH regions - homoBlock[[chr]] <- computeLOHBlocksDNAChr(gdsReference=gdsReference, - chrInfo=chrInfo, snp.pos=snp.pos[listChr,], chr=chr) + snpPos <- lapply(unique(snpPos$snp.chr), + FUN=function(chr,snpPos){ + if (verbose) { + message("chr ", chr) + message("Step 1 ", Sys.time()) + } + homoBlock <- list() + listChr <- which(snpPos$snp.chr == chr) + ## Identify LOH regions + homoBlock[[chr]] <- computeLOHBlocksDNAChr( + gdsReference=gdsReference, chrInfo=chrInfo, + snpPos=snpPos[listChr,], chr=chr) - if (verbose) { message("Step 2 ", Sys.time()) } + if (verbose) { message("Step 2 ", Sys.time()) } - homoBlock[[chr]]$LOH <- as.integer(homoBlock[[chr]]$logLHR <= + homoBlock[[chr]]$LOH <- as.integer(homoBlock[[chr]]$logLHR <= cutOffLOH & homoBlock[[chr]]$homoScore <= cutOffHomoScore) - z <- cbind(c(homoBlock[[chr]]$start, homoBlock[[chr]]$end, - snp.pos[listChr, "snp.pos"]), - c(rep(0, 2* nrow(homoBlock[[chr]])), + z <- cbind(c(homoBlock[[chr]]$start, homoBlock[[chr]]$end, + snpPos[listChr, "snp.pos"]), + c(rep(0, 2* nrow(homoBlock[[chr]])), rep(1, length(listChr))), - c(homoBlock[[chr]]$LOH, -1 * homoBlock[[chr]]$LOH, - rep(0, length(listChr)) ), - c(rep(0, 2 * nrow(homoBlock[[chr]])), + c(homoBlock[[chr]]$LOH, -1 * homoBlock[[chr]]$LOH, + rep(0, length(listChr)) ), + c(rep(0, 2 * nrow(homoBlock[[chr]])), seq_len(length(listChr)))) - z <- z[order(z[,1], z[,2]), ] - pos <- z[cumsum(z[,3]) > 0 & z[,4] > 0, 4] - snp.pos[listChr[pos], "lap"] <- 0 - snp.pos[listChr[pos], "LOH"] <- 1 + z <- z[order(z[,1], z[,2]), ] + pos <- z[cumsum(z[,3]) > 0 & z[,4] > 0, 4] + snpPos[listChr[pos], "lap"] <- 0 + snpPos[listChr[pos], "LOH"] <- 1 - if (verbose) { message("Step 3 ", Sys.time()) } + if (verbose) { message("Step 3 ", Sys.time()) } - ## Identify imbalanced SNVs in specified chromosome - snp.pos[listChr, "imbAR"] <- - computeAllelicImbDNAChr(snp.pos=snp.pos[listChr, ], chr=chr, - wAR=10, cutOffEmptyBox=-3) + ## Identify imbalanced SNVs in specified chromosome + snpPos[listChr, "imbAR"] <- + computeAllelicImbDNAChr(snpPos=snpPos[listChr, ], chr=chr, + wAR=10, cutOffEmptyBox=-3) - if (verbose) { message("Step 4 ", Sys.time()) } + if (verbose) { message("Step 4 ", Sys.time()) } - ## Compute allelic fraction for SNVs in specified chromosome - blockAF <- computeAlleleFraction(snp.pos=snp.pos[listChr, ], - w=10, cutOff=-3) + ## Compute allelic fraction for SNVs in specified chromosome + blockAF <- computeAlleleFraction(snpPos=snpPos[listChr, ], + w=10, cutOff=-3) - if (verbose) { message("Step 5 ", Sys.time()) } - - if(! is.null(blockAF)) { - for(i in seq_len(nrow(blockAF))) { - snp.pos[listChr[blockAF[i, 1]:blockAF[i, 2]], "lap"] <- - blockAF[i, 3] - } - } - } + if (verbose) { message("Step 5 ", Sys.time()) } - snp.pos[which(snp.pos[, "lap"] == -1), "lap"] <- 0.5 + if(! is.null(blockAF)) { + for(i in seq_len(nrow(blockAF))) { + snpPos[listChr[blockAF[i, 1]:blockAF[i, 2]], "lap"] <- + blockAF[i, 3] + } + } + return(snpPos[listChr,]) + }, snpPos=snpPos) + snpPos <- do.call(rbind, snpPos) + snpPos[which(snpPos[, "lap"] == -1), "lap"] <- 0.5 - return(snp.pos) + return(snpPos) } @@ -646,16 +836,15 @@ computeAllelicFractionDNA <- function(gdsReference, gdsSample, currentProfile, #' #' @description The function creates a \code{data.frame} containing the #' allelic fraction for the pruned SNV dataset specific to a RNA-seq sample. -#' TODO #' #' @param gdsReference an object of class \code{\link[gdsfmt]{gds.class}} -#' (a GDS file), the opened 1KG GDS file. +#' (a GDS file), the opened Reference GDS file. #' #' @param gdsSample an object of class \code{\link[gdsfmt]{gds.class}} #' (a GDS file), the opened Profile GDS file. #' #' @param gdsRefAnnot an object of class \code{\link[gdsfmt]{gds.class}} -#' (a GDS file), the opeoned 1KG SNV Annotation GDS file. +#' (a GDS file), the opened Reference SNV Annotation GDS file. #' #' @param currentProfile a \code{character} string corresponding to #' the sample identifier as used in \code{\link{pruningSample}} function. @@ -680,7 +869,7 @@ computeAllelicFractionDNA <- function(gdsReference, gdsSample, currentProfile, #' @param eProb a single \code{numeric} between 0 and 1 representing the #' probability of sequencing error. Default: \code{0.001}. #' -#' @param cutOffLOH a single log of the score to be LOH TODO. +#' @param cutOffLOH a single \code{numeric} log of the score to be LOH. #' Default: \code{-5}. #' #' @param cutOffAR a single \code{numeric} representing the cutoff, in @@ -691,15 +880,89 @@ computeAllelicFractionDNA <- function(gdsReference, gdsSample, currentProfile, #' @param verbose a \code{logicial} indicating if the function should print #' message when running. #' -#' @return a \code{data.frame} with lap for the pruned SNV dataset with -#' coverage > \code{minCov}. TODO +#' @return a \code{data.frame} containing the allelic information for the +#' pruned SNV dataset with coverage > \code{minCov}. The \code{data.frame} +#' contains those columns: +#' \describe{ +#' \item{cnt.tot}{ a \code{integer} representing the total allele count} +#' \item{cnt.ref}{ a \code{integer} representing the reference allele count} +#' \item{cnt.alt}{ a \code{integer} representing the alternative allele count} +#' \item{snp.pos}{ a \code{integer} representing the position on the chromosome} +#' \item{snp.chr}{ a \code{integer} representing the chromosome} +#' \item{normal.geno}{ a \code{integer} representing the genotype +#' (0=wild-type reference; 1=heterozygote; 2=homozygote alternative; 3=unkown)} +#' \item{pruned}{ a \code{logical} indicating if the SNV is retained after +#' pruning} +#' \item{snp.index}{ a \code{integer} representing the index position of the +#' SNV in the Reference GDS file that contains all SNVs} +#' \item{keep}{ a \code{logical} indicating if the genotype exists for the SNV} +#' \item{hetero}{ a \code{logical} indicating if the SNV is heterozygote} +#' \item{homo}{ a \code{logical} indicating if the SNV is homozygote} +#' \item{block.id}{ a \code{integer} indicating the unique identifier of the +#' block in the Population Reference Annotation +#' GDS file that contains the current SNV} +#' \item{phase}{ a \code{integer} indicating the phase of the variant +#' if known, \code{3} if not known} +#' \item{lap}{ a \code{numeric} indicating lower allelic fraction} +#' \item{LOH}{ a \code{integer} indicating if the SNV is in an LOH region +#' (0=not LOH, 1=in LOH)} +#' \item{imbAR}{ a \code{integer} indicating if the SNV is in an imbalanced +#' region (-1=not classified as imbalanced or LOH, 0=in LOH; 1=tested +#' positive for imbalance in at least 1 window)} +#' \item{freq}{ a \code{numeric} indicating the frequency of the variant +#' in the the reference} +#' } #' #' @examples #' -#' ## Path to the demo pedigree file is located in this package -#' dataDir <- system.file("extdata", package="RAIDS") +#' ## Required library for GDS +#' library(SNPRelate) +#' +#' #' ## Path to the demo 1KG GDS file is located in this package +#' dataDir <- system.file("extdata/tests", package="RAIDS") +#' fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") +#' fileAnnotGDS <- file.path(dataDir, "ex1_good_small_1KG_Annot.gds") +#' +#' ## Temporary Profile GDS file for one profile in temporary directory +#' fileProfile <- file.path(tempdir(), "ex1.gds") +#' +#' ## Copy the Profile GDS file demo that has been pruned and annotated +#' file.copy(file.path(dataDir, "ex1_demo_with_pruning_and_1KG_annot.gds"), +#' fileProfile) +#' +#' ## Open the reference GDS file (demo version) +#' gds1KG <- snpgdsOpen(fileGDS) +#' gdsRefAnnot <- openfn.gds(fileAnnotGDS) +#' +#' ## Open Profile GDS file for one profile +#' profileGDS <- openfn.gds(fileProfile) +#' +#' ## Required library for this example to run correctly +#' if (requireNamespace("GenomeInfoDb", quietly=TRUE) && +#' requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { +#' +#' ## Chromosome length information +#' ## chr23 is chrX, chr24 is chrY and chrM is 25 +#' chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] #' -#' ## TODO +#' ## The function returns a data frame containing the allelic fraction info +#' result <- RAIDS:::computeAllelicFractionRNA(gdsReference=gds1KG, +#' gdsSample=profileGDS, gdsRefAnnot=gdsRefAnnot, +#' currentProfile="ex1", studyID="MYDATA", +#' blockID="GeneS.Ensembl.Hsapiens.v86", +#' chrInfo=chrInfo, minCov=10L, minProb=0.999, eProb=0.001, +#' cutOffLOH=-5, cutOffAR=3, verbose=FALSE) +#' head(result) +#' +#' ## Close both GDS files (important) +#' closefn.gds(profileGDS) +#' closefn.gds(gds1KG) +#' closefn.gds(gdsRefAnnot) +#' +#' ## Remove Profile GDS file (created for demo purpose) +#' unlink(fileProfile, force=TRUE) +#' +#' } #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt index.gdsn read.gdsn ls.gdsn @@ -712,63 +975,68 @@ computeAllelicFractionRNA <- function(gdsReference, gdsSample, gdsRefAnnot, cutOffAR=3, verbose) { ## Extract the genotype information for a SNV dataset using - ## the GDS Sample file and the 1KG GDS file - snp.pos <- getTableSNV(gdsReference, gdsSample, currentProfile, studyID, + ## the GDS Sample file and the Reference GDS file + snpPos <- getTableSNV(gdsReference, gdsSample, currentProfile, studyID, minCov, minProb, eProb) # Keep only SNV in GDS ref because to reduce SNV artefact from RNA - snp.pos <- snp.pos[which(snp.pos$snp.index > 0),] + snpPos <- snpPos[which(snpPos$snp.index > 0),] # Get the block structure base on genes from gdsRefAnnot - snp.pos$block.id <- getGeneBlock(gdsRefAnnot=gdsRefAnnot, - snp.index=snp.pos$snp.index, blockID=blockID) + snpPos$block.id <- getBlockIDs(gdsRefAnnot=gdsRefAnnot, + snpIndex=snpPos$snp.index, blockTypeID=blockID) - snp.pos$phase <- rep(3, nrow(snp.pos)) + snpPos$phase <- rep(3, nrow(snpPos)) if ("phase" %in% ls.gdsn(node=gdsSample)) { - snp.pos$phase <- read.gdsn(index.gdsn(gdsSample, - "phase"))[snp.pos$snp.index] + snpPos$phase <- read.gdsn(index.gdsn(gdsSample, + "phase"))[snpPos$snp.index] } - snp.pos$lap <- rep(-1, nrow(snp.pos)) - snp.pos$LOH <- rep(0, nrow(snp.pos)) - snp.pos$imbAR <- rep(-1, nrow(snp.pos)) - snp.pos$freq <- read.gdsn(index.gdsn(gdsReference, - "snp.AF"))[snp.pos$snp.index] + snpPos$lap <- rep(-1, nrow(snpPos)) + snpPos$LOH <- rep(0, nrow(snpPos)) + snpPos$imbAR <- rep(-1, nrow(snpPos)) + snpPos$freq <- read.gdsn(index.gdsn(gdsReference, + "snp.AF"))[snpPos$snp.index] # for each chromosome - listBlock <- list() - for(chr in unique(snp.pos$snp.chr)) { + # listBlock <- list() - if (verbose) { - message("chr ", chr) - message("Step 1 ", Sys.time()) - } - - #listHetero <- dfHetero[dfHetero$snp.chr == chr, "snp.pos"] - listChr <- which(snp.pos$snp.chr == chr) - # snp.pos.chr <- snp.pos[listChr,] + listBlock <- lapply(unique(snpPos$snp.chr), + FUN=function(x, snpPos, verbose){ + if (verbose) { + message("chr ", x) + message("Step 1 ", Sys.time()) + } + listChr <- which(snpPos$snp.chr == x) + blockAF <- tableBlockAF(snpPos=snpPos[listChr,]) + blockAF$aRF[blockAF$lRhomo <= cutOffLOH] <- 0 + blockAF$aRF[blockAF$lR >= cutOffAR] <- blockAF$aFraction[blockAF$lR + >= cutOffAR] + blockAF$aRF[blockAF$lR < cutOffAR & blockAF$nbHetero > 1] <- 0.5 - blockAF <- tableBlockAF(snp.pos=snp.pos[listChr,]) - # LOH - blockAF$aRF[blockAF$lRhomo <= cutOffLOH] <- 0 - blockAF$aRF[blockAF$lR >= cutOffAR] <- blockAF$aFraction[blockAF$lR - >= cutOffAR] - blockAF$aRF[blockAF$lR < cutOffAR & blockAF$nbHetero > 1] <- 0.5 + #listBlock[[x]] <- blockAF - listBlock[[chr]] <- blockAF + if (verbose) { + message("Step 1 done ", Sys.time()) + } + return(blockAF) + }, snpPos=snpPos, + verbose=verbose) + blockAF <- do.call(rbind, listBlock) + listMissing <- which(abs(blockAF$aRF + 1) < 1e-6) - if (verbose) { - message("Step 1 done ", Sys.time()) + if(length(listMissing) > 0){ + if(length(blockAF$aRF[-1*listMissing]) == 0){ + blockAF[listMissing, "aRF"] <- 0.5 + }else{ + blockAF[listMissing, "aRF"] <- sample(blockAF$aRF[-1*listMissing], + length(listMissing), replace=TRUE) } - } - blockAF <- do.call(rbind, listBlock) - listMissing <- which(abs(blockAF$aRF + 1) < 1e-6) - blockAF[listMissing, "aRF"] <- sample(blockAF$aRF[-1*listMissing], - length(listMissing), replace=TRUE) + } for(b in seq_len(nrow(blockAF))) { - snp.pos$lap[snp.pos$block.id == blockAF$block[b]] <- blockAF$aRF[b] + snpPos$lap[snpPos$block.id == blockAF$block[b]] <- blockAF$aRF[b] } - return(snp.pos) + return(snpPos) } @@ -777,31 +1045,31 @@ computeAllelicFractionRNA <- function(gdsReference, gdsSample, gdsRefAnnot, #' @description The function verifies, for each SNV present in the data frame, #' if the SNV is in an imbalance region. #' -#' @param snp.pos a \code{data.frame} containing the SNV information for the +#' @param snpPos a \code{data.frame} containing the SNV information for the #' chromosome specified by the \code{chr} argument. The \code{data.frame} must #' contain: -#' \itemize{ -#' \item{cnt.tot} {a single \code{integer} representing the total coverage for +#' \describe{ +#' \item{cnt.tot}{ a single \code{integer} representing the total coverage for #' the SNV.} -#' \item{cnt.ref} {a single \code{integer} representing the coverage for +#' \item{cnt.ref}{ a single \code{integer} representing the coverage for #' the reference allele.} -#' \item{cnt.alt} {a single \code{integer} representing the coverage for +#' \item{cnt.alt}{ a single \code{integer} representing the coverage for #' the alternative allele.} -#' \item{snp.pos} {a single \code{integer} representing the SNV position.} -#' \item{snp.chr} {a single \code{integer} representing the SNV chromosome.} -#' \item{normal.geno} {a single \code{numeric} indicating the genotype of the +#' \item{snp.pos}{ a single \code{integer} representing the SNV position.} +#' \item{snp.chr}{ a single \code{integer} representing the SNV chromosome.} +#' \item{normal.geno}{ a single \code{numeric} indicating the genotype of the #' SNV. The possibles are: \code{0} (wild-type homozygote), \code{1} #' (heterozygote), \code{2} (altenative homozygote), \code{3} indicating that #' the normal genotype is unknown.} -#' \item{pruned} {a \code{logical} indicating if the SNV is retained after +#' \item{pruned}{ a \code{logical} indicating if the SNV is retained after #' pruning} -#' \item{snp.index} {a \code{integer} representing the index position of the -#' SNV in the 1KG GDS file that contains all SNVs} -#' \item{keep} {a \code{logical} indicating if the genotype exists for the SNV} -#' \item{hetero} {a \code{logical} indicating if the SNV is heterozygote} -#' \item{homo} {a \code{logical} indicating if the SNV is homozygote} -#' \item{lap} {a \code{numeric} indicating lower allelic fraction} -#' \item{LOH} {a \code{integer} indicating if the SNV is in an LOH region +#' \item{snp.index}{ a \code{integer} representing the index position of the +#' SNV in the Reference GDS file that contains all SNVs} +#' \item{keep}{ a \code{logical} indicating if the genotype exists for the SNV} +#' \item{hetero}{ a \code{logical} indicating if the SNV is heterozygote} +#' \item{homo}{ a \code{logical} indicating if the SNV is homozygote} +#' \item{lap}{ a \code{numeric} indicating lower allelic fraction} +#' \item{LOH}{ a \code{integer} indicating if the SNV is in an LOH region #' (0=not LOH, 1=in LOH)} #' } #' @@ -818,7 +1086,7 @@ computeAllelicFractionRNA <- function(gdsReference, gdsSample, gdsRefAnnot, #' imbalanced region (-1=not classified as imbalanced or LOH, 0=in LOH; 1=tested #' positive for imbalance in at least 1 window). The vector as an entry for #' each SNV present in the -#' input \code{snp.pos}. +#' input \code{snpPos}. #' #' @examples #' @@ -827,80 +1095,83 @@ computeAllelicFractionRNA <- function(gdsReference, gdsSample, gdsRefAnnot, #' #' ## Path to the demo 1KG GDS file is located in this package #' dataDir <- system.file("extdata/tests", package="RAIDS") -#' fileGDS <- file.path(dataDir, "ex1_good_small_1KG_GDS.gds") +#' fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") #' #' ## Open the reference GDS file (demo version) #' gds1KG <- snpgdsOpen(fileGDS) #' -#' ## Chromosome length information -#' ## chr23 is chrX, chr24 is chrY and chrM is 25 -#' chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L, -#' 181538259L, 170805979L, 159345973L, 145138636L, 138394717L, 133797422L, -#' 135086622L, 133275309L, 114364328L, 107043718L, 101991189L, 90338345L, -#' 83257441L, 80373285L, 58617616L, 64444167L, 46709983L, 50818468L, -#' 156040895L, 57227415L, 16569L) -#' -#' ## Data frame with SNV information for the specified chromosome (chr 1) -#' snpInfo <- data.frame(cnt.tot=c(41, 17, 27, 15, 11, 37, 16, 32), -#' cnt.ref=c(40, 17, 27, 15, 4, 14, 16, 32), -#' cnt.alt=c(0, 0, 0, 0, 7, 23, 0, 0), -#' snp.pos=c(3722256, 3722328, 3767522, 3868160, 3869467, 4712655, -#' 6085318, 6213145), -#' snp.chr=c(rep(1, 8)), -#' normal.geno=c(rep(1, 8)), pruned=c(TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, -#' TRUE, TRUE), -#' pruned=c(TRUE, TRUE, FALSE, TRUE, FALSE, rep(TRUE, 3)), -#' snp.index=c(160, 162, 204, 256, 259, 288, 366, 465), -#' keep=rep(TRUE, 8), hetero=c(rep(FALSE, 4), TRUE, TRUE, rep(FALSE, 2)), -#' homo=c(rep(TRUE, 4), FALSE, FALSE, TRUE, TRUE), -#' lap=rep(-1, 8), LOH=rep(0, 8), imbAR=rep(-1, 8), -#' stringAsFactor=FALSE) -#' -#' ## The function returns a data frame containing the information about the -#' ## LOH regions in the specified chromosome -#' result <- RAIDS:::computeAllelicImbDNAChr(snp.pos=snpInfo, chr=1, wAR=10, -#' cutOffEmptyBox=-3) -#' head(result) +#' ## Required library for this example to run correctly +#' if (requireNamespace("GenomeInfoDb", quietly=TRUE) && +#' requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { +#' +#' ## Chromosome length information +#' ## chr23 is chrX, chr24 is chrY and chrM is 25 +#' chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] +#' +#' ## Data frame with SNV information for the specified chromosome (chr 1) +#' snpInfo <- data.frame(cnt.tot=c(41, 17, 27, 15, 11, 37, 16, 32), +#' cnt.ref=c(40, 17, 27, 15, 4, 14, 16, 32), +#' cnt.alt=c(0, 0, 0, 0, 7, 23, 0, 0), +#' snp.pos=c(3722256, 3722328, 3767522, 3868160, 3869467, 4712655, +#' 6085318, 6213145), +#' snp.chr=c(rep(1, 8)), +#' normal.geno=c(rep(1, 8)), pruned=c(TRUE, TRUE, FALSE, TRUE, +#' FALSE, TRUE, TRUE, TRUE), +#' pruned=c(TRUE, TRUE, FALSE, TRUE, FALSE, rep(TRUE, 3)), +#' snp.index=c(160, 162, 204, 256, 259, 288, 366, 465), +#' keep=rep(TRUE, 8), +#' hetero=c(rep(FALSE, 4), TRUE, TRUE, rep(FALSE, 2)), +#' homo=c(rep(TRUE, 4), FALSE, FALSE, TRUE, TRUE), +#' lap=rep(-1, 8), LOH=rep(0, 8), imbAR=rep(-1, 8), +#' stringAsFactor=FALSE) +#' +#' ## The function returns a data frame containing the information about +#' ## the LOH regions in the specified chromosome +#' result <- RAIDS:::computeAllelicImbDNAChr(snpPos=snpInfo, chr=1, wAR=10, +#' cutOffEmptyBox=-3) +#' head(result) +#' +#' ## Close GDS file (important) +#' closefn.gds(gds1KG) #' -#' ## Close GDS file (important) -#' closefn.gds(gds1KG) +#' } #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt index.gdsn read.gdsn #' @importFrom S4Vectors isSingleNumber #' @encoding UTF-8 #' @keywords internal -computeAllelicImbDNAChr <- function(snp.pos, chr, wAR=10, cutOffEmptyBox=-3) { +computeAllelicImbDNAChr <- function(snpPos, chr, wAR=10, cutOffEmptyBox=-3) { # We use wAR - 1 because # process the window ex: 1 to 1+wAR wAR <- wAR - 1 listHetero <- NULL - if(length(which(snp.pos$normal.geno != 3) > 0)) { - listHetero <- which(snp.pos$keep == TRUE & snp.pos$normal.geno == 1) + if(length(which(snpPos$normal.geno != 3) > 0)) { + listHetero <- which(snpPos$keep == TRUE & snpPos$normal.geno == 1) } else{ - listHetero <- which(snp.pos$hetero == TRUE) + listHetero <- which(snpPos$hetero == TRUE) } - heteroSNV <- snp.pos[listHetero, ] + heteroSNV <- snpPos[listHetero, ] if(nrow(heteroSNV) > wAR) { for(i in seq_len(nrow(heteroSNV)-wAR)) { - if(sum(snp.pos[listHetero[i]:listHetero[(i+wAR-1)], "LOH"]) == 0 ) { + if(sum(snpPos[listHetero[i]:listHetero[(i+wAR-1)], "LOH"]) == 0 ) { ## Check for imbalance regions for heterozygote SNVs cur <- testEmptyBox(heteroSNV[i:(i+wAR), c ("cnt.alt", "cnt.ref")], cutOffEmptyBox) if(cur$pCut == 1) { # Set all snv from tmpA (include homozygotes) # in the window to 1 - snp.pos[listHetero[i]:listHetero[(i+wAR)], "imbAR"] <- 1 + snpPos[listHetero[i]:listHetero[(i+wAR)], "imbAR"] <- 1 } } } } - snp.pos$imbAR[which(snp.pos$LOH == 1)] <- 0 + snpPos$imbAR[which(snpPos$LOH == 1)] <- 0 - return(snp.pos$imbAR) + return(snpPos$imbAR) } @@ -914,10 +1185,10 @@ computeAllelicImbDNAChr <- function(snp.pos, chr, wAR=10, cutOffEmptyBox=-3) { #' #' @param matCov a \code{data.frame} containing only heterozygote SNVs. The #' \code{data.frame} must contain those columns: -#' \itemize{ -#' \item{cnt.ref} {a single \code{integer} representing the coverage for +#' \describe{ +#' \item{cnt.ref}{ a single \code{integer} representing the coverage for #' the reference allele.} -#' \item{cnt.alt} {a single \code{integer} representing the coverage for +#' \item{cnt.alt}{ a single \code{integer} representing the coverage for #' the alternative allele.} #' } #' @@ -931,16 +1202,16 @@ computeAllelicImbDNAChr <- function(snp.pos, chr, wAR=10, cutOffEmptyBox=-3) { #' reference to see if there is a allelic fraction change. #' #' @return a \code{list} containing 4 entries: -#' \itemize{ +#' \describe{ #' \item{pWin}{ a \code{vector} of \code{numeric} representing the #' probability (x2) of obtaining the current #' alternative/(alternative+reference) ratio from a reference distribution #' specified by user.} -#' \item{p}{a \code{integer} indicating if all SNVs tested +#' \item{p}{ a \code{integer} indicating if all SNVs tested #' positive (1=TRUE, 0=FALSE). The cut-off is 0.5. } -#' \item{pCut}{a \code{integer} indicating if all SNVs tested +#' \item{pCut}{ a \code{integer} indicating if all SNVs tested #' positive (1=TRUE, 0-FALSE). } -#' \item{pCut1}{a \code{integer} indicating if the region tested +#' \item{pCut1}{ a \code{integer} indicating if the region tested #' positive (1=TRUE, 0=FALSE) for allelic ratio change.} #' } #' @@ -964,36 +1235,34 @@ testAlleleFractionChange <- function(matCov, pCutOff=-3, vMean) { matCov$pWin <- rep(1, nrow(matCov)) - for(i in seq_len(nrow(matCov))) { + matTmp <- apply(matCov[, c("cnt.alt", "cnt.ref")], 1, + FUN=function(x, vMean){ + vCur <- ifelse(x[1] <= x[2], x[1], x[2]) - vCur <- ifelse(matCov$cnt.alt[i] <= matCov$cnt.ref[i], - matCov$cnt.alt[i], matCov$cnt.ref[i]) + diff2Mean <- abs(vMean * (x[1] + x[2]) - vCur) + pCur1 <- pbinom(round(vMean * (x[1] + x[2]) - + diff2Mean), size=x[2] + x[1], vMean) + pCur2 <- 1 - pbinom(round(vMean * (x[1] + + x[2]) + diff2Mean), size=x[2] + x[1], vMean) - diff2Mean <- abs(vMean * (matCov$cnt.alt[i] + - matCov$cnt.ref[i]) - vCur) - pCur1 <- pbinom(round(vMean * (matCov$cnt.alt[i] + - matCov$cnt.ref[i]) - diff2Mean), - size=matCov$cnt.ref[i] + matCov$cnt.alt[i], vMean) - pCur2 <- 1 - pbinom(round(vMean * (matCov$cnt.alt[i] + - matCov$cnt.ref[i]) + diff2Mean), - size=matCov$cnt.ref[i] + matCov$cnt.alt[i], vMean) + pCur <- pCur1 + pCur2 - pCur <- pCur1 + pCur2 + pCurO <- max(1 - max(pCur, 0.01), 0.01) + pCurMax <- max(pCur, 0.01) + return(c(pCur, pCurMax, pCurO)) + }, vMean=vMean) + matCov$pWin <- matTmp[1, ] - matCov$pWin[i] <- pCur + p <- sum(log10(matTmp[2,])) + p0 <- sum(log10(matTmp[3,])) - pCurO <- max(1 - max(pCur, 0.01), 0.01) - p <- p + log10(max(pCur, 0.01)) - pO <- pO + log10(pCurO) - } pCut1 <- as.integer((sum(matCov$pWin < 0.5) >= nrow(matCov)-1) & matCov$pWin[1] < 0.5 & (matCov$pWin[nrow(matCov)] < 0.5) & ((p-pO) <= pCutOff)) res <- list(pWin=matCov$pWin, p=p, - pCut=as.integer(sum(matCov$pWin < 0.5) == nrow(matCov)), - pCut1=pCut1) + pCut=as.integer(sum(matCov$pWin < 0.5) == nrow(matCov)), pCut1=pCut1) return(res) } @@ -1007,10 +1276,10 @@ testAlleleFractionChange <- function(matCov, pCutOff=-3, vMean) { #' #' @param matCov a \code{data.frame} containing only heterozygote SNVs. The #' \code{data.frame} must contain those columns: -#' \itemize{ -#' \item{cnt.ref} {a single \code{integer} representing the coverage for +#' \describe{ +#' \item{cnt.ref}{ a single \code{integer} representing the coverage for #' the reference allele.} -#' \item{cnt.alt} {a single \code{integer} representing the coverage for +#' \item{cnt.alt}{ a single \code{integer} representing the coverage for #' the alternative allele.} #' } #' @@ -1019,7 +1288,7 @@ testAlleleFractionChange <- function(matCov, pCutOff=-3, vMean) { #' likelihood not to be imbalanced. Default: \code{-3}. #' #' @return a \code{list} containing 4 entries: -#' \itemize{ +#' \describe{ #' \item{pWin}{ a \code{vector} of \code{numeric} representing the #' probability (x2) of obtaining the current #' alternative/(alternative+reference) ratio from a 0.5 distribution.} @@ -1052,27 +1321,20 @@ testEmptyBox <- function(matCov, pCutOff=-3) { vMean <- 0.5 matCov$pWin <- rep(1, nrow(matCov)) - for (i in seq_len(nrow(matCov))) { - - ## Always use the small count as input - vCur1 <- ifelse(matCov$cnt.alt[i] <= matCov$cnt.ref[i], - matCov$cnt.alt[i], matCov$cnt.ref[i]) + matTmp <- apply(matCov[, c("cnt.alt", "cnt.ref")], 1, + FUN=function(x){ + vCur1 <- ifelse(x[1] <= x[2], x[1], x[2]) - ## Calculate the probability with assumption 0f 0.5 ratio - pCur <- pbinom(q=vCur1, size=matCov$cnt.ref[i] + matCov$cnt.alt[i], - prob=vMean) + pCur <- pbinom(q=vCur1, size=x[2] + x[1], prob=vMean) - ## Ensure value is not below 0.01 - pCurO <- max(1 - max(2 * pCur, 0.01), 0.01) + pCurO <- max(1 - max(2 * pCur, 0.01), 0.01) - ## Twice the probability (2 tails) - matCov$pWin[i] <- pCur * 2 - - ## Similar to likelihood for imbalance - p <- p + log10(max(pCur, 0.01)) - ## Similar to likelihood for not imbalance ( 1- probability) - pO <- pO + log10(pCurO) - } + return(c(vCur1, pCur, pCurO)) + }) + matCov$pWin <- matTmp[2, ] * 2 + tmp <- which(matTmp[2,] > 0.01) + p <- sum(log10(matTmp[2,tmp])) + (ncol(matTmp) - length(tmp)) * log10(0.01) + p0 <- sum(log10(matTmp[3,])) ## Calculate a global statistic using all SNVs ## The region is imbalance or not @@ -1082,8 +1344,7 @@ testEmptyBox <- function(matCov, pCutOff=-3) { ## Return a list res <- list(pWin=matCov$pWin, p=p, - pCut=as.integer(sum(matCov$pWin < 0.5) == nrow(matCov)), - pCut1=pCut1) + pCut=as.integer(sum(matCov$pWin < 0.5) == nrow(matCov)), pCut1=pCut1) return(res) } @@ -1093,48 +1354,80 @@ testEmptyBox <- function(matCov, pCutOff=-3) { ############################################### -#' @title TODO +#' @title Compute the log likelihood ratio based on the coverage of +#' each allele in a specific block (gene in the case of RNA-seq) #' -#' @description TODO +#' @description This function sums the log of read depth of the lowest depth +#' divide by the total depth of the position minus of likelihood of the allelic +#' fraction of 0.5 for a block. If the phase is known, the SNVs in the same +#' haplotype are grouped together. #' -#' @param snp.pos.Hetero For a specific gene (block) a \code{data.frame} with -#' lap for the SNV heterozygote dataset with -#' coverage > \code{minCov}. The \code{data.frame} must contain those columns: -#' 'phase', 'cnt.ref', 'cnt.alt'. TODO +#' @param snpPosHetero a \code{data.frame} +#' containing the SNV information for a specific block (gene if RNA-seq). +#' The \code{data.frame} must contain those columns: +#' \describe{ +#' \item{cnt.ref}{ a single \code{integer} representing the coverage for +#' the reference allele.} +#' \item{cnt.alt}{ a single \code{integer} representing the coverage for +#' the alternative allele.} +#' \item{phase}{ a single \code{integer} indicating the phase of the variant +#' if known, \code{3} if not known} +#' } #' -#' @return TODO a \code{list} of \code{numeric} for the gene lR the score -#' for aFraction different than 0.5 -#' aFraction allele estimation, nPhase number of SNV phase, -#' sumAlleleLow number of read overlapping the allele low -#' sumAlleleHigh number of read overlapping the allele high TODO +#' @return a \code{list} for the block with the information +#' relative to the heterozygotes. +#' The \code{list} contains: +#' \describe{ +#' \item{lR}{ a single \code{numeric} representing the sum of the log of +#' read depth of the lowest depth divide by the total depth of the position +#' minus of likelihood of the allelic fraction of 0.5.} +#' \item{aFraction}{ a single \code{numeric} representing the allele +#' fraction estimation.} +#' \item{sumAlleleLow}{ a \code{integer} representing the +#' sum of the allele read depth of the lowest read allele depth} +#' \item{sumAlleleHigh}{ a \code{integer} representing the +#' sum of the allele read depth +#' of the highest read allele depth} +#' } #' #' @examples #' -#' # TODO -#' gds <- "Demo GDS TODO" +#' ## Loading demo dataset containing SNV information +#' data(snpPositionDemo) +#' +#' ## Only use a subset of heterozygote SNVs related to one block +#' subset <- snpPositionDemo[which(snpPositionDemo$block.id == 2750 & +#' snpPositionDemo$hetero), c("cnt.ref", "cnt.alt", "phase")] +#' +#' result <- RAIDS:::calcAFMLRNA(subset) +#' +#' head(result) #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @encoding UTF-8 #' @keywords internal -calcAFMLRNA <- function(snp.pos.Hetero) { - - listPhase <- which(snp.pos.Hetero$phase < 2) - m <- data.frame(aL=rep(0, nrow(snp.pos.Hetero)), - aH=rep(0, nrow(snp.pos.Hetero))) +calcAFMLRNA <- function(snpPosHetero) { + + listPhase <- which(snpPosHetero$phase < 2) + m <- data.frame(aL=rep(0, nrow(snpPosHetero)), + aH=rep(0, nrow(snpPosHetero))) + # For the vairants phase + # sum the coverage for each haplotype + # if (length(listPhase) > 0) { mPhase <- data.frame(a1=rep(0, length(listPhase)), a2=rep(0, length(listPhase))) - if(length(which(snp.pos.Hetero$phase == 0)) > 0){ - mPhase[which(snp.pos.Hetero$phase == 0), "a1"] <- - snp.pos.Hetero[which(snp.pos.Hetero$phase == 0),"cnt.ref"] - mPhase[which(snp.pos.Hetero$phase == 0), "a2"] <- - snp.pos.Hetero[which(snp.pos.Hetero$phase == 0),"cnt.alt"] + if(length(which(snpPosHetero$phase == 0)) > 0){ + mPhase[which(snpPosHetero$phase == 0), "a1"] <- + snpPosHetero[which(snpPosHetero$phase == 0),"cnt.ref"] + mPhase[which(snpPosHetero$phase == 0), "a2"] <- + snpPosHetero[which(snpPosHetero$phase == 0),"cnt.alt"] } - if(length(which(snp.pos.Hetero$phase == 1)) > 0){ - mPhase[which(snp.pos.Hetero$phase == 1), "a2"] <- - snp.pos.Hetero[which(snp.pos.Hetero$phase == 1),"cnt.ref"] - mPhase[which(snp.pos.Hetero$phase == 1), "a1"] <- - snp.pos.Hetero[which(snp.pos.Hetero$phase == 1),"cnt.alt"] + if(length(which(snpPosHetero$phase == 1)) > 0){ + mPhase[which(snpPosHetero$phase == 1), "a2"] <- + snpPosHetero[which(snpPosHetero$phase == 1),"cnt.ref"] + mPhase[which(snpPosHetero$phase == 1), "a1"] <- + snpPosHetero[which(snpPosHetero$phase == 1),"cnt.alt"] } m1 <- sum(mPhase[,"a1"]) @@ -1144,138 +1437,186 @@ calcAFMLRNA <- function(snp.pos.Hetero) { m[listPhase, "aH"] <- mPhase[, (minPhase+1)%%2] } - listUnphase <- which(snp.pos.Hetero$phase > 1) + listUnphase <- which(snpPosHetero$phase > 1) if(length(listUnphase) > 0){ - minUnphase <- apply(snp.pos.Hetero[,c("cnt.ref", "cnt.alt")], 1, - FUN=min) - maxUnphase <- apply(snp.pos.Hetero[,c("cnt.ref", "cnt.alt")], 1, - FUN=max) + minUnphase <- apply(snpPosHetero[,c("cnt.ref", "cnt.alt")], 1, FUN=min) + maxUnphase <- apply(snpPosHetero[,c("cnt.ref", "cnt.alt")], 1, FUN=max) m[listUnphase, "aL"] <- minUnphase m[listUnphase, "aH"] <- maxUnphase } - d <- sum(rowSums(snp.pos.Hetero[,c("cnt.ref", "cnt.alt")])) + d <- sum(rowSums(snpPosHetero[,c("cnt.ref", "cnt.alt")])) aF <- sum(m[,"aL"]) / d lM <- log10(aF) * sum(m[,"aL"]) + log10(1- aF) * sum(m[,"aH"]) lR <- lM - log10(0.5) * d res <- list(lR = lR, aFraction=aF, nPhase = length(listPhase), - sumAlleleLow = sum(m[,"aL"]), - sumAlleleHigh = sum(m[,"aH"])) + sumAlleleLow = sum(m[,"aL"]), sumAlleleHigh = sum(m[,"aH"])) return(res) } -#' @title TODO -#' -#' @description TODO +#' @title Compile the information about the SNVs +#' for each block +#' +#' @description The function evaluates a score +#' about loss of heterozygosity and allelic fraction for each block. It +#' generates specific information about the variants in the block, like the +#' number of homozygotes or heterozygotes. +#' In the case of RNA-seq, the blocks are genes. +#' +#' @param snpPos a \code{data.frame} with lower allelic fraction (lap) for +#' the SNVs with coverage > \code{minCov}, for a specific chromosome. +#' +#' @return a \code{data.frame} containing only heterozygote +#' SNV information. The +#' \code{data.frame} contain those columns: +#' \describe{ +#' \item{block}{ a single \code{integer} representing the unique identifier +#' of the block.} +#' \item{aRF}{ a single \code{numeric} representing the final allelic +#' fraction; not computed yet, \code{-1} value assigned to all entries.} +#' \item{aFraction}{a single \code{integer} representing the possible allelic +#' fraction in absence of loss of heterozygosity (LOH).} +#' \item{lR}{ a single \code{integer} representing the coverage for +#' the alternative allele.} +#' \item{nPhase}{ a single \code{integer} representing the number of SNV +#' phases.} +#' \item{sumAlleleLow}{ a single \code{integer} representing the sum of the +#' alleles with the less coverage.} +#' \item{sumAlleleHigh}{ a single \code{integer} representing the sum of +#' the alleles with more coverage.} +#' \item{lH}{ a single \code{numeric} for the homozygotes log10 of the product +#' frequencies of the allele not found in the profile (not a probability).} +#' \item{lM}{ a single \code{numeric} log10 product frequency allele +#' in population.} +#' \item{lRhomo}{a single \code{numeric} representing the score +#' \code{lH} - \code{lM}.} +#' \item{nbHomo}{ a single \code{integer} representing the number of +#' homozygote SNVs per block.} +#' \item{nbKeep}{ a single \code{integer} representing the number of +#' SNVs retained per block.} +#' \item{nbHetero}{ a single \code{integer} representing the number of +#' heterozygote SNVs per block.} +#' } #' -#' @param snp.pos For a specific chromosome a \code{data.frame} with lap for -#' the SNV dataset with -#' coverage > \code{minCov}. +#' @examples #' -#' @return TODO a \code{data.frame} with the information related to allelic -#' fraction for each block gene +#' ## Loading demo dataset containing SNV information +#' data(snpPositionDemo) #' -#' @examples +#' ## Retain SNVs on chromosome 1 +#' subset <- snpPositionDemo[which(snpPositionDemo$snp.chr == 1),] #' -#' # TODO -#' gds <- "Demo GDS TODO" +#' ##Compile the information about the SNVs for each block +#' result <- RAIDS:::tableBlockAF(subset) +#' head(result) #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom S4Vectors aggregate #' @encoding UTF-8 #' @keywords internal -tableBlockAF <- function(snp.pos) { - - listBlocks <- unique(snp.pos$block.id) - - resBlock <- data.frame(block = listBlocks, - aRF = rep(-1, length(listBlocks)), - aFraction = rep(-1, length(listBlocks)), - lR = rep(-1, length(listBlocks)), - nPhase = rep(-1, length(listBlocks)), - sumAlleleLow = rep(-1, length(listBlocks)), - sumAlleleHigh = rep(-1, length(listBlocks)), - lH = rep(-1, length(listBlocks)), - lM = rep(-1, length(listBlocks)), - lRhomo = rep(1, length(listBlocks))) - - tmp <- aggregate(snp.pos[, c( "homo"), drop = FALSE], - by = list(block=snp.pos$block.id) ,sum) - row.names(tmp) <- as.character(tmp[,1]) - resBlock$nbHomo <- tmp[as.character(listBlocks),2] - tmp <- aggregate(snp.pos[, c( "keep"), drop = FALSE], - by = list(block=snp.pos$block.id) ,sum) - row.names(tmp) <- as.character(tmp[,1]) - resBlock$nbKeep <- tmp[as.character(listBlocks),2] - - tmp <- aggregate(snp.pos[, c( "hetero"), drop = FALSE], - by = list(block=snp.pos$block.id) ,sum) - row.names(tmp) <- as.character(tmp[,1]) - resBlock$nbHetero <- tmp[as.character(listBlocks),2] - - for (i in seq_len(length(listBlocks))) { - # start with LOH - +tableBlockAF <- function(snpPos) { + + listBlocks <- unique(snpPos$block.id) + + resBlock <- data.frame(block=listBlocks) + + # Number of homozygotes per block + tmp <- aggregate(snpPos[, c( "homo"), drop=FALSE], + by = list(block=snpPos$block.id), sum) + row.names(tmp) <- as.character(tmp[, 1]) + resBlock$nbHomo <- tmp[as.character(listBlocks), 2] + + # Number of SNVs keep per block + tmp <- aggregate(snpPos[, c( "keep"), drop=FALSE], + by = list(block=snpPos$block.id), sum) + row.names(tmp) <- as.character(tmp[, 1]) + resBlock$nbKeep <- tmp[as.character(listBlocks), 2] + + # Number of heterozygotes per block + tmp <- aggregate(snpPos[, c( "hetero"), drop=FALSE], + by = list(block=snpPos$block.id), sum) + row.names(tmp) <- as.character(tmp[, 1]) + resBlock$nbHetero <- tmp[as.character(listBlocks), 2] + + resBlock <- apply(resBlock, 1, FUN=function(x, snpPos) { + resBlock <- data.frame(block=x[1], nbHomo=x[2], nbKeep=x[3], + nbHetero=x[4], aRF=-1, aFraction=-1, + lR=-1, nPhase=-1, sumAlleleLow=-1, + sumAlleleHigh=-1, lH=-1, lM=-1, lRhomo=1) lH <- 1 lM <- 1 - if (resBlock[i, "nbKeep"] > 0 & - (resBlock[i, "nbKeep"] == resBlock[i, "nbHomo"] | - (resBlock[i, "nbHomo"] > 0 & resBlock[i, "nbHetero"] == 1)) ) { + if (resBlock[1, "nbKeep"] > 0 & + (resBlock[1, "nbKeep"] == resBlock[1, "nbHomo"] | + (resBlock[1, "nbHomo"] > 0 & resBlock[1, "nbHetero"] == 1))) { # Check if 1 hetero with allelic fraction (<=0.05) # it is considered as all homozygote flag <- TRUE - if (resBlock[i, "nbHetero"] == 1) { - tmp <- min(snp.pos[snp.pos$block.id == resBlock$block[i] & - snp.pos$hetero, c("cnt.ref" , "cnt.alt")])/ - sum(snp.pos[snp.pos$block.id == resBlock$block[i] & - snp.pos$hetero, c("cnt.ref" , "cnt.alt")]) - flag <- ifelse(tmp > 0.05, FALSE,TRUE) + if (resBlock[1, "nbHetero"] == 1) { + tmp <- min(snpPos[snpPos$block.id == resBlock$block[1] & + snpPos$hetero, c("cnt.ref" , "cnt.alt")])/ + sum(snpPos[snpPos$block.id == resBlock$block[1] & + snpPos$hetero, c("cnt.ref" , "cnt.alt")]) + + # flag is true if allelic fraction <= 0.05 + flag <- ifelse(tmp > 0.05, FALSE, TRUE) } - if(flag){ - listRef <- which(snp.pos$block.id == resBlock$block[i] & - snp.pos$homo & - snp.pos$cnt.ref > snp.pos$cnt.alt) - listAlt <- which(snp.pos$block.id == resBlock$block[i] & - snp.pos$homo & - snp.pos$cnt.ref < snp.pos$cnt.alt) - tmp <- snp.pos$freq[listRef] + + if(flag) { + # List homozygote ref + listRef <- which(snpPos$block.id == resBlock$block[1] & + snpPos$homo & snpPos$cnt.ref > snpPos$cnt.alt) + # list homozygote alt + listAlt <- which(snpPos$block.id == resBlock$block[1] & + snpPos$homo & snpPos$cnt.ref < snpPos$cnt.alt) + # freq of the Ref allele in population of listRef + tmp <- snpPos$freq[listRef] + ## min freq is 0.01 tmp[which(tmp < 0.01)] <- 0.01 + ## log10 of the product of the frequency of the alternative + ## allele in pop for listRef lH <- ifelse(length(listRef) > 0, sum(log10(1-tmp)*2), 0) - - tmp <- snp.pos$freq[listAlt] + ## freq of the Ref allele in population of listAlt + tmp <- snpPos$freq[listAlt] tmp[which(tmp < 0.01)] <- 0.01 + ## log10 of the product of the frequency of the + ## alternative allele in pop for listRef + ## plus log10 of the product of the frequency of + ##the reference allele in pop for listAlt lH <- lH + ifelse(length(listAlt) > 0, sum(log10(tmp)*2), 0) - lM <- sum(log10(apply(snp.pos[which(snp.pos$block.id == - resBlock$block[i] & snp.pos$homo), - "freq", drop=FALSE], 1, - FUN = function(x) { - return(max(x^2, 2*(x * (1-x)), (1-x)^2)) - }))) - resBlock$sumAlleleLow[i] <- 0 - resBlock$sumAlleleHigh[i] <- sum(snp.pos[listRef, "cnt.ref"]) + - sum(snp.pos[listAlt, "cnt.alt"]) + lM <- sum(log10(apply(snpPos[which(snpPos$block.id == + resBlock$block[1] & snpPos$homo), "freq", drop=FALSE], + 1, FUN=function(x) { + return(max(x^2, 2*(x * (1-x)), (1-x)^2))}))) + resBlock$sumAlleleLow[1] <- 0 + resBlock$sumAlleleHigh[1] <- + sum(snpPos[listRef, "cnt.ref"]) + + sum(snpPos[listAlt, "cnt.alt"]) + } } - } - resBlock[i, c("lH", "lM", "lRhomo")] <- c(lH, lM, lH - lM) + ## compute the score of the homozygote on the block + ## if heterozygote present lH = lM = 1 and lRhomo = 0 + resBlock[1, c("lH", "lM", "lRhomo")] <- c(lH, lM, lH-lM) - # get hetero and compute AF - if (resBlock[i, "nbKeep"] > 0 & resBlock[i, "nbHetero"] > 1) { + ## get hetero and compute AF nbHetero > 1 + if (resBlock[1, "nbKeep"] > 0 & resBlock[1, "nbHetero"] > 1) { - resML <- calcAFMLRNA(snp.pos[which(snp.pos$block.id == - resBlock$block[i] & snp.pos$hetero),]) + resML <- calcAFMLRNA(snpPos[which(snpPos$block.id == + resBlock$block[1] & snpPos$hetero),]) - resBlock$aFraction[i] <- resML$aFraction - resBlock$lR[i] <- resML$lR - resBlock$nPhase[i] <- resML$nPhase - resBlock$sumAlleleLow[i] <- resML$sumAlleleLow - resBlock$sumAlleleHigh[i] <- resML$sumAlleleHigh - } - } + resBlock$aFraction[1] <- resML$aFraction + resBlock$lR[1] <- resML$lR + resBlock$nPhase[1] <- resML$nPhase + resBlock$sumAlleleLow[1] <- resML$sumAlleleLow + resBlock$sumAlleleHigh[1] <- resML$sumAlleleHigh + } + return(resBlock) + }, snpPos=snpPos) + resBlock <- do.call(rbind, resBlock) return(resBlock) } diff --git a/R/gdsWrapper.R b/R/gdsWrapper.R index 53b25dee0..16ce32cc1 100644 --- a/R/gdsWrapper.R +++ b/R/gdsWrapper.R @@ -1,70 +1,12 @@ -#' @title Append sample names into a GDS file +#' @title Add information related to SNVs into a Population Reference GDS file #' -#' @description This function append the sample identifiers into the -#' "samples.id" node of a GDS file. -#' -#' @param gds an object of class -#' \link[gdsfmt]{gds.class} (a GDS file), the opened GDS file. -#' -#' @param listSample a \code{vector} of \code{character} string representing -#' the sample identifiers to be added to GDS file. -#' -#' -#' @return The integer \code{0L} when successful. -#' -#' @examples -#' -#' ## Create a temporary GDS file in an test directory -#' dataDir <- system.file("extdata/tests", package="RAIDS") -#' gdsFilePath <- file.path(dataDir, "GDS_TEMP_04.gds") -#' -#' ## Create and open the GDS file -#' GDS_file_tmp <- createfn.gds(filename=gdsFilePath) -#' -#' ## Create "sample.id" node (the node must be present) -#' add.gdsn(node=GDS_file_tmp, name="sample.id", val=c("sample_01", -#' "sample_02")) -#' -#' sync.gds(gdsfile=GDS_file_tmp) -#' -#' ## Add information about 2 samples to the GDS file -#' RAIDS:::appendGDSSampleOnly(gds=GDS_file_tmp, -#' listSamples=c("sample_03", "sample_04")) -#' -#' ## Read sample identifier list -#' ## Only "sample_03" and "sample_04" should have been added -#' read.gdsn(index.gdsn(node=GDS_file_tmp, path="sample.id")) -#' -#' ## Close GDS file -#' closefn.gds(gdsfile=GDS_file_tmp) -#' -#' ## Delete the temporary GDS file -#' unlink(x=gdsFilePath, force=TRUE) -#' -#' -#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @importFrom gdsfmt index.gdsn append.gdsn -#' @encoding UTF-8 -#' @keywords internal -appendGDSSampleOnly <- function(gds, listSamples) { - - sampleGDS <- index.gdsn(gds, "sample.id") - - append.gdsn(sampleGDS, val=listSamples, check=TRUE) - - return(0L) -} - - -#' @title Add information related to SNVs into a Reference GDS file -#' -#' @description the function adds the SNV information into a Reference -#' GDS file. +#' @description The function adds the SNV information into a Population +#' Reference GDS file. #' #' @param gdsReference an object of class #' \link[gdsfmt]{gds.class} (a GDS file), the opened Reference GDS file. #' -#' @param fileFREQ a \code{character} string representing the path and file +#' @param fileFreq a \code{character} string representing the path and file #' name of the RDS file with the filtered SNP information. #' #' @param verbose a \code{logical} indicating if messages should be printed @@ -75,35 +17,33 @@ appendGDSSampleOnly <- function(gds, listSamples) { #' @examples #' #' ## Required package -#' library(withr) +#' library(gdsfmt) #' -#' ## Path to the demo pedigree file is located in this package +#' ## The RDS file containing the filtered SNP information #' dataDir <- system.file("extdata", package="RAIDS") +#' fileFilerterSNVs <- file.path(dataDir, "mapSNVSelected_Demo.rds") #' -#' ## Temporary Reference GDS file -#' file1KG <- local_file(file.path(dataDir, "1KG_TEMP_002.gds")) +#' ## Temporary Reference GDS file in temporary directory +#' file1KG <- file.path(tempdir(), "1KG_TEMP_002.gds") #' filenewGDS <- createfn.gds(file1KG) #' -#' ## The RDS file containing the filtered SNP information -#' fileFilerterSNVs <- file.path(dataDir, "mapSNVSelected_Demo.rds") -#' #' ## Add SNV information to Reference GDS #' RAIDS:::generateGDSSNPinfo(gdsReference=filenewGDS, -#' fileFREQ=fileFilerterSNVs, verbose=TRUE) +#' fileFreq=fileFilerterSNVs, verbose=TRUE) #' #' ## Close GDS file (important) #' closefn.gds(filenewGDS) #' -#' ## Remove temporary files -#' deferred_run() +#' ## Remove temporary 1KG_TEMP_002.gds file +#' unlink(file1KG, force=TRUE) #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt add.gdsn #' @encoding UTF-8 #' @keywords internal -generateGDSSNPinfo <- function(gdsReference, fileFREQ, verbose) { +generateGDSSNPinfo <- function(gdsReference, fileFreq, verbose) { - mapSNVSel <- readRDS(file=fileFREQ) + mapSNVSel <- readRDS(file=fileFreq) if(verbose) { message("Read mapSNVSel DONE ", Sys.time()) } @@ -148,16 +88,19 @@ generateGDSSNPinfo <- function(gdsReference, fileFREQ, verbose) { } -#' @title Add information related to profile genotype into a Reference GDS file +#' @title Add information related to profile genotypes into a Population +#' Reference GDS file #' #' @description This function adds the genotype fields with the associated -#' information into the Reference GDS file for the selected profiles. +#' information into the Population Reference GDS file for the selected +#' profiles. #' #' @param gds an object of class -#' \link[gdsfmt]{gds.class} (a GDS file), the opened Reference GDS file. +#' \link[gdsfmt]{gds.class} (a GDS file), the opened Population Reference +#' GDS file. #' #' @param pathGeno a \code{character} string representing the path where -#' the 1K genotyping files for each sample are located. The name of the +#' the reference genotyping files for each sample are located. The name of the #' genotyping files must correspond to #' the individual identification (Individual.ID) in the pedigree file. #' @@ -176,7 +119,51 @@ generateGDSSNPinfo <- function(gdsReference, fileFREQ, verbose) { #' #' @examples #' -#' # TODO +#' ## Required library +#' library(gdsfmt) +#' +#' ## Path to the demo pedigree file is located in this package +#' dataDir <- system.file("extdata", package="RAIDS") +#' +#' ## Path where the demo genotype CSV files are located +#' pathGeno <- file.path(dataDir, "demoProfileGenotypes") +#' +#' ## The RDS file containing the pedigree information +#' pedigreeFile <- file.path(dataDir, "PedigreeReferenceDemo.rds") +#' +#' ## The RDS file containing the indexes of the retained SNPs +#' snpIndexFile <- file.path(dataDir, "listSNPIndexes_Demo.rds") +#' +#' ## The RDS file containing the filtered SNP information +#' filterSNVFile <- file.path(dataDir, "mapSNVSelected_Demo.rds") +#' +#' ## Temporary Reference GDS file in temporary directory +#' tempRefGDS <- file.path(tempdir(), "Ref_TEMP01.gds") +#' +#' ## Create temporary Reference GDS file +#' newGDS <- createfn.gds(tempRefGDS) +#' put.attr.gdsn(newGDS$root, "FileFormat", "SNP_ARRAY") +#' +#' ## Read the pedigree file +#' ped1KG <- readRDS(pedigreeFile) +#' +#' ## Add information about samples to the Reference GDS file +#' listSampleGDS <- RAIDS:::generateGDSRefSample(gdsReference=newGDS, +#' dfPedReference=ped1KG, listSamples=NULL) +#' +#' ## Add SNV information to the Reference GDS +#' RAIDS:::generateGDSSNPinfo(gdsReference=newGDS, fileFreq=filterSNVFile, +#' verbose=FALSE) +#' +#' ## Add genotype information to the Reference GDS +#' RAIDS:::generateGDSgenotype(gds=newGDS, pathGeno=pathGeno, +#' fileSNPsRDS=snpIndexFile, listSamples=listSampleGDS, verbose=FALSE) +#' +#' ## Close file +#' closefn.gds(newGDS) +#' +#' ## Remove temporary files +#' unlink(tempRefGDS, force=TRUE) #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt add.gdsn write.gdsn @@ -187,8 +174,8 @@ generateGDSgenotype <- function(gds, pathGeno, fileSNPsRDS, listSamples, verbose) { # File with the description of the SNP keep - listMat1k <- dir(pathGeno, pattern=".+.csv.bz2") - listSample1k <- gsub(".csv.bz2", "", listMat1k) + listMatRef <- dir(pathGeno, pattern=".+.csv.bz2") + listSample1k <- gsub(".csv.bz2", "", listMatRef) listSNP <- readRDS(fileSNPsRDS) @@ -198,7 +185,7 @@ generateGDSgenotype <- function(gds, pathGeno, fileSNPsRDS, listSamples, if(verbose) { message(listSamples[i]) } if( length(pos) == 1) { - matSample <- read.csv2(file.path(pathGeno, listMat1k[pos]), + matSample <- read.csv2(file.path(pathGeno, listMatRef[pos]), row.names=NULL) matSample <- matSample[listSNP,, drop=FALSE] if(i == 1) { @@ -219,7 +206,7 @@ generateGDSgenotype <- function(gds, pathGeno, fileSNPsRDS, listSamples, rm(matSample) - if(verbose) { message(listMat1k[pos], " ", i) } + if(verbose) { message(listMatRef[pos], " ", i) } }else{ stop("Missing samples genotype in ", listSamples[i]) } @@ -228,28 +215,89 @@ generateGDSgenotype <- function(gds, pathGeno, fileSNPsRDS, listSamples, return(0L) } -#' @title This function append the field genotype in the gds file +#' @title Append information related to profile genotypes into a Population +#' Reference GDS file (associated node already present in the GDS) #' -#' @description TODO +#' @description This function appends the genotype fields with the associated +#' information into the Population Reference GDS file for the selected +#' profiles. The associated node must already present in the GDS file. #' -#' @param gds a \code{gds} object. +#' @param gds an object of class +#' \link[gdsfmt]{gds.class} (a GDS file), the opened Population Reference +#' GDS file. #' -#' @param pathGeno TODO a PATH to a directory with the a file for each -#' samples with the genotype. +#' @param pathGeno a \code{character} string representing the path where +#' the reference genotyping files for each sample are located. The name of the +#' genotyping files must correspond to +#' the individual identification (Individual.ID) in the pedigree file. #' -#' @param fileSNPsRDS TODO list of SNP to keep in the file genotype +#' @param fileSNPsRDS a \code{character} string representing the path and file +#' name of the RDS file that contains the indexes of the retained SNPs. The +#' file must exist. The file must be a RDS file. #' -#' @param listSamples a \code{array} with the sample to keep +#' @param listSample a \code{character} string representing the path and file +#' name of the RDS file that contains the indexes of the retained SNPs. The +#' file must exist. The file must be a RDS file. #' #' @param verbose a \code{logical} indicating if the function must print -#' messages when running. Default: \code{FALSE}. +#' messages when running. #' #' @return The integer \code{0} when successful. #' #' @examples #' -#' # TODO -#' gds <- "Demo GDS TODO" +#' ## Required library +#' library(gdsfmt) +#' +#' ## Path to the demo pedigree file is located in this package +#' dataDir <- system.file("extdata", package="RAIDS") +#' +#' ## Path where the demo genotype CSV files are located +#' pathGeno <- file.path(dataDir, "demoProfileGenotypes") +#' +#' ## The RDS file containing the pedigree information +#' pedigreeFile <- file.path(dataDir, "PedigreeReferenceDemo.rds") +#' +#' ## The RDS file containing the indexes of the retained SNPs +#' snpIndexFile <- file.path(dataDir, "listSNPIndexes_Demo.rds") +#' +#' ## The RDS file containing the filtered SNP information +#' filterSNVFile <- file.path(dataDir, "mapSNVSelected_Demo.rds") +#' +#' ## Temporary Reference GDS file +#' tempRefGDS <- file.path(tempdir(), "Ref_TEMP02.gds") +#' +#' ## Create temporary Reference GDS file +#' newGDS <- createfn.gds(tempRefGDS) +#' put.attr.gdsn(newGDS$root, "FileFormat", "SNP_ARRAY") +#' +#' ## Read the pedigree file +#' ped1KG <- readRDS(pedigreeFile) +#' +#' ## Add information about samples to the Reference GDS file +#' listSampleGDS <- RAIDS:::generateGDSRefSample(gdsReference=newGDS, +#' dfPedReference=ped1KG, listSamples=NULL) +#' +#' ## Add SNV information to the Reference GDS +#' RAIDS:::generateGDSSNPinfo(gdsReference=newGDS, fileFreq=filterSNVFile, +#' verbose=FALSE) +#' +#' ## Add genotype information to the Reference GDS for the 3 first samples +#' RAIDS:::generateGDSgenotype(gds=newGDS, pathGeno=pathGeno, +#' fileSNPsRDS=snpIndexFile, listSamples=listSampleGDS[1:3], +#' verbose=FALSE) +#' +#' ## Append genotype information to the Reference GDS for the other samples +#' RAIDS:::appendGDSgenotype(gds=newGDS, pathGeno=pathGeno, +#' fileSNPsRDS=snpIndexFile, +#' listSample=listSampleGDS[4:length(listSampleGDS)], +#' verbose=FALSE) +#' +#' ## Close file +#' closefn.gds(newGDS) +#' +#' ## Remove temporary files +#' unlink(tempRefGDS, force=TRUE) #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt index.gdsn read.gdsn @@ -257,11 +305,11 @@ generateGDSgenotype <- function(gds, pathGeno, fileSNPsRDS, listSamples, #' @encoding UTF-8 #' @keywords internal appendGDSgenotype <- function(gds, listSample, pathGeno, fileSNPsRDS, - verbose=FALSE) { + verbose) { # File with the description of the SNP keep - listMat1k <- dir(pathGeno, pattern = ".+.csv.bz2") - listSample1k <- gsub(".csv.bz2", "", listMat1k) + listMatRef <- dir(pathGeno, pattern=".+.csv.bz2") + listSample1k <- gsub(".csv.bz2", "", listMatRef) listSNP <- readRDS(file=fileSNPsRDS) geno.var <- index.gdsn(gds, "genotype") @@ -271,7 +319,7 @@ appendGDSgenotype <- function(gds, listSample, pathGeno, fileSNPsRDS, for(i in seq_len(length(listSample))) { pos <- which(listSample1k == listSample[i]) if( length(pos) == 1) { - matSample <- read.csv2(file.path(pathGeno, listMat1k[pos]), + matSample <- read.csv2(file.path(pathGeno, listMatRef[pos]), row.names = NULL) matSample <- matSample[listSNP,, drop=FALSE] @@ -285,12 +333,12 @@ appendGDSgenotype <- function(gds, listSample, pathGeno, fileSNPsRDS, matSample[matSample[,1] == "1|1",1] <- 2 g <- as.matrix(matSample)[,1] - append.gdsn(geno.var,g, check=TRUE) + append.gdsn(geno.var, g, check=TRUE) rm(matSample) - if(verbose) { message(listMat1k[pos], " ", i) } + if(verbose) { message(listMatRef[pos], " ", i) } }else { - stop("Missing 1k samples ", listSample[i]) + stop("Missing reference samples ", listSample[i]) } } @@ -298,221 +346,107 @@ appendGDSgenotype <- function(gds, listSample, pathGeno, fileSNPsRDS, } -#' @title create a file tfam file for plink from the gdsReference file -#' -#' @description TODO -#' -#' @param gdsReference an object of class -#' \link[gdsfmt]{gds.class} (a GDS file), the opened 1KG GDS file. -#' -#' @param listSample a \code{array} with the sample to keep TODO -#' -#' @param pedOUT TODO a PATH and file name to the output file -#' -#' @return TODO a \code{vector} of \code{numeric} -#' -#' @examples -#' -#' # TODO -#' gds <- "Demo GDS TODO" -#' -#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @importFrom gdsfmt index.gdsn read.gdsn -#' @importFrom utils write.table -#' @encoding UTF-8 -#' @keywords export -gds2tfam <- function(gdsReference, listSample, pedOUT) { - - sampleGDS <- index.gdsn(gdsReference, "sample.id") - sampleId <-read.gdsn(sampleGDS) - listS <- which(sampleId %in% listSample) - - sampleGDS <- index.gdsn(gdsReference, "sample.annot") - sampleANNO <-read.gdsn(sampleGDS) - - dfPed <- data.frame(famId=paste0("F", seq_len(length(listSample))), - id=sampleId[listS], - fa=rep("0",length(listSample)), - mo=rep("0",length(listSample)), - sex=sampleANNO$sex[listS], - pheno=rep(1,length(listSample)), - stringsAsFactors=FALSE) - - write.table(dfPed, pedOUT, - quote=FALSE, sep="\t", - row.names=FALSE, - col.names=FALSE) - -} - -#' @title create a file tfam file for plink from the gdsProfile file +#' @title Add block information in a Population Reference GDS Annotation file #' -#' @description TODO +#' @description This function appends the information for one specific type +#' of blocks into a Population Reference GDS Annotation file. More +#' specifically, the node 'block.annot' is created if it does not exists. This +#' node contains a \code{data.frame} which will be append the description of +#' the current block. The node 'block' is also created if it does not exists. +#' This node is a \code{matrix} that will contain all the entries for the +#' current block. All the values for a specific block type are contained in a +#' single column that corresponds to the row number in the 'block.annot' node. #' -#' @param gdsProfile an object of class \link[gdsfmt]{gds.class} (a GDS -#' file), the open Profile GDS file. +#' @param gds an object of class \code{gds} opened in writing mode. #' -#' @param listSample a \code{array} with the sample to keep +#' @param listBlock a \code{array} of \code{integer} representing all the +#' entries for the current block. #' -#' @param sampleANNO a \code{data.frame} with at least column sex and the name -#' must be sample.id +#' @param blockName a \code{character} string representing the unique +#' block name. #' -#' @param pedOUT TODO a PATH and file name to the output file +#' @param blockDesc a \code{character} string representing the description of +#' the current block. #' -#' -#' @return TODO a \code{vector} of \code{numeric} +#' @return The integer \code{0L} when successful. #' #' @examples #' -#' # TODO -#' gds <- "Demo GDS TODO" -#' -#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @importFrom gdsfmt index.gdsn read.gdsn -#' @importFrom utils write.table -#' @encoding UTF-8 -#' @keywords export -gds2tfamSample <- function(gdsProfile, listSample, sampleANNO, pedOUT) { - - sampleGDS <- index.gdsn(gdsProfile, "sample.id") - sampleId <-read.gdsn(sampleGDS) - listS <- which(sampleId %in% listSample) - - dfPed <- data.frame(famId=paste0("F", seq_len(length(listSample))), - id=sampleId[listS], - fa=rep("0",length(listSample)), - mo=rep("0",length(listSample)), - sex=sampleANNO[sampleId[listS], "sex"], - pheno=rep(1,length(listSample)), - stringsAsFactors=FALSE) - - write.table(dfPed, pedOUT, quote=FALSE, sep="\t", - row.names=FALSE, col.names=FALSE) -} - - -#' @title create a file tped file for plink from the gds file -#' -#' @description TODO -#' -#' @param gds a \code{gds} object. -#' -#' @param listSample a \code{array} with the sample to keep #' -#' @param listSNP a \code{array} with the snp.id to keep +#' ## Required library +#' library(gdsfmt) #' -#' @param pedOUT TODO a PATH and file name to the output file +#' ## Temporary GDS Annotation file in current directory +#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP_Annot_14.gds") #' -#' @return TODO a \code{vector} of \code{numeric} +#' ## Create and open the GDS file +#' GDS_file_tmp <- createfn.gds(filename=gdsFilePath) #' -#' @examples +#' ## One block +#' blockType <- "EAS.0.05.500k" #' -#' # TODO -#' gds <- "Demo GDS TODO" +#' ## The description of the block +#' blockDescription <- "EAS population blocks based on 500k windows" #' -#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @importFrom gdsfmt index.gdsn read.gdsn -#' @importFrom utils write.table -#' @encoding UTF-8 -#' @keywords export -gds2tped <- function(gds, listSample, listSNP, pedOUT) { - - sampleGDS <- index.gdsn(gds, "sample.id") - sampleId <-read.gdsn(sampleGDS) - listS <- which(sampleId %in% listSample) - - snpGDS <- index.gdsn(gds, "snp.id") - snpId <- read.gdsn(snpGDS) - listKeep <- which(snpId %in% listSNP) - snpId <- snpId[listKeep] - - snpGDS <- index.gdsn(gds, "snp.chromosome") - snpChr <- read.gdsn(snpGDS) - snpChr <- snpChr[listKeep] - - snpGDS <- index.gdsn(gds, "snp.position") - snpPos <- read.gdsn(snpGDS) - snpPos <- snpPos[listKeep] - - tped <- list() - tped[[1]] <- snpChr - tped[[2]] <- snpId - tped[[3]] <- rep(0,length(snpId)) - tped[[4]] <- snpPos - k<-4 - geno.var <- index.gdsn(gds, "genotype") - for(i in listS){ - k <- k + 1 - - tmp <- read.gdsn(geno.var, start=c(1, i), count=c(-1,1))[listKeep] - - # 0 1 0 1 0 1 - tped[[k]] <- (tmp == 2) + 1 - k <- k + 1 - tped[[k]] <- (tmp > 0) + 1 - - } - - write.table(tped, pedOUT, quote=FALSE, sep="\t", row.names=FALSE, - col.names=FALSE) - -} - - -#' @title TODO +#' ## The values for each entry related to the block (integers) +#' blockEntries <- c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3) #' -#' @description TODO +#' RAIDS:::addBlockInGDSAnnot(gds=GDS_file_tmp, listBlock=blockEntries, +#' blockName=blockType, blockDesc=blockDescription) #' -#' @param gds an object of class \code{gds} opened for the sample +#' ## Read 'block.annot' node +#' read.gdsn(index.gdsn(GDS_file_tmp, "block.annot")) #' -#' @param listBlock TODO +#' ## Read 'block' node +#' read.gdsn(index.gdsn(GDS_file_tmp, "block")) #' -#' @param blockName TODO -#' -#' @param blockDesc TODO -#' -#' @return The integer \code{0L} when successful. +#' ## Close GDS file +#' closefn.gds(gdsfile=GDS_file_tmp) #' -#' @examples +#' ## Delete the temporary GDS file +#' unlink(x=gdsFilePath, force=TRUE) #' -#' # TODO -#' gds <- "Demo GDS TODO" #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt add.gdsn index.gdsn ls.gdsn compression.gdsn #' @importFrom gdsfmt append.gdsn sync.gds #' @encoding UTF-8 #' @keywords internal -addGDS1KGLDBlock <- function(gds, listBlock, blockName, blockDesc) { +addBlockInGDSAnnot <- function(gds, listBlock, blockName, blockDesc) { - block.annot <- data.frame(block.id=blockName, + blockAnnot <- data.frame(block.id=blockName, block.desc=blockDesc, stringsAsFactors=FALSE) if(! ("block.annot" %in% ls.gdsn(gds))) { - var.block.annot <- add.gdsn(gds, "block.annot", block.annot) + ## Create 'block.annot' node when not existing + varBlockAnnot <- add.gdsn(gds, "block.annot", blockAnnot) }else { + ## Append content to 'block.annot' node when existing curAnnot <- index.gdsn(gds, "block.annot/block.id") - append.gdsn(curAnnot,block.annot$block.id) + append.gdsn(curAnnot, blockAnnot$block.id) curAnnot <- index.gdsn(gds, "block.annot/block.desc") - append.gdsn(curAnnot, block.annot$block.desc) + append.gdsn(curAnnot, blockAnnot$block.desc) } - var.block <- NULL - if(! ("block" %in% ls.gdsn(gds))){ - var.block <- add.gdsn(gds, "block", + varBlock <- NULL + if(!("block" %in% ls.gdsn(gds))) { + ## Create 'block' node that will contain a matrix of integers + ## stored in compressed mode + varBlock <- add.gdsn(node=gds, name="block", valdim=c(length(listBlock), 1), listBlock, storage="int32", - compress = "LZ4_RA") - readmode.gdsn(var.block) + compress="LZ4_RA") + readmode.gdsn(varBlock) - }else { - if(is.null(var.block)) { - var.block <- index.gdsn(gds, "block") - var.block <- compression.gdsn(var.block, "") + } else { + if(is.null(varBlock)) { + varBlock <- index.gdsn(gds, "block") + varBlock <- compression.gdsn(varBlock, "") } - append.gdsn(var.block, listBlock) - var.block <- compression.gdsn(var.block, "LZ4_RA") + append.gdsn(varBlock, listBlock) + ## Compressed data using LZ4_RA method + varBlock <- compression.gdsn(varBlock, "LZ4_RA") } sync.gds(gds) @@ -521,105 +455,5 @@ addGDS1KGLDBlock <- function(gds, listBlock, blockName, blockDesc) { } -#' @title Add information related to segments associated to the SNV -#' dataset for a specific sample into a GDS file -#' -#' @description The function adds the information related to segments -#' associated to the SNV dataset for a specific sample into a -#' GDS file, more specifically, in the "segment" node. If the "segment" node -#' already exists, the previous information is erased. -#' -#' @param gdsProfile an object of class \code{\link[gdsfmt]{gds.class}} -#' (a GDS file), a GDS Sample file. -#' -#' @param snp.seg a \code{vector} of \code{integer} representing the segment -#' identifiers associated to each SNV selected for the specific sample. The -#' length of the \code{vector} should correspond to the number of SNVs -#' present in the "snp.id" entry of the GDS sample file. -#' -#' @return The integer \code{0L} when successful. -#' -#' @examples -#' -#' ## Create a temporary GDS file in an test directory -#' dataDir <- system.file("extdata/tests", package="RAIDS") -#' gdsFilePath <- file.path(dataDir, "GDS_TEMP.gds") -#' -#' ## Create and open the GDS file -#' GDS_file_tmp <- createfn.gds(filename=gdsFilePath) -#' -#' ## Vector of segment identifiers -#' segments <- c(1L, 1L, 1L, 2L, 2L, 3L, 3L) -#' -#' ## Add segments to the GDS file -#' RAIDS:::addUpdateSegment(gdsProfile=GDS_file_tmp, snp.seg=segments) -#' -#' ## Read segments information from GDS file -#' read.gdsn(index.gdsn(node=GDS_file_tmp, path="segment")) -#' -#' ## Close GDS file -#' closefn.gds(gdsfile=GDS_file_tmp) -#' -#' ## Delete the temporary GDS file -#' unlink(x=gdsFilePath, force=TRUE) -#' -#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @importFrom gdsfmt add.gdsn index.gdsn delete.gdsn sync.gds ls.gdsn -#' @encoding UTF-8 -#' @keywords internal -addUpdateSegment <- function(gdsProfile, snp.seg) { - - if("segment" %in% ls.gdsn(gdsProfile)) { - snpLap <- write.gdsn(index.gdsn(gdsProfile, "segment"), snp.seg) - } else{ - snpLap <- add.gdsn(gdsProfile, "segment", snp.seg, storage="uint32") - } - - sync.gds(gdsProfile) - ## Successful - return(0L) -} - - -#' @title Get the block number for each SNV in snp.index -#' -#' @description TODO -#' -#' @param gdsRefAnnot an object of class \code{\link[gdsfmt]{gds.class}} -#' (a GDS file), the opened 1KG SNV Annotation GDS file. RNA specific -#' Default: \code{NULL}. -#' -#' @param snp.index TODO -#' -#' @param blockID a \code{character} string corresponding to the block -#' identifier in \code{gdsRefAnnot}. RNA specific -#' Default: \code{NULL} -#' -#' @return TODO a \code{vector} of \code{numeric} corresponding to the -#' block identifier -#' -#' @examples -#' -#' # TODO -#' gds <- "Demo GDS TODO" -#' -#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @importFrom gdsfmt index.gdsn read.gdsn -#' @encoding UTF-8 -#' @keywords internal -getGeneBlock <- function(gdsRefAnnot, snp.index, blockID) { - - block.annot <- read.gdsn(index.gdsn(gdsRefAnnot, "block.annot")) - pos <- which(block.annot$block.id == blockID) - - if(length(pos) != 1) { - stop("Try to get Gene.Block with blockID problematic ", blockID) - } - - b <- read.gdsn(index.gdsn(gdsRefAnnot, "block"), start=c(1,pos), - count = c(-1,1))[snp.index] - - return(b) -} diff --git a/R/gdsWrapper_internal.R b/R/gdsWrapper_internal.R index 9ab8b5f24..57672c9c9 100644 --- a/R/gdsWrapper_internal.R +++ b/R/gdsWrapper_internal.R @@ -31,28 +31,27 @@ #' ## Required library #' library(gdsfmt) #' -#' ## Create a temporary GDS file in an test directory -#' dataDir <- system.file("extdata/tests", package="RAIDS") -#' gdsFilePath <- file.path(dataDir, "GDS_TEMP_10.gds") +#' ## Temporary GDS file in current directory +#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP_10.gds") #' #' ## Create and open the GDS file #' tmpGDS <- createfn.gds(filename=gdsFilePath) #' #' ## Create "sample.annot" node (the node must be present) #' pedInformation <- data.frame(sample.id=c("sample_01", "sample_02"), -#' Name.ID=c("sample_01", "sample_02"), -#' sex=c(1,1), # 1:Male 2: Female -#' pop.group=c("ACB", "ACB"), -#' superPop=c("AFR", "AFR"), -#' batch=c(1, 1), -#' stringsAsFactors=FALSE) +#' Name.ID=c("sample_01", "sample_02"), +#' sex=c(1,1), # 1:Male 2: Female +#' pop.group=c("ACB", "ACB"), +#' superPop=c("AFR", "AFR"), +#' batch=c(1, 1), +#' stringsAsFactors=FALSE) #' #' ## The row names must be the sample identifiers #' rownames(pedInformation) <- pedInformation$Name.ID #' #' ## Add information about 2 samples to the GDS file #' RAIDS:::generateGDSRefSample(gdsReference=tmpGDS, -#' dfPedReference=pedInformation, listSamples=NULL) +#' dfPedReference=pedInformation, listSamples=NULL) #' #' ## Read sample identifier list #' read.gdsn(index.gdsn(node=tmpGDS, path="sample.id")) @@ -66,6 +65,7 @@ #' ## Delete the temporary GDS file #' unlink(x=gdsFilePath, force=TRUE) #' +#' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt add.gdsn #' @encoding UTF-8 @@ -119,13 +119,12 @@ generateGDSRefSample <- function(gdsReference, dfPedReference, #' ## Required library #' library(gdsfmt) #' -#' ## Locate RDS with unrelated/related status for 1KG samples +#' ## Locate RDS with unrelated/related status for Reference samples #' dataDir <- system.file("extdata", package="RAIDS") #' rdsFilePath <- file.path(dataDir, "unrelatedPatientsInfo_Demo.rds") #' -#' ## Create a temporary GDS file in an test directory -#' dataDir <- system.file("extdata/tests", package="RAIDS") -#' gdsFilePath <- file.path(dataDir, "GDS_TEMP_11.gds") +#' ## Temporary GDS file +#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP_11.gds") #' #' ## Create and open the GDS file #' tmpGDS <- createfn.gds(filename=gdsFilePath) @@ -134,7 +133,7 @@ generateGDSRefSample <- function(gdsReference, dfPedReference, #' sampleIDs <- c("HG00104", "HG00109", "HG00110") #' add.gdsn(node=tmpGDS, name="sample.id", val=sampleIDs) #' -#' ## Create "sample.ref" node in GDS file using RDS information +#' ## Create "sample.ref" node in GDS file using RDS information #' RAIDS:::addGDSRef(gdsReference=tmpGDS, filePart=rdsFilePath) #' #' ## Read sample reference data.frame @@ -146,6 +145,7 @@ generateGDSRefSample <- function(gdsReference, dfPedReference, #' ## Delete the temporary GDS file #' unlink(x=gdsFilePath, force=TRUE) #' +#' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt add.gdsn #' @encoding UTF-8 @@ -195,9 +195,8 @@ addGDSRef <- function(gdsReference, filePart) { #' ## Required library #' library(gdsfmt) #' -#' ## Create a temporary GDS file in an test directory -#' dataDir <- system.file("extdata/tests", package="RAIDS") -#' gdsFilePath <- file.path(dataDir, "GDS_TEMP_06.gds") +#' ## Create a temporary GDS file +#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP_06.gds") #' #' ## Create and open the GDS file #' tmpGDS <- createfn.gds(filename=gdsFilePath) @@ -225,6 +224,7 @@ addGDSRef <- function(gdsReference, filePart) { #' ## Delete the temporary GDS file #' unlink(x=gdsFilePath, force=TRUE) #' +#' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt index.gdsn read.gdsn #' @importFrom utils read.csv2 @@ -246,9 +246,14 @@ appendGDSgenotypeMat <- function(gds, matG) { #' profile into the Profile GDS file. The genotype information is extracted #' from a SNV file as generated by SNP-pileup or other tools. #' -#' @param pathGeno a \code{character} string representing the path to a -#' directory with the genotype files for the profiles, as generated by -#' snp-pileup. The path must exist. +#' @param pathGeno a \code{character} string representing the path to the +#' directory containing the VCF output of SNP-pileup for each sample. The +#' SNP-pileup files must be compressed (gz files) and have the name identifiers +#' of the samples. A sample with "Name.ID" identifier would have an +#' associated file called +#' if genoSource is "VCF", then "Name.ID.vcf.gz", +#' if genoSource is "generic", then "Name.ID.generic.txt.gz" +#' if genoSource is "snp-pileup", then "Name.ID.txt.gz". #' #' @param listSamples a \code{vector} of \code{character} string corresponding #' to the sample identifiers that will have a Profile GDS file created. The @@ -276,8 +281,8 @@ appendGDSgenotypeMat <- function(gds, matG) { #' @param dfPedProfile a \code{data.frame} with the information about #' the sample(s). #' Those are mandatory columns: "Name.ID", -#' "Case.ID", "Sample.Type", "Diagnosis", "Source". All columns must be in -#' \code{character} strings. The \code{data.frame} +#' "Case.ID", "Sample.Type", "Diagnosis" and "Source". All columns must be in +#' \code{character} strings format. The \code{data.frame} #' must contain the information for all the samples passed in the #' \code{listSamples} parameter. #' @@ -293,13 +298,15 @@ appendGDSgenotypeMat <- function(gds, matG) { #' the directory where the GDS Sample files will be created. #' #' @param genoSource a \code{character} string with two possible values: -#' 'snp-pileup' or 'generic'. It specifies if the genotype files +#' 'snp-pileup', 'generic' or 'VCF'. It specifies if the genotype files #' are generated by snp-pileup (Facets) or are a generic format CSV file #' with at least those columns: #' 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. #' The 'Count' is the depth at the specified position; #' 'FileR' is the depth of the reference allele and #' 'File1A' is the depth of the specific alternative allele. +#' Finally the file can be a VCF file with at least those genotype +#' fields: GT, AD, DP. #' #' @param verbose a \code{logical} indicating if the function must print #' messages when running. @@ -308,8 +315,12 @@ appendGDSgenotypeMat <- function(gds, matG) { #' #' @examples #' -#' ## Path to the files in this package -#' dataDir <- system.file("extdata/tests", package="RAIDS") +#' ## Current directory +#' dataDir <- file.path(tempdir()) +#' +#' ## Copy required file into current directory +#' file.copy(from=file.path(system.file("extdata/tests", package="RAIDS"), +#' "ex1.txt.gz"), to=dataDir) #' #' ## The data.frame containing the information about the study #' ## The 3 mandatory columns: "study.id", "study.desc", "study.platform" @@ -330,8 +341,8 @@ appendGDSgenotypeMat <- function(gds, matG) { #' #' ## List of SNV positions #' listPositions <- data.frame(snp.chromosome=c(rep(1, 10)), -#' snp.position=c(3467333, 3467428, 3469375, 3469387, 3469502, 3469527, -#' 3469737, 3471497, 3471565, 3471618)) +#' snp.position=c(3467333, 3467428, 3469375, 3469387, 3469502, 3469527, +#' 3469737, 3471497, 3471565, 3471618)) #' #' ## Append genotype information to the Profile GDS file #' result <- RAIDS:::generateGDS1KGgenotypeFromSNPPileup(pathGeno=dataDir, @@ -350,6 +361,8 @@ appendGDSgenotypeMat <- function(gds, matG) { #' #' ## Unlink Profile GDS file (created for demo purpose) #' unlink(file.path(dataDir, "ex1.gds")) +#' unlink(file.path(dataDir, "ex1.txt.gz")) +#' #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt add.gdsn write.gdsn openfn.gds @@ -363,8 +376,14 @@ generateGDS1KGgenotypeFromSNPPileup <- function(pathGeno, genoSource, verbose) { # File with the description of the SNP keep - listMat <- dir(pathGeno, pattern = ".+.txt.gz") - listSampleFile <- gsub(".txt.gz", "", listMat) + if(genoSource == "VCF"){ + listMat <- dir(pathGeno, pattern = ".+.vcf.gz") + listSampleFile <- gsub(".vcf.gz", "", listMat) + }else{ + listMat <- dir(pathGeno, pattern = ".+.txt.gz") + listSampleFile <- gsub(".txt.gz", "", listMat) + } + g <- as.matrix(rep(-1, nrow(listPos))) @@ -382,6 +401,11 @@ generateGDS1KGgenotypeFromSNPPileup <- function(pathGeno, } else if(genoSource == "generic") { matSample <- readSNVFileGeneric(file.path(pathGeno, listMat[pos]), offset) + } else if(genoSource == "VCF") { + tmpProfile <- gsub(".vcf.gz", "",listMat[pos]) + matSample <- readSNVVCF(file.path(pathGeno, + listMat[pos]), + profileName=tmpProfile, offset) } # matAll <- merge(matSample[,c( "Chromosome", "Position", # "File1R", "File1A", @@ -527,6 +551,308 @@ generateGDS1KGgenotypeFromSNPPileup <- function(pathGeno, return(0L) } +#' @title Append the genotype information from a profile into the associated +#' Profile GDS File +#' +#' @description This function append the genotype information from a specific +#' profile into the Profile GDS file. The genotype information is extracted +#' from a SNV file as generated by SNP-pileup or other tools. +#' +#' @param profileFile a \code{character} string representing the path and the +#' file name of the genotype file or the bam if genoSource is snp-pileup the +#' fine extension must be .txt.gz, if VCF the extension must be .vcf.gz +#' +#' @param profileName a \code{character} string representing the profileName +#' +#' @param listPos a \code{data.frame} containing 2 columns. The first column, +#' called "snp.chromosome" contains the name of the chromosome where the +#' SNV is located. The second column, called "snp.position" contains the +#' position of the SNV on the chromosome. +#' +#' @param offset a \code{integer} to adjust if the genome start at 0 or 1. +#' +#' @param minCov a single positive \code{integer} representing the minimum +#' coverage needed to keep the SNVs in the analysis. Default: \code{10}. +#' +#' @param minProb a single positive \code{numeric} between 0 and 1 +#' representing the probability that the base change at the SNV position +#' is not an error. +#' Default: \code{0.999}. +#' +#' @param seqError a single positive \code{numeric} between 0 and 1 +#' representing the sequencing error rate. Default: \code{0.001}. +#' +#' @param dfPedProfile a \code{data.frame} with the information about +#' the sample(s). +#' Those are mandatory columns: "Name.ID", +#' "Case.ID", "Sample.Type", "Diagnosis" and "Source". All columns must be in +#' \code{character} strings format. The \code{data.frame} +#' must contain the information for all the samples passed in the +#' \code{listSamples} parameter. +#' +#' @param batch a single positive \code{integer} representing the current +#' identifier for the batch. Beware, this field is not stored anymore. +#' +#' @param studyDF a \code{data.frame} containing the information about the +#' study associated to the analysed sample(s). The \code{data.frame} must have +#' those 3 columns: "study.id", "study.desc", "study.platform". All columns +#' must be in \code{character} strings. +#' +#' @param pathProfileGDS a \code{character} string representing the path to +#' the directory where the GDS Sample files will be created. +#' +#' @param genoSource a \code{character} string with two possible values: +#' 'snp-pileup', 'generic' or 'VCF'. It specifies if the genotype files +#' are generated by snp-pileup (Facets) or are a generic format CSV file +#' with at least those columns: +#' 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. +#' The 'Count' is the depth at the specified position; +#' 'FileR' is the depth of the reference allele and +#' 'File1A' is the depth of the specific alternative allele. +#' Finally the file can be a VCF file with at least those genotype +#' fields: GT, AD, DP. +#' +#' @param paramProfileGDS a \code{list} parameters ... +#' +#' @param verbose a \code{logical} indicating if the function must print +#' messages when running. +#' +#' @return The function returns \code{0L} when successful. +#' +#' @examples +#' +#' ## Current directory +#' dataDir <- file.path(tempdir()) +#' +#' ## Copy required file into current directory +#' file.copy(from=file.path(system.file("extdata/tests", package="RAIDS"), +#' "ex1.txt.gz"), to=dataDir) +#' +#' ## The data.frame containing the information about the study +#' ## The 3 mandatory columns: "study.id", "study.desc", "study.platform" +#' ## The entries should be strings, not factors (stringsAsFactors=FALSE) +#' studyDF <- data.frame(study.id = "MYDATA", +#' study.desc = "Description", +#' study.platform = "PLATFORM", +#' stringsAsFactors = FALSE) +#' +#' ## The data.frame containing the information about the samples +#' ## The entries should be strings, not factors (stringsAsFactors=FALSE) +#' samplePED <- data.frame(Name.ID=c("ex1", "ex2"), +#' Case.ID=c("Patient_h11", "Patient_h12"), +#' Diagnosis=rep("Cancer", 2), +#' Sample.Type=rep("Primary Tumor", 2), +#' Source=rep("Databank B", 2), stringsAsFactors=FALSE) +#' rownames(samplePED) <- samplePED$Name.ID +#' +#' ## List of SNV positions +#' listPositions <- data.frame(snp.chromosome=c(rep(1, 10)), +#' snp.position=c(3467333, 3467428, 3469375, 3469387, 3469502, 3469527, +#' 3469737, 3471497, 3471565, 3471618)) +#' +#' ## Append genotype information to the Profile GDS file +#' result <- RAIDS:::generateProfileGDS(profileFile=file.path(dataDir, "ex1.txt.gz"), +#' profileName="ex1", listPos=listPositions, +#' offset=-1, minCov=10, minProb=0.999, seqError=0.001, +#' dfPedProfile=samplePED, batch=1, studyDF=studyDF, +#' pathProfileGDS=dataDir, genoSource="snp-pileup", +#' verbose=FALSE) +#' +#' ## The function returns OL when successful +#' result +#' +#' ## The Profile GDS file 'ex1.gds' has been created in the +#' ## specified directory +#' list.files(dataDir) +#' +#' ## Unlink Profile GDS file (created for demo purpose) +#' unlink(file.path(dataDir, "ex1.gds")) +#' unlink(file.path(dataDir, "ex1.txt.gz")) +#' +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' @importFrom gdsfmt add.gdsn write.gdsn openfn.gds +#' @importFrom stats qbinom +#' @importFrom utils read.csv +#' @encoding UTF-8 +#' @keywords internal +generateProfileGDS <- function(profileFile, profileName, listPos, offset, + minCov=10, minProb=0.999, seqError=0.001, dfPedProfile, batch, studyDF, + pathProfileGDS, genoSource, paramProfileGDS, verbose) { + + # File with the description of the SNP keep + # if(genoSource == "VCF"){ + # listMat <- dir(pathGeno, pattern = ".+.vcf.gz") + # listSampleFile <- gsub(".vcf.gz", "", listMat) + # }else{ + # listMat <- dir(pathGeno, pattern = ".+.txt.gz") + # listSampleFile <- gsub(".txt.gz", "", listMat) + # } + + + + + # for(i in seq_len(length(listSamples))) { + #pos <- which(listSampleFile == listSamples[i]) + + if(verbose) { message("generateProfileGDS start ", " ", Sys.time()) } + +# if(length(pos) == 1) { + + # + if(genoSource == "snp-pileup") { + matSample <- readSNVPileupFile(profileFile, offset) + } else if(genoSource == "generic") { + matSample <- readSNVFileGeneric(profileFile, offset) + } else if(genoSource == "VCF") { + # tmpProfile <- gsub(".vcf.gz", "",listMat[pos]) + matSample <- readSNVVCF(profileFile, + profileName=profileName, offset) + } else if(genoSource == "bam"){ + + matSample <- readSNVBAM(fileName=profileFile, + varSelected=listPos, paramSNVBAM=paramProfileGDS, + offset, verbose=verbose) + # listPos <- do.call(rbind, listPos) + colnames(listPos)[seq_len(2)] <- c("snp.chromosome", "snp.position") + + } + # matAll <- merge(matSample[,c( "Chromosome", "Position", + # "File1R", "File1A", + # "count" )], + # listPos, + # by.x = c("Chromosome", "Position"), + # by.y = c("snp.chromosome", "snp.position"), + # all.y = TRUE, + # all.x = FALSE) + # + # below same as the merge above but faster + + if(verbose) { message("End read ", Sys.time()) } + g <- as.matrix(rep(-1, nrow(listPos))) + z <- cbind(c(listPos$snp.chromosome, matSample$Chromosome, + matSample$Chromosome), + c(listPos$snp.position, matSample$Position, + matSample$Position), + c(rep(1,nrow(listPos)), rep(0,nrow(matSample)), + rep(2,nrow(matSample))), + c(rep(0,nrow(listPos)), matSample[, "File1R"], + -1 * matSample[, "File1R"]), + c(rep(0,nrow(listPos)), matSample[, "File1A"], + -1 * matSample[, "File1A"]), + c(rep(0,nrow(listPos)), matSample[, "count"], + -1 * matSample[, "count"])) + rm(matSample) + z <- z[order(z[,1], z[,2], z[,3]),] + + matAll <- data.frame(Chromosome=z[z[, 3] == 1, 1], + Position=z[z[, 3] == 1, 2], File1R=cumsum(z[, 4])[z[, 3] == 1], + File1A=cumsum(z[,5])[z[, 3] == 1], + count=cumsum(z[, 6])[z[, 3] == 1]) + rm(z) + + if(is.null(pathProfileGDS)){ + stop("pathProfileGDS is NULL in ", + "generateGDS1KGgenotypeFromSNPPileup\n") + } else{ + if(!dir.exists(pathProfileGDS)) { dir.create(pathProfileGDS) } + } + fileGDSSample <- file.path(pathProfileGDS, paste0(profileName, ".gds")) + + if(file.exists(fileGDSSample)) { + gdsSample <- openfn.gds(fileGDSSample, readonly=FALSE) + } else{ + gdsSample <- createfn.gds(fileGDSSample) + } + + if (! "Ref.count" %in% ls.gdsn(gdsSample)) { + var.Ref <- add.gdsn(gdsSample, "Ref.count", matAll$File1R, + valdim=c( nrow(listPos), 1), storage="sp.int16") + var.Alt <- add.gdsn(gdsSample, "Alt.count", matAll$File1A, + valdim=c( nrow(listPos), 1), storage="sp.int16") + var.Count <- add.gdsn(gdsSample, "Total.count", matAll$count, + valdim=c( nrow(listPos), 1), storage="sp.int16") + } else { + # you must append + var.Ref <- append.gdsn(index.gdsn(gdsSample, "Ref.count"), + matAll$File1R) + var.Alt <- append.gdsn(index.gdsn(gdsSample, "Alt.count"), + matAll$File1A) + var.Count <- append.gdsn(index.gdsn(gdsSample, "Total.count"), + matAll$count) + } + + listSampleGDS <- addStudyGDSSample(gdsSample, pedProfile=dfPedProfile, + batch=batch, listSamples=c(profileName), studyDF=studyDF, + verbose=verbose) + + listCount <- table(matAll$count[matAll$count >= minCov]) + cutOffA <- + data.frame(count=unlist(vapply(as.integer(names(listCount)), + FUN=function(x, minProb, eProb){ + return(max(2,qbinom(minProb, x, eProb))) }, + FUN.VALUE=numeric(1), minProb=minProb, + eProb=2 * seqError)), + allele=unlist(vapply(as.integer(names(listCount)), + FUN=function(x, minProb, eProb){ + return(max(2,qbinom(minProb, x, eProb))) }, + FUN.VALUE=numeric(1), minProb=minProb, + eProb=seqError))) + + row.names(cutOffA) <- names(listCount) + # Initialize the genotype array at -1 + + # Select the position where the coverage of the 2 alleles is enough + listCov <- which(rowSums(matAll[, c("File1R", "File1A")]) >= minCov) + + matAllC <- matAll[listCov,] + + # The difference depth - (nb Ref + nb Alt) can be realistically + # explain by sequencing error + listCov <- listCov[(matAllC$count - + (matAllC$File1R + matAllC$File1A)) < + cutOffA[as.character(matAllC$count), "count"]] + + matAllC <- matAll[listCov,] + rm(matAll) + + g <- as.matrix(rep(-1, nrow(listPos))) + # The sample is homozygote if the other known allele have a + # coverage of 0 + g[listCov][which(matAllC$File1A == 0)] <- 0 + g[listCov][which(matAllC$File1R == 0)] <- 2 + + # The sample is heterozygote if explain the coverage of + # the minor allele by sequencing error is not realistic. + g[listCov][which(matAllC$File1A >= + cutOffA[as.character(matAllC$count), "allele"] & + matAllC$File1R >= cutOffA[as.character(matAllC$count), + "allele"])] <- 1 + + #g <- as.matrix(g) + if("geno.ref" %in% ls.gdsn(gdsSample)){ + var.geno <- index.gdsn(gdsSample, "geno.ref") + + compression.gdsn(var.geno, compress="") + append.gdsn(var.geno, g, check=TRUE) + compression.gdsn(var.geno, compress="LZMA_RA.fast") + readmode.gdsn(var.geno) + + }else{ + var.geno <- add.gdsn(gdsSample, "geno.ref", valdim=c(length(g), 1), + g, storage="bit2", compress = "LZMA_RA.fast") + readmode.gdsn(var.geno) + } + + rm(g) + closefn.gds(gdsfile=gdsSample) + + if (verbose) { message("End ", profileName, " ", Sys.time()) } + + ## Success + return(0L) +} + #' @title Add information related to a specific study and specific samples #' into a GDS Sample file @@ -574,30 +900,29 @@ generateGDS1KGgenotypeFromSNPPileup <- function(pathGeno, #' ## Required library #' library(gdsfmt) #' -#' ## Create a temporary GDS file in an test directory -#' dataDir <- system.file("extdata/tests", package="RAIDS") -#' gdsFilePath <- file.path(dataDir, "GDS_TEMP_11.gds") +#' ## Create a temporary GDS file in an current directory +#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP_11.gds") #' #' ## Create and open the GDS file #' tmpGDS <- createfn.gds(filename=gdsFilePath) #' #' ## Create a PED data frame with sample information #' ped1KG <- data.frame(Name.ID=c("1KG_sample_01", "1KG_sample_02"), -#' Case.ID=c("1KG_sample_01", "1KG_sample_02"), -#' Sample.Type=rep("Reference", 2), Diagnosis=rep("Reference", 2), -#' Source=rep("IGSR", 2), stringsAsFactors=FALSE) +#' Case.ID=c("1KG_sample_01", "1KG_sample_02"), +#' Sample.Type=rep("Reference", 2), Diagnosis=rep("Reference", 2), +#' Source=rep("IGSR", 2), stringsAsFactors=FALSE) #' #' ## Create a Study data frame with information about the study #' ## All samples are associated to the same study #' studyInfo <- data.frame(study.id="Ref.1KG", -#' study.desc="Unrelated samples from 1000 Genomes", -#' study.platform="GRCh38 1000 genotypes", -#' stringsAsFactors=FALSE) +#' study.desc="Unrelated samples from 1000 Genomes", +#' study.platform="GRCh38 1000 genotypes", +#' stringsAsFactors=FALSE) #' #' ## Add the sample information to the GDS Sample file #' ## The information for all samples is added (listSamples=NULL) #' RAIDS:::addStudyGDSSample(gdsProfile=tmpGDS, pedProfile=ped1KG, batch=1, -#' listSamples=NULL, studyDF=studyInfo, verbose=FALSE) +#' listSamples=NULL, studyDF=studyInfo, verbose=FALSE) #' #' ## Read study information from GDS Sample file #' read.gdsn(index.gdsn(node=tmpGDS, path="study.list")) @@ -611,6 +936,7 @@ generateGDS1KGgenotypeFromSNPPileup <- function(pathGeno, #' ## Delete the temporary GDS file #' unlink(x=gdsFilePath, force=TRUE) #' +#' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt index.gdsn append.gdsn #' @encoding UTF-8 @@ -621,8 +947,14 @@ addStudyGDSSample <- function(gdsProfile, pedProfile, batch, listSamples, ## Used only the selected samples (all when listSamples == NULL) if(!(is.null(listSamples))) { if(length(listSamples) == length(intersect(listSamples, - rownames(pedProfile)))) { - pedProfile <- pedProfile[listSamples,] + pedProfile$Name.ID))) { + # if we remove the names we should manage the listSamples order + # something like + + tmp <- order(as.character(listSamples)) + pedProfile <- pedProfile[which(pedProfile$Name.ID %in% listSamples), ] + pedProfile <- pedProfile[order(pedProfile$Name.ID), ][order(tmp),] + } else { stop("List of samples includes samples not present in ", "the \'pedProfile\' data frame. The sample names must be ", @@ -721,7 +1053,7 @@ addStudyGDSSample <- function(gdsProfile, pedProfile, batch, listSamples, #' the process in the \code{\link[SNPRelate]{snpgdsIBDKING}}() function. #' #' @return a \code{list} containing: -#' \itemize{ +#' \describe{ #' \item{sample.id}{a \code{character} string representing the sample #' ids used in the analysis} #' \item{snp.id}{a \code{character} string representing the SNP ids @@ -763,7 +1095,7 @@ runIBDKING <- function(gds, profileID=NULL, snpID=NULL, maf=0.05, verbose) { ## Calculate IBD coefficients by KING method of moment ibd.robust <- snpgdsIBDKING(gdsobj=gds, sample.id=profileID, - snp.id=snpID, maf=maf, + snp.id=snpID, maf=maf, missing.rate=0.01, type="KING-robust", verbose=verbose) @@ -822,6 +1154,9 @@ runIBDKING <- function(gds, profileID=NULL, snpID=NULL, maf=0.05, verbose) { #' #' @examples #' +#' ## Required +#' library(SNPRelate) +#' #' ## Open an example dataset (HapMap) #' genoFile <- snpgdsOpen(snpgdsExampleFileName()) #' @@ -894,40 +1229,41 @@ runLDPruning <- function(gds, method, #' library(gdsfmt) #' #' ## Create a temporary GDS file in an test directory -#' dataDir <- system.file("extdata/tests", package="RAIDS") -#' gdsFilePath <- file.path(dataDir, "GDS_TEMP_03.gds") +#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP_03.gds") #' #' ## Create and open the GDS file #' tmpGDS <- createfn.gds(filename=gdsFilePath) #' #' ## Create "sample.id" node (the node must be present) #' add.gdsn(node=tmpGDS, name="sample.id", val=c("sample_01", -#' "sample_02")) +#' "sample_02")) #' #' ## Create "sample.annot" node (the node must be present) #' add.gdsn(node=tmpGDS, name="sample.annot", val=data.frame( -#' Name.ID=c("sample_01", "sample_02"), -#' sex=c(1,1), # 1:Male 2: Female -#' pop.group=c("ACB", "ACB"), -#' superPop=c("AFR", "AFR"), -#' batch=c(1, 1), -#' stringsAsFactors=FALSE)) +#' Name.ID=c("sample_01", "sample_02"), +#' sex=c(1,1), # 1:Male 2: Female +#' pop.group=c("ACB", "ACB"), +#' superPop=c("AFR", "AFR"), +#' batch=c(1, 1), +#' stringsAsFactors=FALSE)) #' #' sync.gds(gdsfile=tmpGDS) #' #' ## Create a data.frame with information about samples -#' sample_info <- data.frame(Name.ID=c("sample_04", "sample_05", "sample_06"), -#' sex=c(1,2,1), # 1:Male 2: Female -#' pop.group=c("ACB", "ACB", "ACB"), -#' superPop=c("AFR", "AFR", "AFR"), -#' stringsAsFactors=FALSE) +#' sample_info <- data.frame(Name.ID=c("sample_04", "sample_05", +#' "sample_06"), +#' sex=c(1,2,1), # 1:Male 2: Female +#' pop.group=c("ACB", "ACB", "ACB"), +#' superPop=c("AFR", "AFR", "AFR"), +#' stringsAsFactors=FALSE) #' #' ## The row names must be the sample identifiers #' rownames(sample_info) <- sample_info$Name.ID #' #' ## Add information about 2 samples to the GDS file -#' RAIDS:::appendGDSRefSample(gdsReference=tmpGDS, dfPedReference=sample_info, -#' batch=2, listSamples=c("sample_04", "sample_06"), verbose=FALSE) +#' RAIDS:::appendGDSRefSample(gdsReference=tmpGDS, +#' dfPedReference=sample_info, +#' batch=2, listSamples=c("sample_04", "sample_06"), verbose=FALSE) #' #' ## Read sample identifier list #' ## Only "sample_04" and "sample_06" should have been added @@ -943,6 +1279,7 @@ runLDPruning <- function(gds, method, #' ## Delete the temporary GDS file #' unlink(x=gdsFilePath, force=TRUE) #' +#' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt index.gdsn append.gdsn #' @encoding UTF-8 @@ -1006,8 +1343,7 @@ appendGDSRefSample <- function(gdsReference, dfPedReference, batch=1, #' library(gdsfmt) #' #' ## Create a temporary GDS file in an test directory -#' dataDir <- system.file("extdata/tests", package="RAIDS") -#' gdsFilePath <- file.path(dataDir, "GDS_TEMP_1.gds") +#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP_1.gds") #' #' ## Create and open the GDS file #' tmpGDS <- createfn.gds(filename=gdsFilePath) @@ -1027,6 +1363,7 @@ appendGDSRefSample <- function(gdsReference, dfPedReference, batch=1, #' ## Delete the temporary GDS file #' unlink(x=gdsFilePath, force=TRUE) #' +#' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt add.gdsn index.gdsn delete.gdsn sync.gds ls.gdsn #' @encoding UTF-8 @@ -1073,9 +1410,8 @@ addGDSStudyPruning <- function(gdsProfile, pruned) { #' ## Required library #' library(gdsfmt) #' -#' ## Create a temporary GDS file in an test directory -#' dataDir <- system.file("extdata/tests", package="RAIDS") -#' gdsFilePath <- file.path(dataDir, "GDS_TEMP.gds") +#' ## Create a temporary GDS file +#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP.gds") #' #' ## Create and open the GDS file #' gdsFile <- createfn.gds(filename=gdsFilePath) @@ -1099,6 +1435,7 @@ addGDSStudyPruning <- function(gdsProfile, pruned) { #' ## Delete the temporary GDS file #' unlink(x=gdsFilePath, force=TRUE) #' +#' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt add.gdsn index.gdsn delete.gdsn sync.gds ls.gdsn #' @encoding UTF-8 @@ -1111,3 +1448,297 @@ addUpdateLap <- function(gdsProfile, snpLap) { return(0L) } + + +#' @title Extract the block identifiers for a list of SNVs +#' +#' @description The function uses the GDS Reference Annotation file to extract +#' the unique block identifiers for a list of SNVs. The block type that is +#' going to be used to extract the information has to be provided by the +#' user. +#' +#' @param gdsRefAnnot an object of class \code{\link[gdsfmt]{gds.class}} +#' (a GDS file), the opened Reference SNV Annotation GDS file. +#' +#' @param snpIndex a \code{vectcor} of \code{integer} representing the +#' indexes of the SNVs of interest. +#' +#' @param blockTypeID a \code{character} string corresponding to the block +#' type used to extract the block identifiers. The block type must be +#' present in the GDS Reference Annotation file. +#' +#' @return a \code{vector} of \code{numeric} corresponding to the +#' block identifiers for the SNVs of interest. +#' +#' @examples +#' +#' # Required library +#' library(gdsfmt) +#' +#' ## Path to the demo 1KG Annotation GDS file located in this package +#' dataDir <- system.file("extdata", package="RAIDS") +#' +#' path1KG <- file.path(dataDir, "tests") +#' fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") +#' +#' gdsRefAnnotation <- openfn.gds(fileAnnotGDS) +#' +#' ## The indexes for the SNVs of interest +#' snpIndex <- c(1,3,5,6,9) +#' +#' ## Extract the block identifiers for the SNVs represented by their indexes +#' ## for the block created using the genes from Hsapiens Ensembl v86 +#' RAIDS:::getBlockIDs(gdsRefAnnot=gdsRefAnnotation, snpIndex=snpIndex, +#' blockTypeID="GeneS.Ensembl.Hsapiens.v86") +#' +#' closefn.gds(gdsRefAnnotation) +#' +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' @importFrom gdsfmt index.gdsn read.gdsn +#' @encoding UTF-8 +#' @keywords internal +getBlockIDs <- function(gdsRefAnnot, snpIndex, blockTypeID) { + + block.annot <- read.gdsn(index.gdsn(gdsRefAnnot, "block.annot")) + pos <- which(block.annot$block.id == blockTypeID) + + if(length(pos) != 1) { + stop("The following block type is not found in the ", + "GDS Annotation file: \'", blockTypeID, "\'") + } + + b <- read.gdsn(index.gdsn(gdsRefAnnot, "block"), start=c(1, pos), + count = c(-1, 1))[snpIndex] + + return(b) +} + + +#' @title Add information related to segments associated to the SNV +#' dataset for a specific sample into a GDS file +#' +#' @description The function adds the information related to segments +#' associated to the SNV dataset for a specific sample into a +#' GDS file, more specifically, in the "segment" node. If the "segment" node +#' already exists, the previous information is erased. +#' +#' @param gdsProfile an object of class \code{\link[gdsfmt]{gds.class}} +#' (a GDS file), a GDS Sample file. +#' +#' @param snpSeg a \code{vector} of \code{integer} representing the segment +#' identifiers associated to each SNV selected for the specific sample. The +#' length of the \code{vector} should correspond to the number of SNVs +#' present in the "snp.id" entry of the GDS sample file. +#' +#' @return The integer \code{0L} when successful. +#' +#' @examples +#' +#' ## Required library +#' library(gdsfmt) +#' +#' ## Temporary GDS file +#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP.gds") +#' +#' ## Create and open the GDS file +#' GDS_file_tmp <- createfn.gds(filename=gdsFilePath) +#' +#' ## Vector of segment identifiers +#' segments <- c(1L, 1L, 1L, 2L, 2L, 3L, 3L) +#' +#' ## Add segments to the GDS file +#' RAIDS:::addUpdateSegment(gdsProfile=GDS_file_tmp, snpSeg=segments) +#' +#' ## Read segments information from GDS file +#' read.gdsn(index.gdsn(node=GDS_file_tmp, path="segment")) +#' +#' ## Close GDS file +#' closefn.gds(gdsfile=GDS_file_tmp) +#' +#' ## Delete the temporary GDS file +#' unlink(x=gdsFilePath, force=TRUE) +#' +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' @importFrom gdsfmt add.gdsn index.gdsn delete.gdsn sync.gds ls.gdsn +#' @encoding UTF-8 +#' @keywords internal +addUpdateSegment <- function(gdsProfile, snpSeg) { + + if("segment" %in% ls.gdsn(gdsProfile)) { + snpLap <- write.gdsn(index.gdsn(gdsProfile, "segment"), snpSeg) + } else{ + snpLap <- add.gdsn(gdsProfile, "segment", snpSeg, storage="uint32") + } + + sync.gds(gdsProfile) + + ## Successful + return(0L) +} + + +#' @title Append sample names into a GDS file +#' +#' @description This function append the sample identifiers into the +#' "samples.id" node of a GDS file. +#' +#' @param gds an object of class +#' \link[gdsfmt]{gds.class} (a GDS file), the opened GDS file. +#' +#' @param listSamples a \code{vector} of \code{character} string representing +#' the sample identifiers to be added to GDS file. +#' +#' +#' @return The integer \code{0L} when successful. +#' +#' @examples +#' +#' ## Required library +#' library(gdsfmt) +#' +#' ## Temporary GDS file in current directory +#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP_04.gds") +#' +#' ## Create and open the GDS file +#' GDS_file_tmp <- createfn.gds(filename=gdsFilePath) +#' +#' ## Create "sample.id" node (the node must be present) +#' add.gdsn(node=GDS_file_tmp, name="sample.id", val=c("sample_01", +#' "sample_02")) +#' +#' sync.gds(gdsfile=GDS_file_tmp) +#' +#' ## Add information about 2 samples to the GDS file +#' RAIDS:::appendGDSSampleOnly(gds=GDS_file_tmp, +#' listSamples=c("sample_03", "sample_04")) +#' +#' ## Read sample identifier list +#' ## Only "sample_03" and "sample_04" should have been added +#' read.gdsn(index.gdsn(node=GDS_file_tmp, path="sample.id")) +#' +#' ## Close GDS file +#' closefn.gds(gdsfile=GDS_file_tmp) +#' +#' ## Delete the temporary GDS file +#' unlink(x=gdsFilePath, force=TRUE) +#' +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' @importFrom gdsfmt index.gdsn append.gdsn +#' @encoding UTF-8 +#' @keywords internal +appendGDSSampleOnly <- function(gds, listSamples) { + + sampleGDS <- index.gdsn(gds, "sample.id") + + append.gdsn(sampleGDS, val=listSamples, check=TRUE) + + return(0L) +} + +#' @title Append information associated to ld blocks, as indexes, into the +#' Population Reference SNV Annotation GDS file +#' +#' @description The function appends the information about the ld blocks into +#' the Population Reference SNV Annotation GDS file. The information is +#' extracted from the parameter listBlock. +#' +#' @param gds an object of class \link[gdsfmt]{gds.class} +#' (GDS file), an opened Reference Annotation GDS file. +#' +#' @param listBlock a \code{array} of integer +#' representing the linkage disequilibrium block for +#' each SNV in the in the same order than the variant +#' in Population reference dataset. +#' +#' @param blockName a \code{character} string representing the id of the block. +#' The blockName should not exist in \'gdsRefAnnotFile\'. +#' +#' @param blockDesc a \code{character} string representing the description of +#' the block. +#' +#' @return The integer \code{0L} when successful. +#' +#' @examples +#' +#' ## Required library for GDS +#' library(gdsfmt) +#' ## Path to the demo pedigree file is located in this package +#' dataDir <- system.file("extdata", package="RAIDS") +#' +# ## Temporary file +#' fileAnnotGDS <- file.path(tempdir(), "ex1_good_small_1KG_Ann_GDS.gds") +#' +#' +#' file.copy(file.path(dataDir, "tests", +#' "ex1_NoBlockGene.1KG_Annot_GDS.gds"), fileAnnotGDS) +#' +#' +#' fileReferenceGDS <- file.path(dataDir, "tests", +#' "ex1_good_small_1KG.gds") +#' \donttest{ +#' gdsRef <- openfn.gds(fileReferenceGDS) +#' listBlock <- read.gdsn(index.gdsn(gdsRef, "snp.position")) +#' listBlock <- rep(-1, length(listBlock)) +#' closefn.gds(gdsRef) +#' gdsAnnot1KG <- openfn.gds(fileAnnotGDS, readonly=FALSE) +#' ## Append information associated to blocks +#' RAIDS:::addGDS1KGLDBlock(gds=gdsAnnot1KG, +#' listBlock=listBlock, +#' blockName="blockEmpty", +#' blockDesc="Example") +#' +#' closefn.gds(gdsAnnot1KG) +#' +#' gdsAnnot1KG <- openfn.gds(fileAnnotGDS) +#' print(gdsAnnot1KG) +#' +#' closefn.gds(gdsAnnot1KG) +#' } +#' +#' ## Remove temporary file +#' unlink(fileAnnotGDS, force=TRUE) +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' @importFrom gdsfmt add.gdsn index.gdsn ls.gdsn compression.gdsn +#' @importFrom gdsfmt append.gdsn sync.gds +#' @encoding UTF-8 +#' @keywords internal +addGDS1KGLDBlock <- function(gds, listBlock, blockName, blockDesc) { + + blockAnnot <- data.frame(block.id=blockName, + block.desc=blockDesc, + stringsAsFactors=FALSE) + + if(! ("block.annot" %in% ls.gdsn(gds))) { + varBlockAnnot <- add.gdsn(gds, "block.annot", blockAnnot) + }else { + curAnnot <- index.gdsn(gds, "block.annot/block.id") + append.gdsn(curAnnot,blockAnnot$block.id) + curAnnot <- index.gdsn(gds, "block.annot/block.desc") + append.gdsn(curAnnot, blockAnnot$block.desc) + } + + varBlock <- NULL + if(! ("block" %in% ls.gdsn(gds))){ + varBlock <- add.gdsn(gds, "block", + valdim=c(length(listBlock), 1), + listBlock, storage="int32", + compress = "LZ4_RA") + readmode.gdsn(varBlock) + + }else { + if(is.null(varBlock)) { + varBlock <- index.gdsn(gds, "block") + varBlock <- compression.gdsn(varBlock, "") + } + append.gdsn(varBlock, listBlock) + varBlock <- compression.gdsn(varBlock, "LZ4_RA") + } + + sync.gds(gds) + + return(0L) +} diff --git a/R/process1KG.R b/R/process1KG.R index eb7742ae7..a88864044 100644 --- a/R/process1KG.R +++ b/R/process1KG.R @@ -1,6 +1,6 @@ -#' @title Prepare the pedigree file using pedigree information from 1KG +#' @title Prepare the pedigree file using pedigree information from Reference #' -#' @description Using the pedigree file from 1KG, this function extracts +#' @description Using the pedigree file from Reference, this function extracts #' needed information and formats it into a \code{data.frame} so in can #' be used in following steps of the ancestry inference process. The #' function also requires that the genotyping files associated to each @@ -8,32 +8,33 @@ #' #' @param filePed a \code{character} string representing the path and #' file name of the pedigree file (PED file) that contains the information -#' related to the profiles present in the 1KG GDS file. The PED file must +#' related to the profiles present in the Reference GDS file. The PED file must #' exist. #' #' @param pathGeno a \code{character} string representing the path where -#' the 1KG genotyping files for each profile are located. Only the profiles -#' with associated genotyping files are retained in the creation of the final -#' \code{data.frame}. The name of the genotyping files must correspond to -#' the individual identification (Individual.ID) in the pedigree file -#' (PED file). +#' the Reference genotyping files for each profile are located. Only the +#' profiles with associated genotyping files are retained in the creation of +#' the final \code{data.frame}. The name of the genotyping files must +#' correspond to the individual identification (Individual.ID) in the +#' pedigree file (PED file). #' Default: \code{"./data/sampleGeno"}. #' #' @param batch a\code{integer} that uniquely identifies the source of the -#' pedigree information. The 1KG is usually \code{0L}. Default: \code{0L}. +#' pedigree information. The Reference is usually \code{0L}. +#' Default: \code{0L}. #' #' @return a \code{data.frame} containing the needed pedigree information -#' from 1KG. The \code{data.frame} contains those columns: -#' \itemize{ -#' \item{sample.id}{a \code{character} string representing the profile unique +#' from Reference. The \code{data.frame} contains those columns: +#' \describe{ +#' \item{sample.id}{ a \code{character} string representing the profile unique #' ID.} -#' \item{Name.ID}{a \code{character} string representing the profile name.} +#' \item{Name.ID}{ a \code{character} string representing the profile name.} #' \item{sex}{a \code{character} string representing the sex of the profile.} -#' \item{pop.group}{a \code{character} string representing the +#' \item{pop.group}{ a \code{character} string representing the #' sub-continental ancestry of the profile.} -#' \item{superPop }{a \code{character} string representing the continental +#' \item{superPop }{ a \code{character} string representing the continental #' ancestry of the profile.} -#' \item{superPop }{a \code{integer} representing the batch of the profile.} +#' \item{superPop }{ a \code{integer} representing the batch of the profile.} #' } #' #' @examples @@ -41,12 +42,15 @@ #' ## Path to the demo pedigree file is located in this package #' dataDir <- system.file("extdata", package="RAIDS") #' +#' ## Path where the demo genotype CSV files are located +#' pathGeno <- file.path(dataDir, "demoProfileGenotypes") +#' #' ## Demo pedigree file #' pedDemoFile <- file.path(dataDir, "PedigreeDemo.ped") #' #' ## Create a data.frame containing the information of the retained #' ## samples (samples with existing genotyping files) -#' prepPed1KG(filePed=pedDemoFile, pathGeno=dataDir, batch=0L) +#' prepPed1KG(filePed=pedDemoFile, pathGeno=pathGeno, batch=0L) #' #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz @@ -59,7 +63,7 @@ prepPed1KG <- function(filePed, pathGeno=file.path("data", "sampleGeno"), ## Validate parameters validatePrepPed1KG(filePed=filePed, pathGeno=pathGeno, batch=batch) - ## Read the pedigree file from 1KG + ## Read the pedigree file from Reference ped1KG <- read.delim(filePed) ## Create a data.frame containing the needed information @@ -71,8 +75,8 @@ prepPed1KG <- function(filePed, pathGeno=file.path("data", "sampleGeno"), stringsAsFactors=FALSE) ## Create a list with all populations associated to each super-population - ## TODO The population versus super.population is hard-coded - ## TODO change to parameters + ## NOTE The population versus super.population is hard-coded + ## NOTE change to parameters listSuperPop1000G <- list() listSuperPop1000G[['EAS']] <- c("CHB", "JPT", "CHS", "CDX", "KHV") listSuperPop1000G[['EUR']] <- c("CEU", "TSI", "FIN", "GBR", "IBS") @@ -92,11 +96,11 @@ prepPed1KG <- function(filePed, pathGeno=file.path("data", "sampleGeno"), row.names(pedAll) <- pedAll$sample.id ## Change column format for Sex information - ## TODO: could be done when the data.frame is created + ## NOTE: could be done when the data.frame is created pedAll$sex <- as.character(pedAll$sex) - ## Only retained samples with existing genotyping file - listMat1k <- dir(pathGeno, pattern = ".+.csv.bz2") + ## Only retained samples with existing genotyping files + listMat1k <- dir(pathGeno, pattern=".+.csv.bz2") listSample1k <- gsub(".csv.bz2", "", listMat1k) pedAll <- pedAll[listSample1k, ] @@ -117,8 +121,8 @@ prepPed1KG <- function(filePed, pathGeno=file.path("data", "sampleGeno"), #' for the frequency in at least one super population. Default: \code{0.01}. #' #' @param fileSNV a \code{character} string representing the path and -#' file name of the bulk SNP information file from 1KG. The file must be in -#' text format. The file must exist. +#' file name of the bulk SNP information file from Reference. The file must +#' be in text format. The file must exist. #' #' @param fileSNPsRDS a \code{character} string representing the path and #' file name of the RDS file that will contain the indexes of the retained @@ -134,30 +138,27 @@ prepPed1KG <- function(filePed, pathGeno=file.path("data", "sampleGeno"), #' #' The filtered SNP information RDS file (parameter \code{fileFREQ}), contains #' a \code{data.frame} with those columns: -#' \itemize{ -#' \item{CHROM} {a \code{character} string representing the chromosome where +#' \describe{ +#' \item{CHROM}{ a \code{character} string representing the chromosome where #' the SNV is located.} -#' \item{POS} {a \code{character} string representing the SNV position on the +#' \item{POS}{ a \code{character} string representing the SNV position on the #' chromosome.} -#' \item{REF} {a \code{character} string representing the reference DNA base +#' \item{REF}{ a \code{character} string representing the reference DNA base #' for the SNV.} -#' \item{ALT} {a \code{character} string representing the alternative DNA base +#' \item{ALT}{ a \code{character} string representing the alternative DNA base #' for the SNV.}\ -#' \item{EAS_AF} {a \code{character} string representing the allele frequency +#' \item{EAS_AF}{ a \code{character} string representing the allele frequency #' of the EAS super population.} -#' \item{AFR_AF} {a \code{character} string representing the allele frequency +#' \item{AFR_AF}{ a \code{character} string representing the allele frequency #' of the AFR super population.} -#' \item{AMR_AF} {a \code{character} string representing the allele frequency +#' \item{AMR_AF}{ a \code{character} string representing the allele frequency #' of the AMR super population.} -#' \item{SAS_AF} {a \code{character} string representing the allele frequency +#' \item{SAS_AF}{ a \code{character} string representing the allele frequency #' of the SAS super population.} #' } #' #' @examples #' -#' ## Needed package -#' library(withr) -#' #' ## Path to the demo pedigree file is located in this package #' dataDir <- system.file("extdata", package="RAIDS") #' @@ -167,16 +168,17 @@ prepPed1KG <- function(filePed, pathGeno=file.path("data", "sampleGeno"), #' ## Temporary output files #' ## The first file contains the indexes of the retained SNPs #' ## The second file contains the filtered SNP information -#' snpIndexFile <- local_file(file.path(dataDir, "listSNP_TEMP.rds")) -#' filterSNVFile <- local_file(file.path(dataDir, "mapSNVSel_TEMP.rds")) +#' snpIndexFile <- file.path(tempdir(), "listSNP_TEMP.rds") +#' filterSNVFile <- file.path(tempdir(), "mapSNVSel_TEMP.rds") #' #' ## Create a data.frame containing the information of the retained #' ## samples (samples with existing genotyping files) #' generateMapSnvSel(cutOff=0.01, fileSNV=snvFile, -#' fileSNPsRDS=snpIndexFile, fileFREQ=filterSNVFile) +#' fileSNPsRDS=snpIndexFile, fileFREQ=filterSNVFile) #' #' ## Remove temporary files -#' deferred_run() +#' unlink(snpIndexFile, force=TRUE) +#' unlink(filterSNVFile, force=TRUE) #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom S4Vectors isSingleNumber @@ -195,8 +197,8 @@ generateMapSnvSel <- function(cutOff=0.01, fileSNV, fileSNPsRDS, fileFREQ) { } ## Read the bulk SNP information file - mapSNVSel <- read.csv2(fileSNV) - + # mapSNVSel <- read.csv2(fileSNV) + mapSNVSel <- read.csv(fileSNV) ## Identify SNPs that have a frequency equal or superior to the cut-off ## in at least one super population listSNP <- which(rowSums(mapSNVSel[,c("EAS_AF", @@ -223,12 +225,12 @@ generateMapSnvSel <- function(cutOff=0.01, fileSNV, fileSNPsRDS, fileFREQ) { } -#' @title Generate the GDS file that will contain the information from 1KG -#' data set (reference data set) +#' @title Generate the GDS file that will contain the information from +#' Reference data set (reference data set) #' #' @description This function generates the GDS file that will contain the -#' information from 1KG. The function also add the samples information, the -#' SNP information and the genotyping information into the GDS file. +#' information from Reference. The function also add the samples information, +#' the SNP information and the genotyping information into the GDS file. #' #' @param pathGeno a \code{character} string representing the path where #' the 1K genotyping files for each sample are located. The name of the @@ -272,14 +274,14 @@ generateMapSnvSel <- function(cutOff=0.01, fileSNV, fileSNPsRDS, fileFREQ) { #' #' @examples #' -#' ## Required package -#' library(withr) -#' #' ## Path to the demo pedigree file is located in this package #' dataDir <- system.file("extdata", package="RAIDS") #' +#' ## Path to the CSV genoytype files +#' pathGeno <- file.path(dataDir, "demoProfileGenotypes") +#' #' ## The RDS file containing the pedigree information -#' pedigreeFile <- file.path(dataDir, "PedigreeDemo.rds") +#' pedigreeFile <- file.path(dataDir, "PedigreeReferenceDemo.rds") #' #' ## The RDS file containing the indexes of the retained SNPs #' snpIndexFile <- file.path(dataDir, "listSNPIndexes_Demo.rds") @@ -287,16 +289,17 @@ generateMapSnvSel <- function(cutOff=0.01, fileSNV, fileSNPsRDS, fileFREQ) { #' ## The RDS file containing the filtered SNP information #' filterSNVFile <- file.path(dataDir, "mapSNVSelected_Demo.rds") #' -#' ## Temporary GDS file containing 1KG information -#' gdsFile <- local_file(file.path(dataDir, "1KG_TEMP.gds")) +#' ## Temporary Reference GDS file +#' tempRefGDS <- file.path(tempdir(), "1KG_TEMP.gds") #' -#' ## Create a temporary GDS file containing information from 1KG -#' generateGDS1KG(pathGeno=dataDir, filePedRDS=pedigreeFile, -#' fileSNVIndex=snpIndexFile, fileSNVSelected=filterSNVFile, -#' fileNameGDS=gdsFile, listSamples=NULL) +#' ## Create a temporary Reference GDS file +#' generateGDS1KG(pathGeno=pathGeno, filePedRDS=pedigreeFile, +#' fileSNVIndex=snpIndexFile, fileSNVSelected=filterSNVFile, +#' fileNameGDS=tempRefGDS, listSamples=NULL) #' #' ## Remove temporary files -#' deferred_run() +#' unlink(tempRefGDS, force=TRUE) +#' #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @@ -328,7 +331,7 @@ generateGDS1KG <- function(pathGeno=file.path("data", "sampleGeno"), dfPedReference=ped1KG, listSamples=listSamples) if(verbose) { message("Sample info DONE ", Sys.time()) } - generateGDSSNPinfo(gdsReference=newGDS, fileFREQ=fileSNVSelected, + generateGDSSNPinfo(gdsReference=newGDS, fileFreq=fileSNVSelected, verbose=verbose) if(verbose) { message("SNP info DONE ", Sys.time()) } @@ -359,25 +362,28 @@ generateGDS1KG <- function(pathGeno=file.path("data", "sampleGeno"), #' the individual identification (Individual.ID) in the pedigree file. #' Default: \code{"./data/sampleGeno"}. #' -#' @param fileSNPsRDS a \code{character} string representing the path and file +#' @param fileSNVIndex a \code{character} string representing the path and file #' name of the RDS file that contains the indexes of the retained SNPs. The #' file must exist. The file must be a RDS file. #' #' @param verbose a \code{logicial} indicating if the function should #' print messages when running. Default: \code{FALSE}. #' -#' @return The function returns \code{0L} when succesful. +#' @return The function returns \code{0L} when successful. #' #' @examples #' #' ## Required package -#' library(withr) +#' library(gdsfmt) #' #' ## Path to the demo pedigree file is located in this package #' dataDir <- system.file("extdata", package="RAIDS") #' +#' ## Path where the demo genotype CSV files are located +#' pathGeno <- file.path(dataDir, "demoProfileGenotypes") +#' #' ## The RDS file containing the pedigree information -#' pedigreeFile <- file.path(dataDir, "PedigreeDemo.rds") +#' pedigreeFile <- file.path(dataDir, "PedigreeReferenceDemo.rds") #' #' ## The RDS file containing the indexes of the retained SNPs #' snpIndexFile <- file.path(dataDir, "listSNPIndexes_Demo.rds") @@ -385,16 +391,16 @@ generateGDS1KG <- function(pathGeno=file.path("data", "sampleGeno"), #' ## The RDS file containing the filtered SNP information #' filterSNVFile <- file.path(dataDir, "mapSNVSelected_Demo.rds") #' -#' ## Temporary Reference GDS file containing 1KG information -#' fileReferenceGDS <- local_file(file.path(dataDir, "1KG_TEMP_02.gds")) +#' ## Temporary Reference GDS file containing reference information +#' fileReferenceGDS <- file.path(tempdir(), "1KG_TEMP_02.gds") #' #' ## Create a temporary Reference GDS file containing information from 1KG -#' generateGDS1KG(pathGeno=dataDir, filePedRDS=pedigreeFile, -#' fileSNVIndex=snpIndexFile, fileSNVSelected=filterSNVFile, -#' fileNameGDS=fileReferenceGDS, listSamples=NULL) +#' generateGDS1KG(pathGeno=pathGeno, filePedRDS=pedigreeFile, +#' fileSNVIndex=snpIndexFile, fileSNVSelected=filterSNVFile, +#' fileNameGDS=fileReferenceGDS, listSamples=NULL) #' #' ## Temporary Phase GDS file that will contain the 1KG Phase information -#' fileRefPhaseGDS <- local_file(file.path(dataDir, "1KG_TEMP_Phase_02.gds")) +#' fileRefPhaseGDS <- file.path(tempdir(), "1KG_TEMP_Phase_02.gds") #' #' ## Create Reference Phase GDS file #' gdsPhase <- createfn.gds(fileRefPhaseGDS) @@ -402,33 +408,36 @@ generateGDS1KG <- function(pathGeno=file.path("data", "sampleGeno"), #' ## Open Reference GDS file #' gdsRef <- openfn.gds(fileReferenceGDS) #' -#' \dontrun{ #' ## Fill temporary Reference Phase GDS file -#' generatePhase1KG2GDS(gdsReference=gdsRef, gdsReferencePhase=gdsPhase, -#' pathGeno=dataDir, fileSNPsRDS=filterSNVFile, verbose=FALSE) +#' if (FALSE) { +#' generatePhase1KG2GDS(gdsReference=gdsRef, +#' gdsReferencePhase=gdsPhase, +#' pathGeno=pathGeno, fileSNVIndex=snpIndexFile, +#' verbose=FALSE) #' } #' -#' ## Close 1KG Phase information file +#' ## Close Reference Phase information file #' closefn.gds(gdsPhase) #' #' ## Close Reference information file #' closefn.gds(gdsRef) #' #' ## Remove temporary files -#' deferred_run() +#' unlink(fileReferenceGDS, force=TRUE) +#' unlink(fileRefPhaseGDS, force=TRUE) #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt index.gdsn read.gdsn readmode.gdsn #' @encoding UTF-8 #' @export generatePhase1KG2GDS <- function(gdsReference, gdsReferencePhase, - pathGeno, fileSNPsRDS, verbose=FALSE) { + pathGeno, fileSNVIndex, verbose=FALSE) { ## The verbose parameter must be a logical validateLogical(logical=verbose, "verbose") sample.id <- read.gdsn(index.gdsn(gdsReference, "sample.id")) - listSNP <- readRDS(fileSNPsRDS) + listSNP <- readRDS(fileSNVIndex) var.phase <- NULL for(i in seq_len(length(sample.id))){ @@ -438,6 +447,7 @@ generatePhase1KG2GDS <- function(gdsReference, gdsReferencePhase, file1KG <- file.path(pathGeno, paste0(sample.id[i],".csv.bz2")) matSample <- read.csv2( file1KG, row.names = NULL)[listSNP,, drop=FALSE] + matSample <- matrix(as.numeric(unlist(strsplit(matSample[, 1], "\\|"))), nrow=2)[1,] @@ -462,16 +472,116 @@ generatePhase1KG2GDS <- function(gdsReference, gdsReferencePhase, return(0L) } -#' @title Identify genetically unrelated patients in GDS 1KG file +#' @title Adding the phase information into the Reference GDS file +#' +#' @description The function is adding the phase information into the +#' Reference Phase GDS file. The phase information is extracted from a Reference +#' GDS file and is added into a Reference Phase GDS file. An entry called +#' 'phase' is added to the Reference Phase GDS file. +#' +#' @param fileReferenceGDS a \code{character} string representing the file +#' name of the Reference GDS file. The file must exist. +#' +#' @param fileReferenceAnnotGDS a \code{character} string representing the +#' file name of the Population Reference GDS Annotation file. The file +#' must exist. +#' +#' @param pathGeno a \code{character} string representing the path where +#' the 1K genotyping files for each sample are located. The name of the +#' genotyping files must correspond to +#' the individual identification (Individual.ID) in the pedigree file. +#' Default: \code{"./data/sampleGeno"}. +#' +#' @param fileSNVIndex a \code{character} string representing the path and file +#' name of the RDS file that contains the indexes of the retained SNPs. The +#' file must exist. The file must be a RDS file. +#' +#' @param verbose a \code{logicial} indicating if the function should +#' print messages when running. Default: \code{FALSE}. +#' +#' @return The function returns \code{0L} when successful. +#' +#' @examples +#' +#' ## Path to the demo pedigree file is located in this package +#' dataDir <- system.file("extdata", package="RAIDS") +#' +#' ## Path where the demo genotype CSV files are located +#' pathGeno <- file.path(dataDir, "demoProfileGenotypes") +#' +#' ## The RDS file containing the pedigree information +#' pedigreeFile <- file.path(dataDir, "PedigreeReferenceDemo.rds") +#' +#' ## The RDS file containing the indexes of the retained SNPs +#' snpIndexFile <- file.path(dataDir, "listSNPIndexes_Demo.rds") +#' +#' ## The RDS file containing the filtered SNP information +#' filterSNVFile <- file.path(dataDir, "mapSNVSelected_Demo.rds") +#' +#' ## Temporary Reference GDS file containing reference information +#' fileReferenceGDS <- file.path(tempdir(), "1KG_TEMP_02.gds") +#' +#' ## Create a temporary Reference GDS file containing information from 1KG +#' generateGDS1KG(pathGeno=pathGeno, filePedRDS=pedigreeFile, +#' fileSNVIndex=snpIndexFile, fileSNVSelected=filterSNVFile, +#' fileNameGDS=fileReferenceGDS, listSamples=NULL) +#' +#' ## Temporary Phase GDS file that will contain the 1KG Phase information +#' fileRefPhaseGDS <- file.path(tempdir(), "1KG_TEMP_Phase_02.gds") +#' +#' +#' ## Fill temporary Reference Phase GDS file +#' if (FALSE) { +#' generatePhaseRef(fileReferenceGDS=fileReferenceGDS, +#' fileReferenceAnnotGDS=fileRefPhaseGDS, +#' pathGeno=pathGeno, fileSNVIndex=snpIndexFile, +#' verbose=FALSE) +#' } +#' +#' +#' ## Remove temporary files +#' unlink(fileReferenceGDS, force=TRUE) +#' unlink(fileRefPhaseGDS, force=TRUE) +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' @importFrom gdsfmt createfn.gds closefn.gds index.gdsn read.gdsn readmode.gdsn +#' @importFrom SNPRelate snpgdsOpen +#' @encoding UTF-8 +#' @export +generatePhaseRef <- function(fileReferenceGDS, fileReferenceAnnotGDS, + pathGeno, fileSNVIndex, verbose=FALSE) { + + if (!(is.character(fileReferenceGDS) && (file.exists(fileReferenceGDS)))) { + stop("The \'fileReferenceGDS\' must be a character string ", + "representing the Reference GDS file. The file must exist.") + } + if (!(is.character(fileReferenceAnnotGDS) && (! file.exists(fileReferenceAnnotGDS)))) { + stop("The \'fileReferenceAnnotGDS\' must be a character string ", + "representing the Reference annotation GDS file. The file must not exist.") + } + + gdsReference <- snpgdsOpen(filename=fileReferenceGDS) + gdsReferencePhase <- createfn.gds(fileReferenceAnnotGDS) + + res <- generatePhase1KG2GDS(gdsReference, gdsReferencePhase, + pathGeno, fileSNVIndex, verbose=FALSE) + closefn.gds(gdsReference) + closefn.gds(gdsReferencePhase) + + + return(res) +} + +#' @title Identify genetically unrelated patients in GDS Reference file #' #' @description The function identify patients that are genetically related in -#' the 1KG file. It generates a first RDS file with the list of unrelated +#' the Reference file. It generates a first RDS file with the list of unrelated #' patient. It also generates a second RDS file with the kinship coefficient #' between the patients. #' #' @param gds an object of class #' \code{\link[SNPRelate:SNPGDSFileClass]{SNPRelate::SNPGDSFileClass}}, the -#' 1KG GDS file. +#' Reference GDS file. #' #' @param maf a single \code{numeric} representing the threshold for the minor #' allele frequency. Only the SNPs with ">= maf" will be used. @@ -488,7 +598,7 @@ generatePhase1KG2GDS <- function(gdsReference, gdsReferencePhase, #' #' @param filePart a \code{character} string representing the path and file #' name of the RDS file that will be created. The RDS file will contain the -#' information about the 1KG patients that are unrelated. The file will +#' information about the Reference patients that are unrelated. The file will #' contains two lists: the \code{list} of related samples, called \code{rels} #' and the list of unrelated samples, called \code{unrels}. #' The extension of the file must be '.rds'. @@ -497,34 +607,55 @@ generatePhase1KG2GDS <- function(gdsReference, gdsReferencePhase, #' #' @examples #' -#' ## Needed packages -#' library(withr) +#' ## Required package #' library(gdsfmt) #' #' ## Path to the demo pedigree file is located in this package #' dataDir <- system.file("extdata", package="RAIDS") #' -#' ## Open existing 1K GDS file -#' fileGDS <- file.path(dataDir, "1KG_Demo.gds") +#' ## Open existing demo Reference GDS file +#' fileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") #' tmpGDS <- snpgdsOpen(fileGDS) #' #' ## Temporary output files #' ## The first RDS file will contain the list of unrelated patients #' ## The second RDS file will contain the kinship information between patients -#' patientTmpFile <- local_file(file.path(dataDir, -#' "unrelatedPatients_TEMP.rds")) -#' ibdTmpFile <- local_file(file.path(dataDir,"ibd_TEMP.rds")) +#' patientTmpFile <- "unrelatedPatients_TEMP.rds" +#' ibdTmpFile <- "ibd_TEMP.rds" #' -#' ## Identify unrelated patients in 1KG GDS file -#' identifyRelative(gds=tmpGDS, maf=0.05, thresh=2^(-11/2), -#' fileIBD=ibdTmpFile, filePart=patientTmpFile) +#' ## Different code depending of the withr package availability +#' if (requireNamespace("withr", quietly=TRUE)) { #' -#' ## Close 1K GDS file -#' closefn.gds(tmpGDS) +#' ## Temporary output files +#' ## The first RDS file will contain the list of unrelated patients +#' ## The second RDS file will contain the kinship information +#' ## between patients +#' patientTmpFileLocal <- withr::local_file(patientTmpFile) +#' ibdTmpFileLocal <- withr::local_file(ibdTmpFile) #' -#' ## Remove temporary files -#' deferred_run() +#' ## Identify unrelated patients in demo Reference GDS file +#' identifyRelative(gds=tmpGDS, maf=0.05, thresh=2^(-11/2), +#' fileIBD=ibdTmpFileLocal, filePart=patientTmpFileLocal) #' +#' ## Close demo Reference GDS file +#' closefn.gds(tmpGDS) +#' +#' ## Remove temporary files +#' withr::deferred_run() +#' +#' } else { +#' +#' ## Identify unrelated patients in demo Reference GDS file +#' identifyRelative(gds=tmpGDS, maf=0.05, thresh=2^(-11/2), +#' fileIBD=ibdTmpFile, filePart=patientTmpFile) +#' +#' ## Close demo Reference GDS file +#' closefn.gds(tmpGDS) +#' +#' ## Remove temporary files +#' unlink(patientTmpFile, force=TRUE) +#' unlink(ibdTmpFile, force=TRUE) +#' } #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @@ -565,25 +696,127 @@ identifyRelative <- function(gds, maf=0.05, thresh=2^(-11/2), saveRDS(part, filePart) } +#' @title Identify genetically unrelated patients in GDS Reference file +#' +#' @description The function identify patients that are genetically related in +#' the Reference file. It generates a first RDS file with the list of unrelated +#' patient. It also generates a second RDS file with the kinship coefficient +#' between the patients. +#' +#' @param fileReferenceGDS a \code{character} string representing the file +#' name of the Reference GDS file. The file must exist. +#' +#' @param maf a single \code{numeric} representing the threshold for the minor +#' allele frequency. Only the SNPs with ">= maf" will be used. +#' Default: \code{0.05}. +#' +#' @param thresh a single \code{numeric} representing the threshold value used +#' to decide if a pair of individuals is ancestrally divergent. +#' Default: \code{2^(-11/2)}. +#' +#' @param fileIBD a \code{character} string representing the path and file +#' name of the RDS file that will be created. The RDS file will contain the +#' kinship coefficient between the patients. +#' The extension of the file must be '.rds'. +#' +#' @param filePart a \code{character} string representing the path and file +#' name of the RDS file that will be created. The RDS file will contain the +#' information about the Reference patients that are unrelated. The file will +#' contains two lists: the \code{list} of related samples, called \code{rels} +#' and the list of unrelated samples, called \code{unrels}. +#' The extension of the file must be '.rds'. +#' +#' @return \code{NULL} invisibly. +#' +#' @examples +#' +#' +#' ## Path to the demo pedigree file is located in this package +#' dataDir <- system.file("extdata", package="RAIDS") +#' +#' ## Open existing demo Reference GDS file +#' fileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") +#' +#' +#' ## Temporary output files +#' ## The first RDS file will contain the list of unrelated patients +#' ## The second RDS file will contain the kinship information between patients +#' patientTmpFile <- "unrelatedPatients_TEMP.rds" +#' ibdTmpFile <- "ibd_TEMP.rds" +#' +#' ## Different code depending of the withr package availability +#' if (requireNamespace("withr", quietly=TRUE)) { +#' +#' ## Temporary output files +#' ## The first RDS file will contain the list of unrelated patients +#' ## The second RDS file will contain the kinship information +#' ## between patients +#' patientTmpFileLocal <- withr::local_file(patientTmpFile) +#' ibdTmpFileLocal <- withr::local_file(ibdTmpFile) +#' +#' ## Identify unrelated patients in demo Reference GDS file +#' identifyRelativeRef(fileReferenceGDS=fileGDS, maf=0.05, thresh=2^(-11/2), +#' fileIBD=ibdTmpFileLocal, filePart=patientTmpFileLocal) +#' +#' ## Remove temporary files +#' withr::deferred_run() +#' +#' } else { +#' +#' ## Identify unrelated patients in demo Reference GDS file +#' identifyRelativeRef(fileReferenceGDS=fileGDS, maf=0.05, thresh=2^(-11/2), +#' fileIBD=ibdTmpFile, filePart=patientTmpFile) +#' +#' ## Remove temporary files +#' unlink(patientTmpFile, force=TRUE) +#' unlink(ibdTmpFile, force=TRUE) +#' } +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' +#' @importFrom GENESIS pcairPartition +#' @importFrom gdsfmt closefn.gds +#' @importFrom SNPRelate snpgdsOpen +#' @importFrom S4Vectors isSingleNumber +#' @importFrom methods is +#' @encoding UTF-8 +#' @export +identifyRelativeRef <- function(fileReferenceGDS, maf=0.05, thresh=2^(-11/2), + fileIBD, filePart) { + + if (!(is.character(fileReferenceGDS) && (file.exists(fileReferenceGDS)))) { + stop("The \'fileReferenceGDS\' must be a character string ", + "representing the Reference GDS file. The file must exist.") + } + + gdsReference <- snpgdsOpen(filename=fileReferenceGDS) -#' @title Add the information about the unrelated patients to the 1KG GDS file + + identifyRelative(gdsReference, maf=maf, thresh=thresh, + fileIBD, filePart) + closefn.gds(gdsReference) + +} + +#' @title Add the information about the unrelated patients to the Reference +#' GDS file #' #' @description This function adds the information about the unrelated patients -#' to the 1KG GDS file. More specifically, it creates the field +#' to the Reference GDS file. More specifically, it creates the field #' \code{sample.ref} which as the value \code{1} when the sample #' is unrelated and the value \code{0} otherwise. #' The \code{sample.ref} is filled based on the information present in the #' input RDS file. #' #' @param fileNameGDS a \code{character} string representing the path and file -#' name of the GDS file that contains the 1KG information. The 1KG GDS file -#' must contain the SNP information, the genotyping information and -#' the pedigree information from 1000 Genomes. +#' name of the GDS file that contains the Reference information. The +#' Reference GDS file must contain the SNP information, the genotyping +#' information and the pedigree information from Reference dataset. #' The extension of the file must be '.gds'. #' #' @param filePart a \code{character} string representing the path and file #' name of the RDS file that contains the -#' information about the 1KG patients that are unrelated. +#' information about the Reference patients that are unrelated. #' The extension of the file must be '.rds'. The file must exists. #' #' @return The integer \code{0L} when successful. @@ -664,94 +897,159 @@ addRef2GDS1KG <- function(fileNameGDS, filePart) { return(0L) } - -#' @title Compute principal component axes (PCA) on SNV data using the -#' reference samples +#' @title Extract the specified column from the 1KG GDS 'sample.ref' node +#' for the reference profiles (real ancestry assignation) +#' +#' @description The function extract the specified column for the 'sample.ref' +#' node present in the Reference GDS file. The column must be present in the +#' \code{data.frame} saved in the 'sample.ref' node. Only the information for +#' the reference profiles is returned. The values +#' represent the known ancestry assignation. #' -#' @description The function runs a Principal Component Analysis (PCA) on -#' the SNv genotype data. The function also loads SNVs into the PCA to -#' calculate the SNV eigenvectors. Those 2 steps are done with the -#' \code{\link[SNPRelate]{snpgdsPCA}} and -#' \code{\link[SNPRelate]{snpgdsPCASNPLoading}} -#' functions. +#' @param gdsReference an object of class +#' \link[gdsfmt]{gds.class} (a GDS file), the opened Reference GDS file. #' -#' @param gds an object of class -#' \code{\link[SNPRelate:SNPGDSFileClass]{SNPRelate::SNPGDSFileClass}}, a SNP -#' GDS file. -#' -#' @param listSample.Ref a \code{vector} of \code{character} strings -#' corresponding to -#' the sample identifiers that will be used for the PCA. -#' -#' @param listSNP a \code{vector} of \code{character} strings representing -#' the SNV identifiers retained for the PCA. -#' -#' @param np a single positive \code{integer} representing the number of -#' threads. Default: \code{1L}. -#' -#' @return a \code{list} with 3 entries: -#' \itemize{ -#' \item{SNP}{ a \code{vector} of \code{character} strings representing the -#' SNV identifiers used in the PCA.} -#' \item{pca.unrel}{ an object of class \code{snpgdsPCAClass} as generated -#' by the -#' \code{\link[SNPRelate:snpgdsPCA]{SNPRelate::snpgdsPCA}} function. } -#' \item{snp.load}{ an object of class \code{snpgdsPCASNPLoading} as generated -#' by the -#' \code{\link[SNPRelate:snpgdsPCASNPLoading]{SNPRelate::snpgdsPCASNPLoading}} -#' function. } -#' } +#' @param popName a \code{character} string representing the name of the column +#' that will be fetched in the \code{data.frame} present in the Reference GDS +#' "sample.ref" node. The column must be present in the \code{data.frame}. +#' Default: \code{"superPop"}. +#' +#' @return \code{vector} of \code{character} strings representing the content +#' of the extracted column for the 1KG GDS 'sample.ref' node. The values +#' represent the known ancestry assignation. The profile +#' identifiers are used as names for the \code{vector}. #' #' @examples #' +#' ## Required library +#' library(gdsfmt) +#' #' ## Path to the demo pedigree file is located in this package #' dataDir <- system.file("extdata", package="RAIDS") #' -#' ## TODO +#' ## Open existing demo 1K GDS file with "sample.ref" node +#' nameFileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") +#' fileGDS <- snpgdsOpen(nameFileGDS) +#' +#' ## Extract super population information for the 1KG profiles +#' getRef1KGPop(gdsReference=fileGDS, popName="superPop") +#' +#' ## Close 1K GDS file +#' closefn.gds(fileGDS) #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @importFrom SNPRelate snpgdsPCA snpgdsPCASNPLoading +#' @importFrom gdsfmt index.gdsn read.gdsn +#' @importFrom stats rmultinom #' @encoding UTF-8 #' @export -basePCASample <- function(gds, listSample.Ref=NULL, listSNP=NULL, np=1L) { +getRef1KGPop <- function(gdsReference, popName="superPop") { + + ## The gdsReference must be an object of class "gds.class" + validateGDSClass(gds=gdsReference, "gdsReference") + + ## The popName is a character string + if (!is.character(popName)) { + stop("The \'popName\' parameter must be a single character string.") + } + + sample.ref <- read.gdsn(index.gdsn(gdsReference, "sample.ref")) + dataRef <- read.gdsn(index.gdsn(gdsReference, + "sample.annot"))[which(sample.ref == TRUE),] + + if(! popName %in% colnames(dataRef)) { + stop("The population ", popName, " is not supported ", + "(not found in the 1KG GDS file).") + } - listPCA <- list() + dataRef <- dataRef[, popName] + names(dataRef) <- read.gdsn(index.gdsn(node=gdsReference, + "sample.id"))[which(sample.ref == TRUE)] + + return(dataRef) +} - ## Save the SNV list - listPCA[["SNP"]] <- listSNP +#' @title Extract the from the 1KG GDS 'sample.ref' node +#' for the reference profiles (real ancestry assignation) +#' +#' @description The function extract the specified column for the 'sample.ref' +#' node present in the Reference GDS file. The column must be present in the +#' \code{data.frame} saved in the 'sample.ref' node. Only the information for +#' the reference profiles is returned. The values +#' represent the known ancestry assignation. +#' +#' @param fileReferenceGDS a \code{character} string representing the file +#' name of the Reference GDS file. The file must exist. +#' +#' +#' @return \code{vector} of \code{character} strings representing the content +#' of the extracted column for the 1KG GDS 'sample.ref' node. The values +#' represent the known ancestry assignation. The profile +#' identifiers are used as names for the \code{vector}. +#' +#' @examples +#' +#' +#' ## Path to the demo pedigree file is located in this package +#' dataDir <- system.file("extdata", package="RAIDS") +#' +#' ## Open existing demo 1K GDS file with "sample.ref" node +#' nameFileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") +#' +#' ## Extract super population information for the 1KG profiles +#' getRefSuperPop(fileReferenceGDS=nameFileGDS) +#' +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' @importFrom gdsfmt index.gdsn read.gdsn closefn.gds +#' @importFrom stats rmultinom +#' @importFrom SNPRelate snpgdsOpen +#' @encoding UTF-8 +#' @export +getRefSuperPop <- function(fileReferenceGDS) { - ## Calculate the PCA and save the results - listPCA[["pca.unrel"]] <- snpgdsPCA(gds, sample.id=listSample.Ref, - snp.id=listSNP, - num.thread=np, - verbose=TRUE) + ## The fileReferenceGDS must be a character string and the file must exists + if (!(is.character(fileReferenceGDS) && (file.exists(fileReferenceGDS)))) { + stop("The \'fileReferenceGDS\' must be a character string ", + "representing the Reference GDS file. The file must exist.") + } - ## Calculate the SNV eigenvectors and save the results - listPCA[["snp.load"]] <- snpgdsPCASNPLoading(listPCA[["pca.unrel"]], - gdsobj=gds, - num.thread=np, - verbose=TRUE) + gdsReference <- snpgdsOpen(filename=fileReferenceGDS) + df <- getRef1KGPop(gdsReference) + closefn.gds(gdsReference) - ## Return a list with 3 entries - return(listPCA) + return(df) } -#' @title TODO contain the information from 1KG +#' @title Append information associated to ld blocks, as indexes, into the +#' Population Reference SNV Annotation GDS file #' -#' @description TODO +#' @description The function appends the information about the ld blocks into +#' the Population Reference SNV Annotation GDS file. The information is +#' extracted from the Population Reference GDS file and files \'.det\'. #' -#' @param gds an object of class -#' \link[gdsfmt]{gds.class} (a GDS file), TODO +#' @param fileReferenceGDS a \code{character} string representing the file +#' name of the Reference GDS file. The file must exist. #' -#' @param gdsOut an object of class \code{gds} in writing +#' @param gdsRefAnnotFile a \code{character} string representing the +#' file name corresponding the Reference SNV +#' Annotation GDS file. The function will +#' open it in write mode and close it after. The file must exist. #' -#' @param PATHBLOCK TODO +#' @param pathBlock a \code{character} string representing the directory +#' where all the output file det from the plink block command are located. +#' The directory must not include other file with the extension \'.det\'. +#' The name of the \'.det\' must include the super-population between \'.\' +#' and the chromosome in the form \'chrNumber.\' \( \'chr1.\'\). #' -#' @param superPop TODO +#' @param superPop a \code{character} string representing the super population. #' -#' @param blockName TODO +#' @param blockName a \code{character} string representing the id of the block. +#' The blockName should not exist in \'gdsRefAnnotFile\'. +#' Default: \code{"ldBlock"}. #' -#' @param blockDesc TODO +#' @param blockDesc a \code{character} string representing the description of +#' the block. +#' Default: \code{"Not Define"} #' #' @param verbose a \code{logical} indicating if message information should be #' printed. Default: \code{FALSE}. @@ -766,137 +1064,274 @@ basePCASample <- function(gds, listSample.Ref=NULL, listSNP=NULL, np=1L) { #' #' @examples #' -#' # TODO +#' ## Path to the demo pedigree file is located in this package +#' dataDir <- system.file("extdata", package="RAIDS") +#' +# ## Temporary file +#' fileAnnotGDS <- file.path(tempdir(), "ex1_good_small_1KG_Ann_GDS.gds") +#' +#' ## Demo of of output file det from the plink block +#' ## command for chromosome 1 +#' fileLdBlock <- file.path(dirname(fileAnnotGDS), "block.sp.EUR.Ex.chr1.blocks.det") +#' +#' +#' file.copy(file.path(dataDir, "tests", +#' "ex1_NoBlockGene.1KG_Annot_GDS.gds"), fileAnnotGDS) +#' file.copy(file.path(dataDir, "block.sp.EUR.Ex.chr1.blocks.det"), +#' fileLdBlock) +#' +#' +#' +#' ## GDS Reference file +#' fileReferenceGDS <- file.path(dataDir, "tests", +#' "ex1_good_small_1KG.gds") +#' +#' \donttest{ +#' +#' +#' ## Append information associated to blocks +#' addBlockFromDetFile(fileReferenceGDS=fileReferenceGDS, +#' gdsRefAnnotFile=fileAnnotGDS, +#' pathBlock=dirname(fileAnnotGDS), +#' superPop="EUR") +#' +#' gdsAnnot1KG <- openfn.gds(fileAnnotGDS) +#' print(gdsAnnot1KG) +#' +#' closefn.gds(gdsAnnot1KG) +#' } +#' +#' ## Remove temporary file +#' unlink(fileAnnotGDS, force=TRUE) +#' unlink(fileLdBlock, force=TRUE) #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' -#' @importFrom gdsfmt createfn.gds put.attr.gdsn closefn.gds +#' @importFrom gdsfmt openfn.gds closefn.gds read.gdsn index.gdsn ls.gdsn +#' @importFrom SNPRelate snpgdsOpen #' @encoding UTF-8 #' @export -addBlockFromPlink2GDS <- function(gds, gdsOut, PATHBLOCK, - superPop, blockName, - blockDesc, verbose=FALSE) { +addBlockFromDetFile <- function(fileReferenceGDS, gdsRefAnnotFile, pathBlock, + superPop, blockName="ldBlock", + blockDesc="Not Define", verbose=FALSE) { + if (!(is.character(fileReferenceGDS) && (file.exists(fileReferenceGDS)))) { + stop("The \'fileReferenceGDS\' must be a character string ", + "representing the Reference GDS file. The file must exist.") + } + if(!(is.character(blockName))){ + stop("The \'blockName\' must be a character string ", + "representing the name of the block.") + } + + if(blockName == "ldBlock"){ + blockName <- paste0(blockName, ".", superPop) + } + + gdsRefAnnot <- openfn.gds(gdsRefAnnotFile) + + if(("block.annot" %in% ls.gdsn(gdsRefAnnot))) { + listAnno <- read.gdsn(index.gdsn(gdsRefAnnot, "block.annot")) + if(length(which(gdsRefAnnot$block.id == blockName)) > 0){ + stop("The \'blockName\' already exist in \'gdsRefAnnotFile\'.") + } + } + closefn.gds(gdsRefAnnot) + + gdsReference <- snpgdsOpen(filename=fileReferenceGDS) + - ## The gds must be an object of class "gds.class" - validateGDSClass(gds=gds, name="gds") ## The verbose must be a logical validateLogical(verbose, "verbose") ## Extract the SNP chromosomes and positions - snp.chromosome <- read.gdsn(index.gdsn(gds, "snp.chromosome")) - snp.position <- read.gdsn(index.gdsn(gds, "snp.position")) + snpChromosome <- read.gdsn(index.gdsn(gdsReference, "snp.chromosome")) + #snpPosition <- read.gdsn(index.gdsn(gdsReference, "snp.position")) + closefn.gds(gdsReference) - listChr <- unique(snp.chromosome) + listFileBlock <- dir(pathBlock, ".det") + listFileBlock <- listFileBlock[grep(paste0("\\.", superPop, "\\."), listFileBlock)] - listChr <- listChr[order(listChr)] - listChr <- seq_len(22) - listBlock <- list() - for(chr in listChr) { - if(verbose) { message("chr", chr, " ",Sys.time()) } + listChr <- unique(snpChromosome) - snp.keep <- snp.position[snp.chromosome == chr] + #listChr <- listChr[order(listChr)] + #listChr <- seq_len(22) + listBlock <- list() - listBlock[[chr]] <- processBlockChr(snp.keep, PATHBLOCK, superPop, chr) - if(chr > 1) { - vMax <- max(listBlock[[chr-1]]) - vMin <- min(listBlock[[chr-1]]) - listBlock[[chr]][listBlock[[chr]] > 0] <- + for(chr in seq_len(length(listChr))) { + if(verbose) { message("chr", listChr[chr], " ",Sys.time()) } + listChrCur <- listFileBlock[grep(paste0("chr",listChr[chr],"\\."), listFileBlock)] + if(length(listChrCur) == 1){ + tmp <- processBlockChr(fileReferenceGDS, file.path(pathBlock, listChrCur)) + listBlock[[chr]] <- tmp$block.snp + if(chr > 1) { + vMax <- max(listBlock[[chr-1]], 0) + vMin <- min(listBlock[[chr-1]], 0) + listBlock[[chr]][listBlock[[chr]] > 0] <- listBlock[[chr]][listBlock[[chr]] > 0] + vMax - if(vMin < 0) { - listBlock[[chr]][listBlock[[chr]] < 0] <- + if(vMin < 0) { + listBlock[[chr]][listBlock[[chr]] < 0] <- listBlock[[chr]][listBlock[[chr]] < 0] + vMin + } + } + }else{ + + listBlock[[chr]] <- rep(-1, length(which(snpChromosome == listChr[chr]))) + vMin <- 0 + if(chr > 1){ + vMin <- min(listBlock[[chr-1]]) + } + if(vMin < 0){ + listBlock[[chr]] <- listBlock[[chr]] + vMin } } + } listBlock <- do.call(c, listBlock) - ## Save the information into the GDS file - addGDS1KGLDBlock(gdsOut, listBlock, blockName, blockDesc) + gdsRefAnnot <- openfn.gds(gdsRefAnnotFile, readonly = FALSE) + ## Save the information into the GDS file + addGDS1KGLDBlock(gdsRefAnnot, listBlock, blockName, blockDesc) + closefn.gds(gdsRefAnnot) ## Success return(0L) } -#' @title Extract the specified column from the 1KG GDS 'sample.ref' node -#' for the reference profiles (real ancestry assignation) +#' @title Append information associated to blocks, as indexes, into the +#' Population Reference SNV Annotation GDS file #' -#' @description The function extract the specified column for the 'sample.ref' -#' node present in the 1KG GDS file. The column must be present in the -#' \code{data.frame} saved in the 'sample.ref' node. Only the information for -#' the reference profiles is returned. The values -#' represent the known ancestry assignation. +#' @description The function appends the information about the blocks into +#' the Population Reference SNV Annotation GDS file. The information is +#' extracted from the Population Reference GDS file. #' #' @param gdsReference an object of class -#' \link[gdsfmt]{gds.class} (a GDS file), the opened 1KG GDS file. +#' \link[gdsfmt]{gds.class} (a GDS file), the opened Reference GDS file. #' -#' @param popName a \code{character} string representing the name of the column -#' that will be fetched in the \code{data.frame} present in the 1KG GDS -#' "sample.ref" node. The column must be present in the \code{data.frame}. -#' Default: \code{"superPop"}. +#' @param gdsRefAnnotFile a \code{character} string representing the +#' file name corresponding the Reference SNV +#' Annotation GDS file. The function will +#' open it in write mode and close it after. The file must exist. #' -#' @return \code{vector} of \code{character} strings representing the content -#' of the extracted column for the 1KG GDS 'sample.ref' node. The values -#' represent the known ancestry assignation. The profile -#' identifiers are used as names for the \code{vector}. +#' @param winSize a single positive \code{integer} representing the +#' size of the window to use to group the SNVs when the SNVs are in a +#' non-coding region. Default: \code{10000L}. +#' +#' @param ensDb An object with the ensembl genome annotation +#' Default: \code{EnsDb.Hsapiens.v86}. +#' +#' @param suffixBlockName a \code{character} string that identify the source +#' of the block and that will be added to the block description into +#' the Reference SNV Annotation GDS file, as example: Ensembl.Hsapiens.v86. +#' +#' @return The integer \code{OL} when the function is successful. #' #' @examples #' +#' ## Required library +#' library(SNPRelate) +#' #' ## Path to the demo pedigree file is located in this package #' dataDir <- system.file("extdata", package="RAIDS") #' -#' ## Open existing 1K GDS file with "sample.ref" node -#' nameFileGDS <- file.path(dataDir, "1KG_Demo_with_sampleREF.gds") -#' fileGDS <- snpgdsOpen(nameFileGDS) +# ## Temporary file +#' fileAnnotGDS <- file.path(tempdir(), "ex1_good_small_1KG_Ann_GDS.gds") #' -#' ## Extract super population information for the 1KG profiles -#' getRef1KGPop(gdsReference=fileGDS, popName="superPop") +#' ## Required library +#' if (requireNamespace("EnsDb.Hsapiens.v86", quietly=TRUE)) { #' -#' ## Close 1K GDS file -#' closefn.gds(fileGDS) +#' file.copy(file.path(dataDir, "tests", +#' "ex1_NoBlockGene.1KG_Annot_GDS.gds"), fileAnnotGDS) +#' +#' ## Making a "short cut" on the ensDb object +#' edb <- EnsDb.Hsapiens.v86::EnsDb.Hsapiens.v86 +#' +#' ## GDS Reference file +#' fileReferenceGDS <- file.path(dataDir, "tests", +#' "ex1_good_small_1KG.gds") +#' +#' \donttest{ +#' ## Open the reference GDS file (demo version) +#' gds1KG <- snpgdsOpen(fileReferenceGDS) +#' +#' ## Append information associated to blocks +#' addGeneBlockGDSRefAnnot(gdsReference=gds1KG, +#' gdsRefAnnotFile=fileAnnotGDS, +#' ensDb=edb, +#' suffixBlockName="EnsDb.Hsapiens.v86") +#' +#' gdsAnnot1KG <- openfn.gds(fileAnnotGDS) +#' print(gdsAnnot1KG) +#' print(read.gdsn(index.gdsn(gdsAnnot1KG, "block.annot"))) +#' +#' ## Close GDS files +#' closefn.gds(gds1KG) +#' closefn.gds(gdsAnnot1KG) +#' } +#' +#' ## Remove temporary file +#' unlink(fileAnnotGDS, force=TRUE) +#' +#' } #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @importFrom gdsfmt index.gdsn read.gdsn -#' @importFrom stats rmultinom +#' @importFrom gdsfmt openfn.gds closefn.gds +#' @importFrom S4Vectors isSingleNumber #' @encoding UTF-8 #' @export -getRef1KGPop <- function(gdsReference, popName="superPop") { +addGeneBlockGDSRefAnnot <- function(gdsReference, gdsRefAnnotFile, + winSize=10000, ensDb, suffixBlockName) { ## The gdsReference must be an object of class "gds.class" - validateGDSClass(gds=gdsReference, "gdsReference") - - ## The popName is a character string - if (!is.character(popName)) { - stop("The \'popName\' parameter must be a single character string.") + if (!inherits(gdsReference, "gds.class")) { + stop("The \'gdsReference\' must be an object of class \'gds.class\'") } - sample.ref <- read.gdsn(index.gdsn(gdsReference, "sample.ref")) - dataRef <- read.gdsn(index.gdsn(gdsReference, - "sample.annot"))[which(sample.ref == TRUE),] + ## Validate that the Reference Annotation GDS file exists + if (! file.exists(gdsRefAnnotFile)) { + stop("The file \'", gdsRefAnnotFile, "\' does not exist.") + } - if(! popName %in% colnames(dataRef)) { - stop("The population ", popName, " is not supported ", - "(not found in the 1KG GDS file).") + ## The winSize must be a positive single number + if (!(isSingleNumber(winSize) && (winSize > 0))) { + stop("The \'winSize\' parameter must be a single numeric value." ) } - dataRef <- dataRef[, popName] - names(dataRef) <- read.gdsn(index.gdsn(node=gdsReference, - "sample.id"))[which(sample.ref == TRUE)] + ## Generate two indexes based on gene annotation for + ## the Reference GDS Annotation block + dfGeneBlock <- generateGeneBlock(gdsReference=gdsReference, + winSize=winSize, ensDb=ensDb) - return(dataRef) -} + ## Open GDS Reference Annotation file in writing mode + gdsRefAnnot <- openfn.gds(gdsRefAnnotFile, readonly=FALSE) + + blockName <- paste0("Gene.", suffixBlockName) + blockDesc <- paste0("List of blocks including overlapping genes ", + suffixBlockName) + addBlockInGDSAnnot(gdsRefAnnot, dfGeneBlock$Gene, blockName, blockDesc) + blockName <- paste0("GeneS.", suffixBlockName) + blockDesc <- paste0("List of blocks of split by genes ", suffixBlockName) + addBlockInGDSAnnot(gdsRefAnnot, dfGeneBlock$GeneS, blockName, blockDesc) + + ## Close GDS Reference annotation file + closefn.gds(gdsRefAnnot) + ## Success + return(0L) +} -#' @title Generate two indexes based on gene annotation for gdsAnnot1KG -#' block and add the indexes into the -#' gdsAnnot1KG +#' @title Append information associated to blocks, as indexes, into the +#' Population Reference SNV Annotation GDS file #' -#' @description TODO +#' @description The function appends the information about the blocks into +#' the Population Reference SNV Annotation GDS file. The information is +#' extracted from the Population Reference GDS file. #' -#' @param gdsReference an object of class -#' \link[gdsfmt]{gds.class} (a GDS file), the opened 1KG GDS file. +#' @param fileReferenceGDS a \code{character} string representing the file +#' name of the Reference GDS file. The file must exist. #' -#' @param file.gdsRefAnnot the filename corresponding the 1KG SNV +#' @param gdsRefAnnotFile a \code{character} string representing the +#' file name corresponding the Reference SNV #' Annotation GDS file. The function will #' open it in write mode and close it after. The file must exist. #' @@ -904,10 +1339,12 @@ getRef1KGPop <- function(gdsReference, popName="superPop") { #' size of the window to use to group the SNVs when the SNVs are in a #' non-coding region. Default: \code{10000L}. #' -#' @param EnsDb An object with the ensembl genome annotation +#' @param ensDb An object with the ensembl genome annotation #' Default: \code{EnsDb.Hsapiens.v86}. #' -#' @param suffixe.blockName TODO ex Ensembl.Hsapiens.v86 +#' @param suffixBlockName a \code{character} string that identify the source +#' of the block and that will be added to the block description into +#' the Reference SNV Annotation GDS file, as example: Ensembl.Hsapiens.v86. #' #' @return The integer \code{OL} when the function is successful. #' @@ -916,49 +1353,62 @@ getRef1KGPop <- function(gdsReference, popName="superPop") { #' ## Path to the demo pedigree file is located in this package #' dataDir <- system.file("extdata", package="RAIDS") #' -#' ## TODO +# ## Temporary file +#' fileAnnotGDS <- file.path(tempdir(), "ex1_good_small_1KG_Ann_GDS.gds") +#' +#' ## Required library +#' if (requireNamespace("EnsDb.Hsapiens.v86", quietly=TRUE)) { +#' +#' file.copy(file.path(dataDir, "tests", +#' "ex1_NoBlockGene.1KG_Annot_GDS.gds"), fileAnnotGDS) +#' +#' ## Making a "short cut" on the ensDb object +#' edb <- EnsDb.Hsapiens.v86::EnsDb.Hsapiens.v86 +#' +#' ## GDS Reference file +#' fileReferenceGDS <- file.path(dataDir, "tests", +#' "ex1_good_small_1KG.gds") +#' +#' \donttest{ +#' +#' +#' ## Append information associated to blocks +#' addGeneBlockRefAnnot(fileReferenceGDS=fileReferenceGDS, +#' gdsRefAnnotFile=fileAnnotGDS, +#' ensDb=edb, +#' suffixBlockName="EnsDb.Hsapiens.v86") +#' +#' gdsAnnot1KG <- openfn.gds(fileAnnotGDS) +#' print(gdsAnnot1KG) +#' print(read.gdsn(index.gdsn(gdsAnnot1KG, "block.annot"))) +#' +#' closefn.gds(gdsAnnot1KG) +#' } +#' +#' ## Remove temporary file +#' unlink(fileAnnotGDS, force=TRUE) +#' +#' } #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt openfn.gds closefn.gds +#' @importFrom SNPRelate snpgdsOpen #' @importFrom S4Vectors isSingleNumber #' @encoding UTF-8 #' @export -addGeneBlockGDSRefAnnot <- function(gdsReference, file.gdsRefAnnot, - winSize=10000, EnsDb, suffixe.blockName) { +addGeneBlockRefAnnot <- function(fileReferenceGDS, gdsRefAnnotFile, + winSize=10000, ensDb, suffixBlockName) { - ## The gdsReference must be an object of class "gds.class" - if (!inherits(gdsReference, "gds.class")) { - stop("The \'gdsReference\' must be an object of class \'gds.class\'") - } - ## Validate that the file.gdsRefAnnot GDS file exists - if (! file.exists(file.gdsRefAnnot)) { - stop("The file \'", file.gdsRefAnnot, "\' does not exist.") + if (!(is.character(fileReferenceGDS) && (file.exists(fileReferenceGDS)))) { + stop("The \'fileReferenceGDS\' must be a character string ", + "representing the Reference GDS file. The file must exist.") } - ## The winSize must be a positive single number - if (!(isSingleNumber(winSize) && (winSize > 0))) { - stop("The \'winSize\' parameter must be a single numeric value." ) - } - - ## Generate two indexes based on gene annotation for gdsAnnot1KG block - dfGeneBlock <- generateGeneBlock(gdsReference, winSize, EnsDb) - - ## Opne GDS 1KG Annotation file in writting mode - gdsRefAnnot <- openfn.gds(file.gdsRefAnnot, readonly=FALSE) - - blockName <- paste0("Gene.", suffixe.blockName) - blockDesc <- paste0("List of blocks including overlapping genes ", - suffixe.blockName) - addGDS1KGLDBlock(gdsRefAnnot, dfGeneBlock$Gene, blockName, blockDesc) - blockName <- paste0("GeneS.", suffixe.blockName) - blockDesc <- paste0("List of blocks of split by genes ", suffixe.blockName) - addGDS1KGLDBlock(gdsRefAnnot, dfGeneBlock$GeneS, blockName, blockDesc) - - ## Close GDS 1KG annotation file - closefn.gds(gdsRefAnnot) - + gdsReference <- snpgdsOpen(filename=fileReferenceGDS) + res <- addGeneBlockGDSRefAnnot(gdsReference, gdsRefAnnotFile, + winSize=10000, ensDb, suffixBlockName) + closefn.gds(gdsReference) ## Success - return(0L) + return(res) } - diff --git a/R/process1KG_internal.R b/R/process1KG_internal.R index 19568222f..95280a027 100644 --- a/R/process1KG_internal.R +++ b/R/process1KG_internal.R @@ -1,9 +1,10 @@ -#' @title Extract the pruned SNVs in a reference data set (1KG) by chromosome -#' and/or allelic frequency +#' @title Extract the pruned SNVs in a population reference data set (ex:1KG) +#' by chromosome and/or allelic frequency #' -#' @description The function extracts the pruned SNVs in a reference data -#' set (1KG) by chromosome and/or allelic frequency. The pruning is done -#' through the linkage disequilibrium analysis. +#' @description The function extracts the pruned SNVs in a population +#' reference data set (ex: 1KG) by chromosome and/or allelic frequency. +#' The pruning is done through the linkage disequilibrium analysis. The +#' pruned SNVs are saved in a RDS file. #' #' @param gdsReference an object of class #' \code{\link[SNPRelate:SNPGDSFileClass]{SNPRelate::SNPGDSFileClass}}, an @@ -45,19 +46,40 @@ #' Default: \code{NULL}. #' #' @param outPrefix a \code{character} string that represents the prefix of the -#' RDS files that will be generated. Default: \code{"pruned_1KG"}. +#' RDS file(s) that will be generated. Default: \code{"pruned_1KG"}. #' #' @param keepObj a \code{logical} specifying if the function must save the -#' the processed information into a RDS object. Default: \code{FALSE}. +#' the processed information into a second RDS file. Default: \code{FALSE}. #' #' @return The function returns \code{0L} when successful. #' #' @examples #' +#' ## Required libraries +#' library(SNPRelate) +#' library(gdsfmt) +#' #' ## Path to the demo pedigree file is located in this package #' dataDir <- system.file("extdata", package="RAIDS") #' -#' ## TODO +#' ## The 1KG Population Reference GDS demo file (opened) +#' gds1KG <- snpgdsOpen(file.path(dataDir, "PopulationReferenceDemo.gds")) +#' +#' ## The prefix of the RDS file to be created and containing the pruned SNVs +#' outPrefix <- file.path(tempdir(), "Pruned_Demo_Reference") +#' +#' ## Create a RDS file with the pruned SNVs +#' RAIDS:::pruning1KGbyChr(gdsReference=gds1KG, outPrefix=outPrefix) +#' +#' prunedSNVs <- readRDS(file.path(paste0(outPrefix, ".rds"))) +#' prunedSNVs +#' +#' ## Close 1K GDS file +#' closefn.gds(gds1KG) +#' +#' ## Delete temporary file +#' unlink(paste0(outPrefix, ".rds"), force=TRUE) +#' #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt index.gdsn read.gdsn @@ -72,25 +94,21 @@ pruning1KGbyChr <- function(gdsReference, method="corr", listSamples=NULL, fileObj <- file.path(paste0(outPrefix, "Obj.rds")) snpGDS <- index.gdsn(gdsReference, "snp.id") listKeep <- NULL - if(is.null(minAF)){ - if(!is.null(chr)){ - snpGDS <- index.gdsn(gdsReference, "snp.id") + if (is.null(minAF)) { + if (!is.null(chr)) { snpID <- read.gdsn(snpGDS) - chrGDS <- index.gdsn(gdsReference, "snp.chromosome") - snpCHR <- read.gdsn(chrGDS) + snpCHR <- read.gdsn(index.gdsn(gdsReference, "snp.chromosome")) listKeep <- snpID[which(snpCHR == chr)] } - } else{ - snpGDS <- index.gdsn(gdsReference, "snp.id") + } else { snpID <- read.gdsn(snpGDS) - afGDS <- index.gdsn(gdsReference, "snp.AF") - snpAF <- read.gdsn(afGDS) + snpAF <- read.gdsn(index.gdsn(gdsReference, "snp.AF")) - if(is.null(chr)){ + if (is.null(chr)) { listKeep <- snpID[which(snpAF >= minAF & snpAF <= 1-minAF)] - } else{ + } else { chrGDS <- index.gdsn(gdsReference, "snp.chromosome") snpCHR <- read.gdsn(chrGDS) @@ -99,6 +117,7 @@ pruning1KGbyChr <- function(gdsReference, method="corr", listSamples=NULL, } } + ## SNP pruning based on linkage disequilibrium (LD) snpset <- runLDPruning(gds=gdsReference, method=method, listSamples=listSamples, listKeep=listKeep, slideWindowMaxBP=slideWindowMaxBP, @@ -106,7 +125,7 @@ pruning1KGbyChr <- function(gdsReference, method="corr", listSamples=NULL, pruned <- unlist(snpset, use.names=FALSE) saveRDS(pruned, filePruned) - if(keepObj){ + if (keepObj) { saveRDS(snpset, fileObj) } @@ -116,7 +135,8 @@ pruning1KGbyChr <- function(gdsReference, method="corr", listSamples=NULL, #' @title Generate two indexes based on gene annotation for gdsAnnot1KG block #' -#' @description TODO +#' @description Generate two indexes based on gene annotation for +#' gdsAnnot1KG block #' #' @param gdsReference an object of class #' \link[gdsfmt]{gds.class} (a GDS file), the opened 1KG GDS file (reference). @@ -125,18 +145,23 @@ pruning1KGbyChr <- function(gdsReference, method="corr", listSamples=NULL, #' size of the window to use to group the SNVs when the SNVs are in a #' non-coding region. Default: \code{10000}. #' -#' @param EnsDb An object of class \code{EnsDb} with the Ensembl genome +#' @param ensDb An object of class \code{EnsDb} with the Ensembl genome #' annotation. By default, the \code{EnsDb.Hsapiens.v86} class has been used. #' #' @return a \code{data.frame} with those columns: -#' \itemize{ -#' \item{chr} {} -#' \item{pos} {} -#' \item{snp.allele} {} -#' \item{Exon} {} -#' \item{GName} {} -#' \item{Gene} {} -#' \item{GeneS} {} +#' \describe{ +#' \item{chr}{ a single \code{integer} representing the SNV chromosome.} +#' \item{pos}{ a single \code{integer} representing the SNV position.} +#' \item{snp.allele}{ a \code{character} string representing the reference allele +#' and alternative allele for each of the SNV} +#' \item{Exon}{ a \code{character} with the ensembl GeneId(s) if the SNV is in +#' one exon. If more than one GeneId they are separted by ':'} +#' \item{GName}{ a \code{character} with the ensembl GeneId(s) if the SNV is in +#' the gene. If more than one GeneId they are separted by ':'} +#' \item{Gene}{ a single \code{integer} specific to the SNVs that share +#' at least one genes} +#' \item{GeneS}{ a single \code{integer} specific to the SNVs that share +#' a unique combination of genes} #' } #' "chr", "pos", "snp.allele", "Exon", "GName", "Gene", "GeneS" #' Example for GName and the two indexes "Gene", "GeneS" @@ -152,10 +177,37 @@ pruning1KGbyChr <- function(gdsReference, method="corr", listSamples=NULL, #' 493 ENSG00000230021:ENSG00000228794 17 3825 #' @examples #' +#' ## Required library +#' library(SNPRelate) +#' #' ## Path to the demo pedigree file is located in this package #' dataDir <- system.file("extdata", package="RAIDS") #' -#' # TODO +#' ## Required library +#' if (requireNamespace("EnsDb.Hsapiens.v86", quietly=TRUE)) { +#' +#' ## Making a "short cut" on the ensDb object +#' edb <- EnsDb.Hsapiens.v86::EnsDb.Hsapiens.v86 +#' +#' path1KG <- file.path(dataDir, "tests") +#' +#' ## Reference GDS file +#' fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") +#' +#' \donttest{ +#' ## Open the reference GDS file (demo version) +#' gds1KG <- snpgdsOpen(fileReferenceGDS) +#' +#' ## The function returns a data.frame containing +#' ## gene block information +#' matGeneBlock <- RAIDS:::generateGeneBlock(gdsReference=gds1KG, +#' ensDb=edb) +#' print(head(matGeneBlock[grep("ENSG00000157152", +#' matGeneBlock$GName),])) +#' +#' closefn.gds(gds1KG) +#' } +#' } #' #' @author Pascal Belleau, Astrid Deschênes and Alex Krasnitz #' @importFrom S4Vectors Rle @@ -167,9 +219,9 @@ pruning1KGbyChr <- function(gdsReference, method="corr", listSamples=NULL, #' @importFrom AnnotationFilter GeneIdFilter #' @encoding UTF-8 #' @keywords internal -generateGeneBlock <- function(gdsReference, winSize=10000, EnsDb) { +generateGeneBlock <- function(gdsReference, winSize=10000, ensDb) { - edb <- EnsDb + edb <- ensDb listEnsId <- unique(names(genes(edb))) cols <- c("GENEID", "SYMBOL", "GENENAME", "GENESEQSTART", @@ -211,7 +263,7 @@ generateGeneBlock <- function(gdsReference, winSize=10000, EnsDb) { stringsAsFactors=FALSE) offsetGene <- 0 offsetGeneS <- 0 - offsetGene.O <- 0 + offsetGeneO <- 0 for(chr in seq_len(22)) { @@ -221,84 +273,84 @@ generateGeneBlock <- function(gdsReference, winSize=10000, EnsDb) { # colnames(matFreq) <- c("chr", "pos", "ref", "alt", "af", "EAS_AF", # "EUR_AF","AFR_AF", "AMR_AF", "SAS_AF") - message(system.time({ - # SNV in the GDS - matFreq <- matFreqAll[which(matFreqAll$chr == chr),] - # create two vector (one for the exon and one for the gene) of char - # with 1 entry for each SNV in the GDS - # I will keep the name of the gene and exon at this position - listSNVExons <- character(nrow(matFreq)) - listSNVGenes <- character(nrow(matFreq)) - - listPos <- seq_len(nrow(matFreq)) - listPos <- listPos[order(matFreq$pos)] - # Create an index to accelerate the process - startIndex <- seq(1, nrow(matFreq), 1000) - # Add if the last entry is not the last position - # is not the nb row of matFreq add the the last - #position - if(startIndex[length(startIndex)] < nrow(matFreq)){ - startIndex <- c(startIndex, nrow(matFreq)) - } - # For gene in the chr - # slow but acceptable - # user system elapsed - # 26.116 0.074 26.201 - # see blockAnnotation.R for slower alternatives - for (genePos in seq_len(nrow(dfGenneAllChr))) { - # the gene is where SNV exists - if (dfGenneAllChr$end[genePos] >= matFreq$pos[listPos[1]] & - dfGenneAllChr$start[genePos] <= matFreq$pos[nrow(matFreq)]) { - # In which partitions from the index the gene is located - vStart <- max(c(which(matFreq$pos[startIndex] <= - dfGenneAllChr$start[genePos]), 1)) - vEnd <- min(c(which(matFreq$pos[startIndex] >= - dfGenneAllChr$end[genePos]), - length(startIndex))) - # List of SNV in the gene - listP <- which(matFreq$pos[listPos[startIndex[vStart]:startIndex[vEnd]]] >= dfGenneAllChr$start[genePos] & - matFreq$pos[listPos[startIndex[vStart]:startIndex[vEnd]]] <= dfGenneAllChr$end[genePos]) - - # if SNV in the gene - if (length(listP) > 0) { - # listPos in the gene - listP <- - listPos[startIndex[vStart]:startIndex[vEnd]][listP] - - # Add the name of the gene of SNVs - listSNVGenes[listP] <- paste0(listSNVGenes[listP], ":", - dfGenneAllChr$mcols.GENEID[genePos]) - - # Allow run on all without check if the SNV have - # already gene name - listSNVGenes[listP] <- gsub("^:", "", - listSNVGenes[listP]) - - # Exon of the gene - dfExon <- dfExonChr[which(dfExonChr$GeneID == - dfGenneAllChr$mcols.GENEID[genePos]),] - k <- 1 - - listE <- list() - for (pos in listP) { - if(length(which(dfExon$Start <= matFreq$pos[pos] & - dfExon$End >= matFreq$pos[pos])) > 0) { - listE[[k]] <- pos - k <- k + 1 - } - } - if (length(listE) > 0) { - listE <- do.call(c, listE) - listSNVExons[listE] <- paste0(listSNVExons[listE], - ":", dfGenneAllChr$mcols.GENEID[genePos]) - listSNVExons[listE] <- gsub("^:", "", - listSNVExons[listE]) + # SNV in the GDS + matFreq <- matFreqAll[which(matFreqAll$chr == chr),] + # create two vector (one for the exon and one for the gene) of char + # with 1 entry for each SNV in the GDS + # I will keep the name of the gene and exon at this position + listSNVExons <- character(nrow(matFreq)) + listSNVGenes <- character(nrow(matFreq)) + + listPos <- seq_len(nrow(matFreq)) + listPos <- listPos[order(matFreq$pos)] + # Create an index to accelerate the process + startIndex <- seq(1, nrow(matFreq), 1000) + # Add if the last entry is not the last position + # is not the nb row of matFreq add the the last + #position + if(startIndex[length(startIndex)] < nrow(matFreq)){ + startIndex <- c(startIndex, nrow(matFreq)) + } + # For gene in the chr + # slow but acceptable + # user system elapsed + # 26.116 0.074 26.201 + # see blockAnnotation.R for slower alternatives + for (genePos in seq_len(nrow(dfGenneAllChr))) { + # the gene is where SNV exists + if (dfGenneAllChr$end[genePos] >= matFreq$pos[listPos[1]] & + dfGenneAllChr$start[genePos] <= matFreq$pos[nrow(matFreq)]) { + # In which partitions from the index the gene is located + vStart <- max(c(which(matFreq$pos[startIndex] <= + dfGenneAllChr$start[genePos]), 1)) + vEnd <- min(c(which(matFreq$pos[startIndex] >= + dfGenneAllChr$end[genePos]), + length(startIndex))) + # List of SNV in the gene + listP <- which(matFreq$pos[listPos[startIndex[vStart]:startIndex[vEnd]]] >= dfGenneAllChr$start[genePos] & + matFreq$pos[listPos[startIndex[vStart]:startIndex[vEnd]]] <= dfGenneAllChr$end[genePos]) + + # if SNV in the gene + if (length(listP) > 0) { + # listPos in the gene + listP <- + listPos[startIndex[vStart]:startIndex[vEnd]][listP] + + # Add the name of the gene of SNVs + listSNVGenes[listP] <- paste0(listSNVGenes[listP], ":", + dfGenneAllChr$mcols.GENEID[genePos]) + + # Allow run on all without check if the SNV have + # already gene name + listSNVGenes[listP] <- gsub("^:", "", + listSNVGenes[listP]) + + # Exon of the gene + dfExon <- dfExonChr[which(dfExonChr$GeneID == + dfGenneAllChr$mcols.GENEID[genePos]),] + k <- 1 + + listE <- list() + for (pos in listP) { + if(length(which(dfExon$Start <= matFreq$pos[pos] & + dfExon$End >= matFreq$pos[pos])) > 0) { + listE[[k]] <- pos + k <- k + 1 } } + + if (length(listE) > 0) { + listE <- do.call(c, listE) + listSNVExons[listE] <- paste0(listSNVExons[listE], + ":", dfGenneAllChr$mcols.GENEID[genePos]) + listSNVExons[listE] <- gsub("^:", "", + listSNVExons[listE]) + } } } - })) + } + # add the column Exon with the list of gene with an exon with the SNV @@ -353,7 +405,7 @@ generateGeneBlock <- function(gdsReference, winSize=10000, EnsDb) { matFreq$GeneS[matFreq$Gene < 0] <- 0 listOrph <- which(matFreq$GeneS == 0) flag <- TRUE - v <- offsetGene.O - 1 + v <- offsetGeneO - 1 i <- 1 curZone <- "GeneS" curZone1 <- "Gene" @@ -388,7 +440,7 @@ generateGeneBlock <- function(gdsReference, winSize=10000, EnsDb) { i <- which(listOrph == j) + 1 flag <- ifelse(i <= length(listOrph), TRUE, FALSE) } - offsetGene.O <- min(offsetGene.O, min(matFreq$Gene)) + offsetGeneO <- min(offsetGeneO, min(matFreq$Gene)) } listMat[[chr]] <- matFreq @@ -397,9 +449,9 @@ generateGeneBlock <- function(gdsReference, winSize=10000, EnsDb) { # create the space at the begining } - matGene.Block <- do.call(rbind, listMat) + matGeneBlock <- do.call(rbind, listMat) rm(listMat) - return(matGene.Block) + return(matGeneBlock) } @@ -415,14 +467,14 @@ generateGeneBlock <- function(gdsReference, winSize=10000, EnsDb) { #' exist. #' #' @param pathGeno a \code{character} string representing the path where -#' the 1KG genotyping files for each profile are located. Only the profiles -#' with associated genotyping files are retained in the creation of the final -#' \code{data.frame}. The name of the genotyping files must correspond to -#' the individual identification (Individual.ID) in the pedigree file -#' (PED file). +#' the Reference genotyping files for each profile are located. Only the +#' profiles with associated genotyping files are retained in the creation of +#' the final \code{data.frame}. The name of the genotyping files must +#' correspond to the individual identification (Individual.ID) in the +#' pedigree file (PED file). #' #' @param batch a\code{integer} that uniquely identifies the source of the -#' pedigree information. The 1KG is usually \code{0L}. +#' pedigree information. The Reference is usually \code{0L}. #' #' @return The function returns \code{0L} when successful. #' @@ -508,7 +560,7 @@ validatePrepPed1KG <- function(filePed, pathGeno, batch) { #' dataDir <- system.file("extdata", package="RAIDS") #' #' ## The RDS file containing the pedigree information -#' pedigreeFile <- file.path(dataDir, "PedigreeDemo.rds") +#' pedigreeFile <- file.path(dataDir, "PedigreeReferenceDemo.rds") #' #' ## The RDS file containing the indexes of the retained SNPs #' snpIndexFile <- file.path(dataDir, "listSNPIndexes_Demo.rds") diff --git a/R/processStudy.R b/R/processStudy.R index 71500cbb8..580ea09d8 100644 --- a/R/processStudy.R +++ b/R/processStudy.R @@ -2,16 +2,19 @@ #' using the information from a RDS Sample description file and the 1KG #' GDS file #' -#' @description The function uses the information for the 1KG GDS file and the -#' RDS Sample Description file to create the Profile GDS file. One Profile GDS -#' file is created per profile. One Profile GDS file will be created for each -#' entry present in the \code{listProfiles} parameter. +#' @description The function uses the information for the Reference GDS file +#' and the RDS Sample Description file to create the Profile GDS file. One +#' Profile GDS file is created per profile. One Profile GDS file will be +#' created for each entry present in the \code{listProfiles} parameter. #' #' @param pathGeno a \code{character} string representing the path to the #' directory containing the VCF output of SNP-pileup for each sample. The #' SNP-pileup files must be compressed (gz files) and have the name identifiers #' of the samples. A sample with "Name.ID" identifier would have an -#' associated SNP-pileup file called "Name.ID.txt.gz". +#' associated file called +#' if genoSource is "VCF", then "Name.ID.vcf.gz", +#' if genoSource is "generic", then "Name.ID.generic.txt.gz" +#' if genoSource is "snp-pileup", then "Name.ID.txt.gz". #' #' @param filePedRDS a \code{character} string representing the path to the #' RDS file that contains the information about the sample to analyse. @@ -31,7 +34,7 @@ #' can be defined. #' #' @param fileNameGDS a \code{character} string representing the file name of -#' the 1KG GDS file. The file must exist. +#' the Reference GDS file. The file must exist. #' #' @param batch a single positive \code{integer} representing the current #' identifier for the batch. Beware, this field is not stored anymore. @@ -54,13 +57,15 @@ #' Default: \code{NULL}. #' #' @param genoSource a \code{character} string with two possible values: -#' 'snp-pileup' or 'generic'. It specifies if the genotype files +#' 'snp-pileup', 'generic' or 'VCF'. It specifies if the genotype files #' are generated by snp-pileup (Facets) or are a generic format CSV file #' with at least those columns: #' 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. #' The 'Count' is the depth at the specified position; #' 'FileR' is the depth of the reference allele and #' 'File1A' is the depth of the specific alternative allele. +#' Finally the file can be a VCF file with at least those genotype +#' fields: GT, AD, DP. #' #' @param verbose a \code{logical} indicating if message information should be #' printed. Default: \code{FALSE}. @@ -71,7 +76,7 @@ #' #' ## Path to the demo 1KG GDS file is located in this package #' dataDir <- system.file("extdata/tests", package="RAIDS") -#' fileGDS <- file.path(dataDir, "ex1_good_small_1KG_GDS.gds") +#' fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") #' #' ## The data.frame containing the information about the study #' ## The 3 mandatory columns: "study.id", "study.desc", "study.platform" @@ -90,13 +95,13 @@ #' Source=rep("Databank B", 2), stringsAsFactors=FALSE) #' rownames(samplePED) <- samplePED$Name.ID #' -#' ## Create the Profile GDS File for sample in listSamples vector +#' ## Create the Profile GDS File for samples in 'listSamples' vector #' ## (in this case, samples "ex1") #' ## The Profile GDS file is created in the pathProfileGDS directory #' result <- createStudy2GDS1KG(pathGeno=dataDir, #' pedStudy=samplePED, fileNameGDS=fileGDS, #' studyDF=studyDF, listProfiles=c("ex1"), -#' pathProfileGDS=dataDir, +#' pathProfileGDS=tempdir(), #' genoSource="snp-pileup", #' verbose=FALSE) #' @@ -105,10 +110,11 @@ #' #' ## The Profile GDS file 'ex1.gds' has been created in the #' ## specified directory -#' list.files(dataDir) +#' list.files(tempdir()) +#' +#' ## Remove Profile GDS file (created for demo purpose) +#' unlink(file.path(tempdir(), "ex1.gds"), force=TRUE) #' -#' ## Unlink Profile GDS file (created for demo purpose) -#' unlink(file.path(dataDir, "ex1.gds")) #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt createfn.gds put.attr.gdsn closefn.gds read.gdsn @@ -120,7 +126,7 @@ createStudy2GDS1KG <- function(pathGeno=file.path("data", "sampleGeno"), filePedRDS=NULL, pedStudy=NULL, fileNameGDS, batch=1, studyDF, listProfiles=NULL, pathProfileGDS=NULL, - genoSource=c("snp-pileup", "generic"), + genoSource=c("snp-pileup", "generic", "VCF"), verbose=FALSE) { ## When filePedRDS is defined and pedStudy is null @@ -148,7 +154,7 @@ createStudy2GDS1KG <- function(pathGeno=file.path("data", "sampleGeno"), genoSource <- arg_match(genoSource) - ## Read the 1KG GDS file + ## Read the Reference GDS file gdsReference <- snpgdsOpen(filename=fileNameGDS) ## Extract the chromosome and position information for all SNPs in 1KG GDS @@ -181,142 +187,10 @@ createStudy2GDS1KG <- function(pathGeno=file.path("data", "sampleGeno"), return(0L) } -#' @title Create the GDS Sample file(s) for one or multiple specific samples -#' using the information from a RDS Sample description file and the 1KG -#' GDS file -#' -#' @description The function uses the information for the 1KG GDS file and the -#' RDS Sample Description file to create the GDS Sample file. One GDS Sample -#' file is created per sample. One GDS Sample file will be created for each -#' entry present in the \code{listSamples} parameter. -#' -#' @param pathGeno a \code{character} string representing the path to the -#' directory containing the output of SNP-pileup, a VCF Sample file, for -#' each sample. The -#' SNP-pileup files must be compressed (gz files) and have the name identifiers -#' of the samples. A sample with "Name.ID" identifier would have an -#' associated SNP-pileup file called "Name.ID.txt.gz". -#' -#' @param filePedRDS a \code{character} string representing the path to the -#' RDS file that contains the information about the sample to analyse. -#' The RDS file must -#' include a \code{data.frame} with those mandatory columns: "Name.ID", -#' "Case.ID", "Sample.Type", "Diagnosis", "Source". All columns must be in -#' \code{character} strings. The \code{data.frame} -#' must contain the information for all the samples passed in the -#' \code{listSamples} parameter. -#' -#' @param fileNameGDS a \code{character} string representing the file name of -#' the 1KG GDS file. The file must exist. -#' -#' @param batch a single positive \code{integer} representing the current -#' identifier for the batch. Beware, this field is not stored anymore. -#' Default: \code{1}. -#' -#' @param studyDF a \code{data.frame} containing the information about the -#' study associated to the analysed sample(s). The \code{data.frame} must have -#' those 3 columns: "study.id", "study.desc", "study.platform". All columns -#' must be in \code{character} strings. -#' -#' @param listSamples a \code{vector} of \code{character} string corresponding -#' to the sample identifiers that will have a GDS Sample file created. The -#' sample identifiers must be present in the "Name.ID" column of the RDS file -#' passed to the \code{filePedRDS} parameter. -#' If \code{NULL}, all samples in the \code{filePedRDS} are selected. -#' Default: \code{NULL}. -#' -#' @param pathProfileGDS a \code{character} string representing the path to -#' the directory where the GDS Sample files will be created. -#' Default: \code{NULL}. -#' -#' @param genoSource a \code{character} string with two possible values: -#' 'snp-pileup' or 'generic'. It specifies if the genotype files -#' are generated by snp-pileup (Facets) or are a generic format CSV file -#' with at least those columns: -#' 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. -#' The 'Count' is the depth at the specified position; -#' 'FileR' is the depth of the reference allele and -#' 'File1A' is the depth of the specific alternative allele. -#' -#' @param verbose a \code{logical} indicating if message information should be -#' printed. Default: \code{TRUE}. -#' -#' @return The function returns \code{0L} when successful. -#' -#' @examples -#' -#' ## Path to the demo pedigree file is located in this package -#' dataDir <- system.file("extdata", package="RAIDS") -#' -#' ## The data.frame containing the information about the study -#' ## The 3 mandatory columns: "study.id", "study.desc", "study.platform" -#' ## The entries should be strings, not factors (stringsAsFactors=FALSE) -#' studyInfo <- data.frame(study.id="Pancreatic.WES", -#' study.desc="Pancreatic study", -#' study.platform="WES", -#' stringsAsFactors=FALSE) -#' -#' ## TODO -#' filePedRDS <- "TODO" -#' -#' -#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @importFrom gdsfmt createfn.gds put.attr.gdsn closefn.gds read.gdsn -#' @importFrom rlang arg_match -#' @encoding UTF-8 -#' @export -appendStudy2GDS1KG <- function(pathGeno=file.path("data", "sampleGeno"), - filePedRDS, fileNameGDS, batch=1, studyDF, - listSamples=NULL, pathProfileGDS=NULL, - genoSource=c("snp-pileup", "generic"), verbose=TRUE) { - - ## Validate inputs - validateAppendStudy2GDS1KG(pathGeno=pathGeno, filePedRDS=filePedRDS, - fileNameGDS=fileNameGDS, batch=batch, studyDF=studyDF, - listSamples=listSamples, pathProfileGDS=pathProfileGDS, - genoSource=genoSource, verbose=verbose) - - genoSource <- arg_match(genoSource) - - ## Open the RDS Sample information file - pedStudy <- readRDS(file=filePedRDS) - - ## Read the 1KG GDS file - gdsReference <- snpgdsOpen(filename=fileNameGDS) - - ## Extract the chromosome and position information for all SNPs in 1KG GDS - ## Create a data.frame containing the information - snpCHR <- index.gdsn(node=gdsReference, "snp.chromosome") - snpPOS <- index.gdsn(node=gdsReference, "snp.position") - - listPos <- data.frame(snp.chromosome=read.gdsn(snpCHR), - snp.position=read.gdsn(snpPOS)) - - if (verbose) { - message("Start ", Sys.time()) - message("Sample info DONE ", Sys.time()) - } - - generateGDS1KGgenotypeFromSNPPileup(pathGeno=pathGeno, - listSamples=listSamples, listPos=listPos, offset=-1, - minCov=10, minProb=0.999, seqError=0.001, dfPedProfile=pedStudy, - batch=batch, studyDF=studyDF, pathProfileGDS=pathProfileGDS, - genoSource=genoSource, verbose=verbose) - - if (verbose) { - message("Genotype DONE ", Sys.time()) - } - - ## Close 1KG GDS file - closefn.gds(gdsReference) - - ## Return successful code - return(0L) -} - #' @title Compute the list of pruned SNVs for a specific profile using the -#' information from the 1KG GDS file and a linkage disequilibrium analysis +#' information from the Reference GDS file and a linkage disequilibrium +#' analysis #' #' @description This function computes the list of pruned SNVs for a #' specific profile. When @@ -400,9 +274,9 @@ appendStudy2GDS1KG <- function(pathGeno=file.path("data", "sampleGeno"), #' ## Required library for GDS #' library(gdsfmt) #' -#' ## Path to the demo 1KG GDS file is located in this package +#' ## Path to the demo Reference GDS file is located in this package #' dataDir <- system.file("extdata/tests", package="RAIDS") -#' fileGDS <- file.path(dataDir, "ex1_good_small_1KG_GDS.gds") +#' fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") #' #' ## The data.frame containing the information about the study #' ## The 3 mandatory columns: "study.id", "study.desc", "study.platform" @@ -421,14 +295,11 @@ appendStudy2GDS1KG <- function(pathGeno=file.path("data", "sampleGeno"), #' Source = rep("Databank B", 2), stringsAsFactors = FALSE) #' rownames(samplePED) <- samplePED$Name.ID #' +#' ## Temporary Profile GDS file +#' profileFile <- file.path(tempdir(), "ex1.gds") +#' #' ## Copy the Profile GDS file demo that has not been pruned yet -#' ## into a test directory (deleted after the example has been run) -#' dataDirPruning <- file.path(system.file("extdata", package="RAIDS"), -#' "demoPruning") -#' dir.create(dataDirPruning, showWarnings=FALSE, -#' recursive=FALSE, mode="0777") -#' file.copy(file.path(dataDir, "ex1_demo.gds"), -#' file.path(dataDirPruning, "ex1.gds")) +#' file.copy(file.path(dataDir, "ex1_demo.gds"), profileFile) #' #' ## Open 1KG file #' gds1KG <- snpgdsOpen(fileGDS) @@ -436,22 +307,22 @@ appendStudy2GDS1KG <- function(pathGeno=file.path("data", "sampleGeno"), #' ## Compute the list of pruned SNVs for a specific profile 'ex1' #' ## and save it in the Profile GDS file 'ex1.gds' #' pruningSample(gdsReference=gds1KG, currentProfile=c("ex1"), -#' studyID = studyDF$study.id, pathProfileGDS=dataDirPruning) +#' studyID = studyDF$study.id, pathProfileGDS=tempdir()) #' -#' ## Close the 1KG GDS file (it is important to always close the GDS files) +#' ## Close the Reference GDS file (important) #' closefn.gds(gds1KG) #' #' ## Check content of Profile GDS file #' ## The 'pruned.study' entry should be present -#' content <- openfn.gds(file.path(dataDirPruning, "ex1.gds")) +#' content <- openfn.gds(profileFile) #' content #' -#' ## Close the Profile GDS file (it is important to always close the GDS files) +#' ## Close the Profile GDS file (important) #' closefn.gds(content) #' -#' ## Unlink Profile GDS file (created for demo purpose) -#' unlink(file.path(dataDirPruning, "ex1.gds")) -#' unlink(dataDirPruning) +#' ## Remove Profile GDS file (created for demo purpose) +#' unlink(profileFile, force=TRUE) +#' #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt index.gdsn read.gdsn @@ -525,13 +396,13 @@ pruningSample <- function(gdsReference, listKeepPos <- listGeno ## Select SNVs based on the chromosome - if(!is.null(chr)) { + if (!is.null(chr)) { snpCHR <- read.gdsn(index.gdsn(gdsReference, "snp.chromosome")) listKeepPos <- intersect(which(snpCHR == chr), listKeepPos) } ## Select SNVs based on the minimum allele frequency in the populations - if(!is.null(superPopMinAF)) { + if (!is.null(superPopMinAF)) { listTMP <- NULL for(sp in c("EAS", "EUR", "AFR", "AMR", "SAS")) { snpAF <- read.gdsn(index.gdsn(gdsReference, @@ -542,7 +413,7 @@ pruningSample <- function(gdsReference, listKeepPos <- intersect(listTMP, listKeepPos) } - if(length(listKeepPos) == 0) { + if (length(listKeepPos) == 0) { stop("In pruningSample, the sample ", currentProfile, " doesn't have SNPs after filters\n") } @@ -559,13 +430,13 @@ pruningSample <- function(gdsReference, pruned <- unlist(snpset, use.names=FALSE) ## When TRUE, generate 2 RDS file with the pruned SNVs information - if(keepFile) { + if (keepFile) { saveRDS(pruned, filePruned) saveRDS(snpset, fileObj) } ## When TRUE, add the pruned SNvs information to the Profile GDS file - if(keepPrunedGDS) { + if (keepPrunedGDS) { gdsSample <- openfn.gds(filename=fileGDSSample, readonly=FALSE) addGDSStudyPruning(gdsProfile=gdsSample, pruned=pruned) closefn.gds(gdsfile=gdsSample) @@ -580,7 +451,9 @@ pruningSample <- function(gdsReference, #' #' @description The function extracts the information about the pruned SNVs #' from the 1KG GDS file and adds entries related to the pruned SNVs in -#' the Profile GDS file. +#' the Profile GDS file. The nodes are added to the Profile GDS file: +#' 'sample.id', 'snp.id', 'snp.chromosome', 'snp.position', 'snp.index', +#' 'genotype' and 'lap'. #' #' @param gdsReference an object of class #' \link[gdsfmt]{gds.class} (a GDS file), the opened 1KG GDS file. @@ -599,11 +472,11 @@ pruningSample <- function(gdsReference, #' @examples #' #' ## Required library for GDS -#' library(gdsfmt) +#' library(SNPRelate) #' #' ## Path to the demo 1KG GDS file is located in this package #' dataDir <- system.file("extdata/tests", package="RAIDS") -#' fileGDS <- file.path(dataDir, "ex1_good_small_1KG_GDS.gds") +#' fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") #' #' ## The data.frame containing the information about the study #' ## The 3 mandatory columns: "studyID", "study.desc", "study.platform" @@ -613,39 +486,37 @@ pruningSample <- function(gdsReference, #' study.platform="PLATFORM", #' stringsAsFactors=FALSE) #' -#' ## Copy the Profile GDS file demo that has been pruned -#' ## into a test directory (deleted after the example has been run) -#' dataDirGenotyping <- file.path(system.file("extdata", package="RAIDS"), -#' "demoAddGenotype") -#' dir.create(dataDirGenotyping, showWarnings=FALSE, -#' recursive=FALSE, mode="0777") +#' ## Temporary Profile file +#' fileProfile <- file.path(tempdir(), "ex2.gds") +#' +#' ## Copy required file #' file.copy(file.path(dataDir, "ex1_demo_with_pruning.gds"), -#' file.path(dataDirGenotyping, "ex1.gds")) +#' fileProfile) #' #' ## Open 1KG file #' gds1KG <- snpgdsOpen(fileGDS) #' #' ## Compute the list of pruned SNVs for a specific profile 'ex1' -#' ## and save it in the Profile GDS file 'ex1.gds' +#' ## and save it in the Profile GDS file 'ex2.gds' #' add1KG2SampleGDS(gdsReference=gds1KG, -#' fileProfileGDS=file.path(dataDirGenotyping, "ex1.gds"), -#' currentProfile=c("ex1"), -#' studyID=studyDF$study.id) +#' fileProfileGDS=fileProfile, +#' currentProfile=c("ex1"), +#' studyID=studyDF$study.id) #' -#' ## Close the 1KG GDS file (it is important to always close the GDS files) +#' ## Close the 1KG GDS file (important) #' closefn.gds(gds1KG) #' #' ## Check content of Profile GDS file #' ## The 'pruned.study' entry should be present -#' content <- openfn.gds(file.path(dataDirGenotyping, "ex1.gds")) +#' content <- openfn.gds(fileProfile) #' content #' -#' ## Close the Profile GDS file (it is important to always close the GDS files) +#' ## Close the Profile GDS file (important) #' closefn.gds(content) #' -#' ## Unlink Profile GDS file (created for demo purpose) -#' unlink(file.path(dataDirGenotyping, "ex1.gds")) -#' unlink(dataDirGenotyping) +#' ## Remove Profile GDS file (created for demo purpose) +#' unlink(fileProfile, force=TRUE) +#' #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt index.gdsn read.gdsn objdesp.gdsn @@ -663,7 +534,7 @@ add1KG2SampleGDS <- function(gdsReference, fileProfileGDS, currentProfile, gdsSample <- openfn.gds(fileProfileGDS, readonly=FALSE) ## Extract needed information from 1KG GDS file - snp.id <- read.gdsn(index.gdsn(gdsReference,"snp.id")) + snp.id <- read.gdsn(index.gdsn(gdsReference, "snp.id")) ## Extract list of pruned SNVs from the GDS Sample file pruned <- read.gdsn(index.gdsn(gdsSample, "pruned.study")) @@ -687,26 +558,33 @@ add1KG2SampleGDS <- function(gdsReference, fileProfileGDS, currentProfile, var.geno <- NULL - j <- 1 - for(i in listRef) { - g <- read.gdsn(index.gdsn(gdsReference, "genotype"), start=c(1,i), - count = c(-1,1))[listSNP] - - if(! ("genotype" %in% ls.gdsn(gdsSample))){ - var.geno <- add.gdsn(gdsSample, "genotype", - valdim=c(length(listSNP), 1), g, storage="bit2") - - }else { - if(is.null(var.geno)) { - var.geno <- index.gdsn(gdsSample, "genotype") - } - append.gdsn(var.geno, g) - } - if(j %% 5 == 0) { - sync.gds(gdsSample) - } - j <- j + 1 - } + j <- apply(matrix(c(seq_len(length(listRef)), listRef), ncol=2), 1, + FUN=function(x, gdsReference, + gdsSample, listSNP){ + i <- x[2] + j <- x[1] + + g <- read.gdsn(index.gdsn(gdsReference, "genotype"), + start=c(1,i), count = c(-1,1))[listSNP] + + if(! ("genotype" %in% ls.gdsn(gdsSample))){ + var.geno <- add.gdsn(gdsSample, "genotype", + valdim=c(length(listSNP), 1), g, storage="bit2") + + }else { + if(is.null(var.geno)) { + var.geno <- index.gdsn(gdsSample, "genotype") + } + append.gdsn(var.geno, g) + } + if(j %% 5 == 0) { + sync.gds(gdsSample) + } + return(NULL) + }, + gdsReference=gdsReference, + gdsSample=gdsSample, + listSNP=listSNP) # add.gdsn(gdsSample, "SamplePos", objdesp.gdsn(index.gdsn(gdsSample, # "genotype"))$dim[2] + 1, @@ -715,7 +593,9 @@ add1KG2SampleGDS <- function(gdsReference, fileProfileGDS, currentProfile, posCur <- which(study.annot$data.id == currentProfile & study.annot$study.id == studyID) - + if(is.null(var.geno)) { + var.geno <- index.gdsn(gdsSample, "genotype") + } g <- read.gdsn(index.gdsn(gdsSample, "geno.ref"), start=c(1, posCur), count=c(-1, 1))[listSNP] append.gdsn(var.geno, g) @@ -730,499 +610,170 @@ add1KG2SampleGDS <- function(gdsReference, fileProfileGDS, currentProfile, return(0L) } -#' @title TODO + +#' @title Append information about the 1KG samples into +#' the Profile GDS file #' -#' @description TODO +#' @description The information about the samples present in the 1KG GDS file +#' is added into the GDS Sample file. Only the information about the +#' unrelated samples +#' from the 1OOO Genome Study are copied into the GDS Sample file. The +#' information is only added to the GDS Sample file when the 1KG Study is not +#' already present in the GDS Sample file. The sample information for all +#' selected samples is appended to the GDS Sample file "study.annot" node. +#' The study information is appended to the GDS Sample file "study.list" node. #' #' @param gdsReference an object of class #' \link[gdsfmt]{gds.class} (a GDS file), the opened 1KG GDS file. #' -#' @param pathProfileGDS a \code{character} string representing the path to -#' the directory that contains the Profile GDS files. The directory must -#' exist. -#' -#' @param pathGeno a \code{character} string representing the path to -#' the directory that contains TODO -#' -#' @param fileSNPsRDS TODO +#' @param fileProfileGDS a \code{character} string representing the path and +#' file name of the GDS Sample file. The GDS Sample file must exist. #' -#' @param verbose a \code{logical} indicating if message information should be -#' printed. Default: \code{FALSE}. +#' @param verbose a \code{logical} indicating if messages should be printed +#' to show how the different steps in the function. Default: \code{FALSE}. #' #' @return The integer \code{0L} when successful. #' #' @examples #' -#' ## Path to the demo pedigree file is located in this package -#' dataDir <- system.file("extdata", 'RAIDS') +#' ## Required library for GDS +#' library(gdsfmt) #' -#' ## TODO +#' ## Get the temp folder +#' tempDir <- tempdir() #' -#' @author Pascal Belleau, Astrid Deschênes and Alex Krasnitz -#' @importFrom gdsfmt index.gdsn read.gdsn -#' @encoding UTF-8 -#' @export -addPhase1KG2SampleGDSFromFile <- function(gdsReference, pathProfileGDS, - pathGeno, fileSNPsRDS, verbose=FALSE) { - - ## The gdsReference must be an object of class "gds.class" - validateGDSClass(gds=gdsReference, name="gdsReference") - - ## Verbose must be a logical - if (!is.logical(verbose)) { - stop("The \'verbose\' parameter must be a logical (TRUE or FALSE).") - } - - listGDSSample <- dir(pathProfileGDS, pattern = ".+.gds") - - indexAll <- NULL - for(fileProfileGDS in listGDSSample) { - gdsSample <- openfn.gds(filename=file.path(pathProfileGDS, - fileProfileGDS)) - - snp.index <- read.gdsn(node=index.gdsn(node=gdsSample, "snp.index")) - - indexAll <- union(indexAll, snp.index) - closefn.gds(gdsfile=gdsSample) - } - - gdsSample <- createfn.gds(file.path(pathProfileGDS, "phase1KG.gds")) - indexAll <- indexAll[order(indexAll)] - snp.id <- read.gdsn(index.gdsn(gdsReference,"snp.id"))[indexAll] - add.gdsn(gdsSample, "snp.id", snp.id) - add.gdsn(gdsSample, "snp.index", indexAll) - listRef <- which(read.gdsn(index.gdsn(gdsReference, "sample.ref"))==1) - listSample <- read.gdsn(index.gdsn(gdsReference, "sample.id"))[listRef] - listSNP <- readRDS(file=fileSNPsRDS) - i<-1 - for(sample1KG in listSample){ - if(verbose) { message("P ", i, " ", Sys.time()) } - i <- i + 1 - file1KG <- file.path(pathGeno, paste0(sample1KG,".csv.bz2")) - matSample <- read.csv2(file=file1KG, row.names=NULL) - matSample <- matSample[listSNP[indexAll],, drop=FALSE] - matSample <- matrix(as.numeric(unlist(strsplit(matSample[,1], - "\\|"))), nrow=2)[1,] - var.phase <- NULL - if (!("phase" %in% ls.gdsn(gdsSample))) { - var.phase <- add.gdsn(gdsSample, "phase", - valdim=c(length(indexAll), 1), - matSample, storage="bit2") - } else { - if (is.null(var.phase)) { - var.phase <- index.gdsn(node=gdsSample, "phase") - } - append.gdsn(node=var.phase, val=matSample) - } - } - - closefn.gds(gdsfile=gdsSample) - - ## Success - return(0L) -} - - -#' @title TODO +#' ## Create a temporary 1KG GDS file and add needed information +#' fileName1KG <- file.path(tempDir, "GDS_TEMP_addStudy1Kg_1KG.gds") +#' gds1KG <- createfn.gds(filename=fileName1KG) +#' add.gdsn(gds1KG, "sample.id", c("HTT101", "HTT102", "HTT103")) #' -#' @description TODO +#' samples <- data.frame(sex=c(1, 1, 2), pop.group=c("GBR", "GIH", "GBR"), +#' superPop=c("EUR", "SAS", "EUR"), batch=rep(0, 3), +#' stringsAsFactors = FALSE) #' -#' @param gdsReference an object of class -#' \code{\link[SNPRelate:SNPGDSFileClass]{SNPRelate::SNPGDSFileClass}}, a SNP -#' GDS file. +#' add.gdsn(gds1KG, "sample.annot", samples) +#' add.gdsn(gds1KG, "sample.ref", c(1,0, 1)) +#' sync.gds(gds1KG) #' -#' @param gdsPhase TODO +#' ## Create a temporary Profile GDS file +#' fileNameProfile <- file.path(tempDir, "GDS_TEMP_addStudy1Kg_Sample.gds") +#' gdsProfile <- createfn.gds(fileNameProfile) #' -#' @param pathProfileGDS the path of an object of class \code{gds} related to -#' the sample +#' study.list <- data.frame(study.id=c("HTT Study"), +#' study.desc=c("Important Study"), +#' study.platform=c("Panel"), stringsAsFactors=FALSE) #' -#' @param verbose a \code{logical} indicating if message information should be -#' printed. Default: \code{TRUE}. +#' add.gdsn(gdsProfile, "study.list", study.list) #' -#' @return The integer \code{0} when successful. +#' study.annot <- data.frame(data.id=c("TOTO1"), case.id=c("TOTO1"), +#' sample.type=c("Study"), diagnosis=c("Study"), +#' source=rep("IGSR"), study.id=c("Study"), +#' stringsAsFactors=FALSE) #' -#' @examples +#' add.gdsn(gdsProfile, "study.annot", study.annot) +#' sync.gds(gdsProfile) +#' closefn.gds(gdsProfile) #' -#' ## Path to the demo pedigree file is located in this package -#' dataDir <- system.file("extdata", "RAIDS") +#' ## Append information about the 1KG samples into the Profile GDS file +#' ## The Profile GDS file will contain 'study.list' and 'study.annot' entries +#' addStudy1Kg(gdsReference=gds1KG, fileProfileGDS=fileNameProfile, +#' verbose=TRUE) #' -#' ## TODO +#' closefn.gds(gds1KG) +#' unlink(fileNameProfile, recursive=TRUE, force=TRUE) +#' unlink(fileName1KG, recursive=TRUE, force=TRUE) +#' unlink(tempDir) #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @importFrom gdsfmt index.gdsn read.gdsn +#' @importFrom gdsfmt add.gdsn index.gdsn delete.gdsn sync.gds ls.gdsn #' @encoding UTF-8 #' @export -addPhase1KG2SampleGDSFromGDS <- function(gdsReference, gdsPhase, pathProfileGDS, - verbose=FALSE) { +addStudy1Kg <- function(gdsReference, fileProfileGDS, verbose=FALSE) { + + ## Validate parameters + validateAddStudy1Kg(gdsReference=gdsReference, + fileProfileGDS=fileProfileGDS, verbose=verbose) - listGDSSample <- dir(pathProfileGDS, pattern = ".+.gds") + ## Open GDS Sample file + gdsSample <- openfn.gds(filename=fileProfileGDS, readonly=FALSE) - indexAll <- NULL - for(fileProfileGDS in listGDSSample){ - gdsSample <- openfn.gds(file.path(pathProfileGDS, fileProfileGDS)) + ## Extract study information from GDS Sample file + snp.study <- read.gdsn(index.gdsn(node=gdsSample, "study.list")) - snp.index <- read.gdsn(index.gdsn(gdsSample,"snp.index")) + ## When the 1KG Study is not already present in the GDS Sample file + if(length(which(snp.study$study.id == "Ref.1KG")) == 0) { - indexAll <- union(indexAll, snp.index) - closefn.gds(gdsSample) - } + ## Extract information about all samples from 1KG that are unrelated + ## and can be used in the ancestry analysis + sample.ref <- read.gdsn(index.gdsn(node=gdsReference, "sample.ref")) + sample.id <- read.gdsn(index.gdsn(node=gdsReference, + "sample.id"))[which(sample.ref == 1)] - gdsSample <- createfn.gds(file.path(pathProfileGDS, "phase1KG.gds")) - indexAll <- indexAll[order(indexAll)] - snp.id <- read.gdsn(index.gdsn(gdsReference,"snp.id"))[indexAll] - add.gdsn(gdsSample, "snp.id", snp.id) - add.gdsn(gdsSample, "snp.index", indexAll) - listRef <- which(read.gdsn(index.gdsn(gdsReference, "sample.ref"))==1) - listSample <- read.gdsn(index.gdsn(gdsReference, "sample.id"))[listRef] - #listSNP <- readRDS(fileSNPsRDS) - i<-1 - for(sample1KG in listSample){ - if(verbose) { message("P ", i, " ", Sys.time()) } - - matSample <- read.gdsn(index.gdsn(gdsPhase, "phase"), - start=c(1, listRef[i]), count=c(-1,1))[indexAll] - i<-i+1 - - var.phase <- NULL - if (! ("phase" %in% ls.gdsn(gdsSample))) { - var.phase <- add.gdsn(gdsSample, "phase", - valdim=c(length(indexAll), 1), - matSample, storage="bit2") + ## Create study information for the 1KG Study + study.list <- data.frame(study.id="Ref.1KG", + study.desc="Unrelated samples from 1000 Genomes", + study.platform="GRCh38 1000 genotypes", stringsAsFactors=FALSE) - } else { - if (is.null(var.phase)) { - var.phase <- index.gdsn(gdsSample, "phase") - } - append.gdsn(var.phase, matSample) - } + ## Create the pedigree information for the 1KG samples + ped1KG <- data.frame(Name.ID=sample.id, Case.ID=sample.id, + Sample.Type=rep("Reference", length(sample.id)), + Diagnosis=rep("Reference", length(sample.id)), + Source=rep("IGSR", length(sample.id)), stringsAsFactors=FALSE) + + ## Row names must be the sample identifiers + rownames(ped1KG) <- ped1KG$Name.ID + + ## Add the information about the 1KG samples into the Profile GDS + addStudyGDSSample(gdsProfile=gdsSample, pedProfile=ped1KG, batch=1, + listSamples=NULL, studyDF=study.list, verbose=verbose) + + sync.gds(gdsSample) } + ## Close GDS Sample file (important) closefn.gds(gdsSample) - ## Successful + ## Return success return(0L) } -#' @title Compute principal component axes (PCA) on pruned SNV with the -#' reference samples +#' @title Project synthetic profiles onto existing principal component axes +#' generated using the reference 1KG profiles #' -#' @description This function compute the PCA on pruned SNV with the -#' reference samples +#' @description The function projects the synthetic profiles onto existing +#' principal component axes generated using the reference 1KG profiles. The +#' reference profiles used to generate the synthetic profiles have previously +#' been removed from the set of reference profiles. #' -#' @param gdsProfile an object of class -#' \code{\link[SNPRelate:SNPGDSFileClass]{SNPRelate::SNPGDSFileClass}}, a SNP -#' GDS file. +#' @param gdsProfile an object of class \link[gdsfmt]{gds.class} (a GDS file), +#' an opened Profile GDS file. #' -#' @param listRef a \code{vector} of string representing the -#' identifier of the profiles in the reference (unrelated). +#' @param listPCA a \code{list} containing the PCA \code{object} generated +#' with the 1KG reference profiles (excluding the ones used to generate the +#' synthetic data set) in an entry called \code{"pca.unrel"}. #' -#' @param np a single positive \code{integer} representing the number of -#' threads. Default: \code{1L}. +#' @param sampleRef a \code{vector} of \code{character} strings representing +#' the identifiers of the 1KG reference profiles that have been used to +#' generate the synthetic profiles +#' that are going to be analysed here. The sub-continental +#' identifiers are used as names for the \code{vector}. #' -#' @param verbose a \code{logical} indicating if the PCA functions should be -#' verbose. Default: \code{FALSE}. -#' -#' @return listPCA a \code{list} containing two objects -#' pca.unrel -> \code{snpgdsPCAClass} -#' and a snp.load -> \code{snpgdsPCASNPLoading} -#' -#' @details -#' -#' More information about the method used to calculate the patient eigenvectors -#' can be found at the Bioconductor SNPRelate website: -#' https://bioconductor.org/packages/SNPRelate/ -#' -#' @examples -#' -#' ## Path to the demo pedigree file is located in this package -#' dataDir <- system.file("extdata", "RAIDS") -#' -#' ## TODO -#' -#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @importFrom SNPRelate snpgdsPCA snpgdsPCASNPLoading -#' @importFrom gdsfmt index.gdsn read.gdsn -#' @importFrom S4Vectors isSingleNumber -#' @encoding UTF-8 -#' @export -computePrunedPCARef <- function(gdsProfile, listRef, np=1L, verbose=FALSE) { - - ## The gdsReference must be an object of class "gds.class" - validateGDSClass(gds=gdsProfile, name="gdsProfile") - - ## Validate that np is a single positive integer - if(! (isSingleNumber(np) && np > 0)) { - stop("The \'np\' parameter must be a single positive integer.") - } - - if(! is.logical(verbose)) { - stop("The \'verbose\' parameter must be logical (TRUE or FALSE).") - } - - listPCA <- list() - - listPruned <- read.gdsn(index.gdsn(gdsProfile, "pruned.study")) - - ## Calculate the eigenvectors using the specified SNP loadings for - ## the reference profiles - listPCA[["pca.unrel"]] <- snpgdsPCA(gdsobj=gdsProfile, - sample.id=listRef, - snp.id=listPruned, - num.thread=np, - verbose=verbose) - - listPCA[["snp.load"]] <- snpgdsPCASNPLoading(pcaobj=listPCA[["pca.unrel"]], - gdsobj=gdsProfile, - num.thread=np, - verbose=verbose) - return(listPCA) -} - - - -#' @title Project profile onto existing principal component axes (PCA) -#' -#' @description This function calculates the profile eigenvectors using -#' the specified SNP loadings. -#' -#' @param gdsProfile an object of class -#' \code{\link[SNPRelate:SNPGDSFileClass]{SNPRelate::SNPGDSFileClass}}, an -#' opened Profile GDS file. -#' -#' @param listPCA a \code{list} containing two entries: -#' \itemize{ -#' \item{pca.unrel} {\code{snpgdsPCAClass} object} -#' \item{snp.load} {\code{snpgdsPCASNPLoading} object} -#' } -#' -#' @param currentProfile a \code{character} string representing the -#' identifiant of the profile to be projected in the PCA. -#' -#' @param np a single positive \code{integer} representing the number of -#' threads. Default: \code{1L}. -#' -#' @param verbose a \code{logical} passed to the PCA function. -#' Default: \code{FALSE}. -#' -#' @return a \code{snpgdsPCAClass} object, a \code{list} that contains: -#' \itemize{ -#' \item{sample.id} {the sample ids used in the analysis} -#' \item{snp.id} {the SNP ids used in the analysis} -#' \item{eigenvalues} {eigenvalues} -#' \item{eigenvect} {eigenvactors, “# of samples” x “eigen.cnt”} -#' \item{TraceXTX} {the trace of the genetic covariance matrix} -#' \item{Bayesian} {whether use bayerisan normalization} -#'} -#' -#' @details -#' -#' More information about the method used to calculate the patient eigenvectors -#' can be found at the Bioconductor SNPRelate website: -#' https://bioconductor.org/packages/SNPRelate/ -#' -#' @examples -#' -#' ## Path to the demo pedigree file is located in this package -#' dataDir <- system.file("extdata", "RAIDS") -#' -#' ## TODO -#' -#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @importFrom SNPRelate snpgdsPCASampLoading -#' @importFrom S4Vectors isSingleNumber -#' @encoding UTF-8 -#' @export -projectSample2PCA <- function(gdsProfile, listPCA, currentProfile, np=1L, - verbose=FALSE) { - - - ## Validate that currentProfile is a character string - if(! is.character(currentProfile)) { - stop("The \'currentProfile\' parameter must be a character string.") - } - - ## Validate that np is a single positive integer - if(! (isSingleNumber(np) && np > 0)) { - stop("The \'np\' parameter must be a single positive integer.") - } - - if(! is.logical(verbose)) { - stop("The \'verbose\' parameter must be logical (TRUE or FALSE).") - } - - ## Calculate the sample eigenvectors using the specified SNP loadings - samplePCA <- snpgdsPCASampLoading(listPCA[["snp.load"]], - gdsobj=gdsProfile, sample.id=currentProfile, - num.thread=1, verbose=verbose) - - return(samplePCA) -} - - -#' @title Append information about the 1KG samples into -#' the Profile GDS file -#' -#' @description The information about the samples present in the 1KG GDS file -#' is added into the GDS Sample file. Only the information about the -#' unrelated samples -#' from the 1OOO Genome Study are copied into the GDS Sample file. The -#' information is only added to the GDS Sample file when the 1KG Study is not -#' already present in the GDS Sample file. The sample information for all -#' selected samples is appended to the GDS Sample file "study.annot" node. -#' The study information is appended to the GDS Sample file "study.list" node. -#' -#' @param gdsReference an object of class -#' \link[gdsfmt]{gds.class} (a GDS file), the opened 1KG GDS file. -#' -#' @param fileProfileGDS a \code{character} string representing the path and -#' file name of the GDS Sample file. The GDS Sample file must exist. -#' -#' @param verbose a \code{logical} indicating if messages should be printed -#' to show how the different steps in the function. Default: \code{FALSE}. -#' -#' @return The integer \code{0L} when successful. -#' -#' @examples -#' -#' ## Required library for GDS -#' library(gdsfmt) -#' -#' ## Get the temp folder -#' tempDir <- tempdir() -#' -#' ## Create a temporary 1KG GDS file and add needed information -#' fileName1KG <- file.path(tempDir, "GDS_TEMP_addStudy1Kg_1KG.gds") -#' gds1KG <- createfn.gds(filename=fileName1KG) -#' add.gdsn(gds1KG, "sample.id", c("HTT101", "HTT102", "HTT103")) -#' -#' samples <- data.frame(sex=c(1, 1, 2), pop.group=c("GBR", "GIH", "GBR"), -#' superPop=c("EUR", "SAS", "EUR"), batch=rep(0, 3), -#' stringsAsFactors = FALSE) -#' -#' add.gdsn(gds1KG, "sample.annot", samples) -#' add.gdsn(gds1KG, "sample.ref", c(1,0, 1)) -#' sync.gds(gds1KG) -#' -#' ## Create a temporary Profile GDS file -#' fileNameProfile <- file.path(tempDir, "GDS_TEMP_addStudy1Kg_Sample.gds") -#' gdsProfile <- createfn.gds(fileNameProfile) -#' -#' study.list <- data.frame(study.id=c("HTT Study"), -#' study.desc=c("Important Study"), -#' study.platform=c("Panel"), stringsAsFactors=FALSE) -#' -#' add.gdsn(gdsProfile, "study.list", study.list) -#' -#' study.annot <- data.frame(data.id=c("TOTO1"), case.id=c("TOTO1"), -#' sample.type=c("Study"), diagnosis=c("Study"), -#' source=rep("IGSR"), study.id=c("Study"), -#' stringsAsFactors=FALSE) -#' -#' add.gdsn(gdsProfile, "study.annot", study.annot) -#' sync.gds(gdsProfile) -#' closefn.gds(gdsProfile) -#' -#' ## Append information about the 1KG samples into the Profile GDS file -#' ## The Profile GDS file will contain 'study.list' and 'study.annot' entries -#' addStudy1Kg(gdsReference=gds1KG, fileProfileGDS=fileNameProfile, -#' verbose=TRUE) -#' -#' closefn.gds(gds1KG) -#' unlink(fileNameProfile, recursive=TRUE, force=TRUE) -#' unlink(fileName1KG, recursive=TRUE, force=TRUE) -#' unlink(tempDir) -#' -#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @importFrom gdsfmt add.gdsn index.gdsn delete.gdsn sync.gds ls.gdsn -#' @encoding UTF-8 -#' @export -addStudy1Kg <- function(gdsReference, fileProfileGDS, verbose=FALSE) { - - ## Validate parameters - validateAddStudy1Kg(gdsReference=gdsReference, - fileProfileGDS=fileProfileGDS, verbose=verbose) - - ## Open GDS Sample file - gdsSample <- openfn.gds(filename=fileProfileGDS, readonly=FALSE) - - ## Extract study information from GDS Sample file - snp.study <- read.gdsn(index.gdsn(node=gdsSample, "study.list")) - - ## When the 1KG Study is not already present in the GDS Sample file - if(length(which(snp.study$study.id == "Ref.1KG")) == 0) { - - ## Extract information about all samples from 1KG that are unrelated - ## and can be used in the ancestry analysis - sample.ref <- read.gdsn(index.gdsn(node=gdsReference, "sample.ref")) - sample.id <- read.gdsn(index.gdsn(node=gdsReference, - "sample.id"))[which(sample.ref == 1)] - - ## Create study information for the 1KG Study - study.list <- data.frame(study.id="Ref.1KG", - study.desc="Unrelated samples from 1000 Genomes", - study.platform="GRCh38 1000 genotypes", stringsAsFactors=FALSE) - - ## Create the pedigree information for the 1KG samples - ped1KG <- data.frame(Name.ID=sample.id, Case.ID=sample.id, - Sample.Type=rep("Reference", length(sample.id)), - Diagnosis=rep("Reference", length(sample.id)), - Source=rep("IGSR", length(sample.id)), stringsAsFactors=FALSE) - - ## Row names must be the sample identifiers - rownames(ped1KG) <- ped1KG$Name.ID - - ## Add the information about the 1KG samples into the Profile GDS - addStudyGDSSample(gdsProfile=gdsSample, pedProfile=ped1KG, batch=1, - listSamples=NULL, studyDF=study.list, verbose=verbose) - - sync.gds(gdsSample) - } - - ## Close GDS Sample file (important) - closefn.gds(gdsSample) - - ## Return success - return(0L) -} - - -#' @title Project synthetic profiles onto existing principal component axes -#' generated using the reference 1KG profiles -#' -#' @description The function projects the synthetic profiles onto existing -#' principal component axes generated using the reference 1KG profiles. The -#' reference profiles used to generate the synthetic profiles have previously -#' been removed from the set of reference profiles. -#' -#' @param gdsProfile an object of class \link[gdsfmt]{gds.class} (a GDS file), -#' an opened Profile GDS file. -#' -#' @param listPCA a \code{list} containing the PCA \code{object} generated -#' with the 1KG reference profiles (excluding the ones used to generate the -#' synthetic data set) in an entry called \code{"pca.unrel"}. -#' -#' @param sampleRef a \code{vector} of \code{character} strings representing -#' the identifiers of the 1KG reference profiles that have been used to -#' generate the synthetic profiles -#' that are going to be analysed here. The sub-continental -#' identifiers are used as names for the \code{vector}. -#' -#' @param studyIDSyn a \code{character} string corresponding to the study -#' identifier. -#' The study identifier must be present in the Profile GDS file. +#' @param studyIDSyn a \code{character} string corresponding to the study +#' identifier. +#' The study identifier must be present in the Profile GDS file. #' #' @param verbose a \code{logical} indicating if messages should be printed #' to show how the different steps in the function. Default: \code{FALSE}. #' #' @return a \code{list} containing 3 entries: -#' \itemize{ -#' \item{sample.id} { a \code{vector} of \code{character} strings representing +#' \describe{ +#' \item{sample.id}{ a \code{vector} of \code{character} strings representing #' the identifiers of the synthetic profiles that have been projected onto #' the 1KG PCA. } -#' \item{eigenvector.ref} { a \code{matrix} of \code{numeric} with the +#' \item{eigenvector.ref}{ a \code{matrix} of \code{numeric} with the #' eigenvectors of the 1KG reference profiles used to generate the PCA.} -#' \item{eigenvector} { a \code{matrix} of \code{numeric} with the +#' \item{eigenvector}{ a \code{matrix} of \code{numeric} with the #' eigenvectors of the synthetic profiles projected onto the 1KG PCA. } #' } #' @@ -1231,6 +782,9 @@ addStudy1Kg <- function(gdsReference, fileProfileGDS, verbose=FALSE) { #' ## Required library #' library(gdsfmt) #' +#' ## Loading demo PCA on subset of 1KG reference dataset +#' data(demoPCA1KG) +#' #' ## Path to the demo Profile GDS file is located in this package #' dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") #' @@ -1246,14 +800,12 @@ addStudy1Kg <- function(gdsReference, fileProfileGDS, verbose=FALSE) { #' "PEL", "PJL", "KHV", "ACB", "GWD", "ESN", "BEB", "MSL", "STU", "ITU", #' "CEU", "YRI", "CHB", "JPT", "LWK", "ASW", "MXL", "TSI", "GIH") #' -#' ## The PCA on the 1KG reference profiles -#' pca <- readRDS(file.path(dataDir, "pca1KG.RDS")) -#' #' ## Open the Profile GDS file #' gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) #' #' ## Projects synthetic profiles on 1KG PCA -#' results <- computePCAMultiSynthetic(gdsProfile=gdsProfile, listPCA=pca, +#' results <- computePCAMultiSynthetic(gdsProfile=gdsProfile, +#' listPCA=demoPCA1KG, #' sampleRef=samplesRM, studyIDSyn=studyID, verbose=FALSE) #' #' ## The eigenvectors for the synthetic profiles @@ -1352,12 +904,12 @@ computePCAMultiSynthetic <- function(gdsProfile, listPCA, #' doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. #' #' @return a \code{list} containing 3 entries: -#' \itemize{ -#' \item{\code{sample.id}} { a \code{character} string representing the unique +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique #' identifier of the analyzed profile.} -#' \item{\code{eigenvector.ref}} { a \code{matrix} of \code{numeric} +#' \item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric} #' representing the eigenvectors of the reference profiles. } -#' \item{\code{eigenvector}} { a \code{matrix} of \code{numeric} representing +#' \item{\code{eigenvector}}{ a \code{matrix} of \code{numeric} representing #' the eigenvectors of the analyzed profile. } #' } #' @@ -1488,28 +1040,28 @@ computePCARefSample <- function(gdsProfile, currentProfile, #' Default: \code{seq(2, 15, 1)}. #' #' @return a \code{list} containing 4 entries: -#' \itemize{ -#' \item{\code{sample.id}} {a \code{vector} of \code{character} strings +#' \describe{ +#' \item{\code{sample.id}}{ a \code{vector} of \code{character} strings #' representing the identifiers of the synthetic profiles analysed.} -#' \item{\code{sample1Kg}} {a \code{vector} of \code{character} strings +#' \item{\code{sample1Kg}}{ a \code{vector} of \code{character} strings #' representing the identifiers of the 1KG reference profiles used to #' generate the synthetic profiles.} -#' \item{\code{sp}} {a \code{vector} of \code{character} strings representing +#' \item{\code{sp}}{ a \code{vector} of \code{character} strings representing #' the known super population ancestry of the 1KG reference profiles used #' to generate the synthetic profiles.} -#' \item{\code{matKNN}} {a \code{data.frame} containing the super population +#' \item{\code{matKNN}}{ a \code{data.frame} containing the super population #' inference for each synthetic profiles for different values of PCA #' dimensions \code{D} and k-neighbors values \code{K}. The fourth column title #' corresponds to the \code{fieldPopInfAnc} parameter. #' The \code{data.frame} contains 4 columns: -#' \itemize{ -#' \item{\code{sample.id}} {a \code{character} string representing +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing #' the identifier of the synthetic profile analysed.} -#' \item{\code{D}} {a \code{numeric} strings representing +#' \item{\code{D}}{ a \code{numeric} strings representing #' the value of the PCA dimension used to infer the super population.} -#' \item{\code{K}} {a \code{numeric} strings representing +#' \item{\code{K}}{ a \code{numeric} strings representing #' the value of the k-neighbors used to infer the super population.} -#' \item{\code{fieldPopInfAnc} value} {a \code{character} string representing +#' \item{\code{fieldPopInfAnc} value}{ a \code{character} string representing #' the inferred ancestry.} #' } #' } @@ -1520,27 +1072,29 @@ computePCARefSample <- function(gdsProfile, currentProfile, #' ## Required library #' library(gdsfmt) #' -#' ## Path to the demo Profile GDS file is located in this package -#' dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") -#' -#' # The name of the synthetic study -#' studyID <- "MYDATA.Synthetic" +#' ## Load the demo PCA on the synthetic profiles projected on the +#' ## demo 1KG reference PCA +#' data(demoPCASyntheticProfiles) #' -#' ## The PCA on the synthetic profiles projected on the 1KG reference PCA -#' pca <- readRDS(file.path(dataDir, "pcaSynthetic.RDS")) +#' ## Load the known ancestry for the demo 1KG reference profiles +#' data(demoKnownSuperPop1KG) #' -#' ## The known ancestry for the 1KG reference profiles -#' refKnownSuperPop <- readRDS(file.path(dataDir, "knownSuperPop1KG.RDS")) +#' ## Path to the demo Profile GDS file is located in this package +#' dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") #' #' ## Open the Profile GDS file #' gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) #' +#' # The name of the synthetic study +#' studyID <- "MYDATA.Synthetic" +#' #' ## Projects synthetic profiles on 1KG PCA -#' results <- computeKNNRefSynthetic(gdsProfile=gdsProfile, listEigenvector=pca, +#' results <- computeKNNRefSynthetic(gdsProfile=gdsProfile, +#' listEigenvector=demoPCASyntheticProfiles, #' listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), studyIDSyn=studyID, -#' spRef=refKnownSuperPop) +#' spRef=demoKnownSuperPop1KG) #' -#' ## The inferred ancestry for the synthetic profiles for differents values +#' ## The inferred ancestry for the synthetic profiles for different values #' ## of D and K #' head(results$matKNN) #' @@ -1662,22 +1216,22 @@ computeKNNRefSynthetic <- function(gdsProfile, listEigenvector, #' Default: \code{seq(2, 15, 1)}. #' #' @return a \code{list} containing 4 entries: -#' \itemize{ -#' \item{\code{sample.id}} { a \code{vector} of \code{character} strings +#' \describe{ +#' \item{\code{sample.id}}{ a \code{vector} of \code{character} strings #' representing the identifier of the profile analysed.} -#' \item{\code{matKNN}} { a \code{data.frame} containing the super population +#' \item{\code{matKNN}}{ a \code{data.frame} containing the super population #' inference for the profile for different values of PCA #' dimensions \code{D} and k-neighbors values \code{K}. The fourth column title #' corresponds to the \code{fieldPopInfAnc} parameter. #' The \code{data.frame} contains 4 columns: -#' \itemize{ -#' \item{\code{sample.id}} {a \code{character} string representing +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing #' the identifier of the profile analysed.} -#' \item{\code{D}} { a \code{numeric} strings representing +#' \item{\code{D}}{ a \code{numeric} strings representing #' the value of the PCA dimension used to infer the ancestry.} -#' \item{\code{K}} { a \code{numeric} strings representing +#' \item{\code{K}}{ a \code{numeric} strings representing #' the value of the k-neighbors used to infer the ancestry..} -#' \item{\code{fieldPopInfAnc}} { a \code{character} string representing +#' \item{\code{fieldPopInfAnc}}{ a \code{character} string representing #' the inferred ancestry.} #' } #' } @@ -1685,22 +1239,23 @@ computeKNNRefSynthetic <- function(gdsProfile, listEigenvector, #' #' @examples #' -#' ## Path to the demo files located in this package -#' dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") +#' ## Load the demo PCA on the synthetic profiles projected on the +#' ## demo 1KG reference PCA +#' data(demoPCASyntheticProfiles) +#' +#' ## Load the known ancestry for the demo 1KG reference profiles +#' data(demoKnownSuperPop1KG) #' #' ## The PCA with 1 profile projected on the 1KG reference PCA #' ## Only one profile is retained -#' pca <- readRDS(file.path(dataDir, "pcaSynthetic.RDS")) +#' pca <- demoPCASyntheticProfiles #' pca$sample.id <- pca$sample.id[1] #' pca$eigenvector <- pca$eigenvector[1, , drop=FALSE] #' -#' ## The known ancestry for the 1KG reference profiles -#' refKnownSuperPop <- readRDS(file.path(dataDir, "knownSuperPop1KG.RDS")) -#' #' ## Projects profile on 1KG PCA #' results <- computeKNNRefSample(listEigenvector=pca, #' listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), -#' spRef=refKnownSuperPop, fieldPopInfAnc="SuperPop", +#' spRef=demoKnownSuperPop1KG, fieldPopInfAnc="SuperPop", #' kList=seq(10, 15, 1), pcaList=seq(10, 15, 1)) #' #' ## The assigned ancestry to the profile for different values of K and D @@ -1840,16 +1395,16 @@ computeKNNRefSample <- function(listEigenvector, #' printed. Default: \code{FALSE}. #' #' @return a \code{list} containing the following entries: -#' \itemize{ -#' \item{sample.id}{a \code{vector} of \code{character} strings representing +#' \describe{ +#' \item{sample.id}{ a \code{vector} of \code{character} strings representing #' the identifiers of the synthetic profiles. } -#' \item{sample1Kg}{a \code{vector} of \code{character} strings representing +#' \item{sample1Kg}{ a \code{vector} of \code{character} strings representing #' the identifiers of the reference 1KG profiles used to generate the #' synthetic profiles. } -#' \item{sp}{a \code{vector} of \code{character} strings representing the +#' \item{sp}{ a \code{vector} of \code{character} strings representing the #' known ancestry for the reference 1KG profiles used to generate the #' synthetic profiles. } -#' \item{matKNN}{a \code{data.frame} containing 4 columns. The first column +#' \item{matKNN}{ a \code{data.frame} containing 4 columns. The first column #' 'sample.id' contains the name of the synthetic profile. The second column #' 'D' represents the dimension D used to infer the ancestry. The third column #' 'K' represents the number of neighbors K used to infer the ancestry. The @@ -1868,8 +1423,9 @@ computeKNNRefSample <- function(listEigenvector, #' ## Required library #' library(gdsfmt) #' -#' ## Path to the demo Profile GDS file is located in this package -#' dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") +#' ## Load the known ancestry for the demo 1KG reference profiles +#' data(demoKnownSuperPop1KG) +#' #' #' # The name of the synthetic study #' studyID <- "MYDATA.Synthetic" @@ -1883,8 +1439,8 @@ computeKNNRefSample <- function(listEigenvector, #' "PEL", "PJL", "KHV", "ACB", "GWD", "ESN", "BEB", "MSL", "STU", "ITU", #' "CEU", "YRI", "CHB", "JPT", "LWK", "ASW", "MXL", "TSI", "GIH") #' -#' ## The known ancestry for the 1KG reference profiles -#' refKnownSuperPop <- readRDS(file.path(dataDir, "knownSuperPop1KG.RDS")) +#' ## Path to the demo Profile GDS file is located in this package +#' dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") #' #' ## Open the Profile GDS file #' gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) @@ -1892,7 +1448,8 @@ computeKNNRefSample <- function(listEigenvector, #' ## Run a PCA analysis and a K-nearest neighbors analysis on a small set #' ## of synthetic data #' results <- computePoolSyntheticAncestryGr(gdsProfile=gdsProfile, -#' sampleRM=samplesRM, studyIDSyn=studyID, np=1L, spRef=refKnownSuperPop, +#' sampleRM=samplesRM, studyIDSyn=studyID, np=1L, +#' spRef=demoKnownSuperPop1KG, #' kList=seq(10,15,1), pcaList=seq(10,15,1), eigenCount=15L) #' #' ## The ancestry inference for the synthetic data using @@ -1957,168 +1514,28 @@ computePoolSyntheticAncestryGr <- function(gdsProfile, sampleRM, spRef, } -#' @title TODO + +#' @title Select the optimal K and D parameters using the synthetic data and +#' infer the ancestry of a specific profile #' -#' @description TODO +#' @description The function select the optimal K and D parameters for a +#' specific profile. The results on the synthetic data are used for the +#' parameter selection. Once the optimal parameters are selected, the +#' ancestry is inferred for the specific profile. #' #' @param gdsReference an object of class \link[gdsfmt]{gds.class} (a GDS -#' file), the opened Reference GDS file. -#' -#' @param gdsSample an object of class \link[gdsfmt]{gds.class} (a GDS file), -#' an opened Profile GDS file. +#' file), the opened 1KG GDS file. #' -#' @param profileID a single \code{character} string representing the -#' profile identifier. +#' @param gdsProfile an object of class \code{\link[gdsfmt]{gds.class}} +#' (a GDS file), the opened Profile GDS file. #' -#' @param dataRef a \code{data.frame} containing the information of the -#' synthetic profiles that will be +#' @param listFiles a \code{vector} of \code{character} strings representing +#' the name of files that contain the results of ancestry inference done on +#' the synthetic profiles for multiple values of _D_ and _K_. The files must +#' exist. #' -#' @param spRef TODO -#' -#' @param studyIDSyn a \code{character} string corresponding to the study -#' identifier. The study identifier must be present in the Profile GDS file. -#' -#' @param np a single positive \code{integer} representing the number of -#' threads. Default: \code{1L}. -#' -#' @param listCatPop a \code{vector} of \code{character} string -#' representing the list of possible ancestry assignations. Default: -#' \code{("EAS", "EUR", "AFR", "AMR", "SAS")}. -#' -#' @param fieldPopIn1KG a \code{character} string representing TODO . -#' Default: \code{"superPop"}. -#' -#' @param fieldPopInfAnc a \code{character} string representing the name of -#' the column that will contain the inferred ancestry for the specified -#' dataset. Default: \code{"SuperPop"}. -#' -#' @param kList a \code{vector} of \code{integer} representing the list of -#' values tested for the _K_ parameter. The _K_ parameter represents the -#' number of neighbors used in the K-nearest neighbors analysis. If -#' \code{NULL}, the value \code{seq(2,15,1)} is assigned. -#' Default: \code{seq(2,15,1)}. -#' -#' @param pcaList a \code{vector} of \code{integer} representing the list of -#' values tested for the _D_ parameter. The _D_ parameter represents the -#' number of dimensions used in the PCA analysis. If \code{NULL}, -#' the value \code{seq(2,15,1)} is assigned. -#' Default: \code{seq(2,15,1)}. -#' -#' @param algorithm a \code{character} string representing the algorithm used -#' to calculate the PCA. The 2 choices are "exact" (traditional exact -#' calculation) and "randomized" (fast PCA with randomized algorithm -#' introduced in Galinsky et al. 2016). Default: \code{"exact"}. -#' -#' @param eigenCount a single \code{integer} indicating the number of -#' eigenvectors that will be in the output of the \link[SNPRelate]{snpgdsPCA} -#' function; if 'eigenCount' <= 0, then all eigenvectors are returned. -#' Default: \code{32L}. -#' -#' @param missingRate a \code{numeric} value representing the threshold -#' missing rate at with the SNVs are discarded; the SNVs are retained in the -#' \link[SNPRelate]{snpgdsPCA} -#' with "<= missingRate" only; if \code{NaN}, no missing threshold. -#' Default: \code{0.025}. -#' -#' -#' @return A \code{list} TODO with the sample.id and eigenvectors -#' and a table with KNN callfor different K and pca dimension. -#' -#' @references -#' -#' Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, -#' Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution -#' of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. -#' doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. -#' -#' @examples -#' -#' # TODO -#' listEigenvector <- "TOTO" -#' -#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @encoding UTF-8 -#' @export -computePoolSyntheticAncestry <- function(gdsReference, gdsSample, profileID, - dataRef, spRef, studyIDSyn, np=1L, - listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), - fieldPopIn1KG="superPop", fieldPopInfAnc="SuperPop", - kList=seq(2, 15, 1), pcaList = seq(2, 15, 1), - algorithm="exact", eigenCount=32L, missingRate=0.025) { - - ## Assign default value is kList is NULL - if(is.null(kList)) { - kList <- seq(2,15,1) - } - - ## Assign default value is pcaList is NULL - if(is.null(pcaList)) { - pcaList <- seq(2,15,1) - } - ## Add parameter validation - validateComputePoolSyntheticAncestry(gdsReference=gdsReference, - profileGDS=gdsSample, profileID=profileID, dataRef=dataRef, - spRef=spRef, studyIDSyn=studyIDSyn, np=np, listCatPop=listCatPop, - fieldPopIn1KG=fieldPopIn1KG, fieldPopInfAnc=fieldPopInfAnc, - kList=kList, pcaList=pcaList, algorithm=algorithm, - eigenCount=eigenCount, missingRate=missingRate) - - sampleRM <- splitSelectByPop(dataRef) - - KNN.list <- list() - for(j in seq_len(nrow(sampleRM))) { - ## Run a PCA analysis using 1 synthetic profile from each - ## sub-continental ancestry - ## The synthetic profiles are projected on the 1KG PCA space - ## (the reference samples used to generate the synthetic profiles are - ## removed from this PCA) - KNN.list[[j]] <- computePoolSyntheticAncestryGr(gdsProfile=gdsSample, - sampleRM=sampleRM[j,], spRef=spRef, studyIDSyn=studyIDSyn, - np=np, listCatPop=listCatPop, - fieldPopInfAnc=fieldPopInfAnc, kList=kList, - pcaList=pcaList, algorithm=algorithm, eigenCount=eigenCount, - missingRate=missingRate, verbose=FALSE) - } - - resultsKNN <- do.call(rbind, KNN.list) - - pedSyn <- prepPedSynthetic1KG(gdsReference=gdsReference, - gdsSample=gdsSample, studyID=studyIDSyn, popName=fieldPopIn1KG) - - listParaSample <- selParaPCAUpQuartile(resultsKNN, pedSyn, - fieldPopIn1KG, fieldPopInfAnc, listCatPop) - - listPCASample <- computePCARefSample(gdsProfile=gdsSample, - currentProfile=profileID, studyIDRef="Ref.1KG", np=np, - algorithm=algorithm, eigenCount=eigenCount, missingRate=missingRate) - - listKNNSample <- computeKNNSuperPopSample(gdsSample=gdsSample, - profileID, spRef) - - return(listKNNSample) -} - -#' @title Select the optimal K and D parameters using the synthetic data and -#' infer the ancestry of a specific profile -#' -#' @description The function select the optimal K and D parameters for a -#' specific profile. The results on the synthetic data are used for the -#' parameter selection. Once the optimal parameters are selected, the -#' ancestry is inferred for the specific profile. -#' -#' @param gdsReference an object of class \link[gdsfmt]{gds.class} (a GDS -#' file), the opened 1KG GDS file. -#' -#' @param gdsProfile an object of class \code{\link[gdsfmt]{gds.class}} -#' (a GDS file), the opened Profile GDS file. -#' -#' @param listFiles a \code{vector} of \code{character} strings representing -#' the name of files that contain the results of ancestry inference done on -#' the synthetic profiles for multiple values of _D_ and _K_. The files must -#' exist. -#' -#' @param currentProfile a \code{character} string representing the profile -#' identifier of the current profile on which ancestry will be inferred. +#' @param currentProfile a \code{character} string representing the profile +#' identifier of the current profile on which ancestry will be inferred. #' #' @param spRef a \code{vector} of \code{character} strings representing the #' known super population ancestry for the 1KG profiles. The 1KG profile @@ -2174,10 +1591,10 @@ computePoolSyntheticAncestry <- function(gdsReference, gdsSample, profileID, #' to show how the different steps in the function. Default: \code{FALSE}. #' #' @return a \code{list} containing 4 entries: -#' \itemize{ +#' \describe{ #' \item{\code{pcaSample}}{ a \code{list} containing the information related #' to the eigenvectors. The \code{list} contains those 3 entries: -#' \itemize{ +#' \describe{ #' \item{\code{sample.id}}{ a \code{character} string representing the unique #' identifier of the current profile.} #' \item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric} containing @@ -2187,15 +1604,14 @@ computePoolSyntheticAncestry <- function(gdsReference, gdsSample, profileID, #' reference profiles.} #' } #' } -#' #' \item{\code{paraSample}}{ a \code{list} containing the results with #' different \code{D} and \code{K} values that lead to optimal parameter #' selection. The \code{list} contains those entries: -#' \itemize{ +#' \describe{ #' \item{\code{dfPCA}}{ a \code{data.frame} containing statistical results #' on all combined synthetic results done with a fixed value of \code{D} (the #' number of dimensions). The \code{data.frame} contains those columns: -#' \itemize{ +#' \describe{ #' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the #' number of dimensions).} #' \item{\code{median}}{ a \code{numeric} representing the median of the @@ -2215,7 +1631,7 @@ computePoolSyntheticAncestry <- function(gdsReference, gdsSample, profileID, #' all combined synthetic results done with different values of \code{D} (the #' number of dimensions) and \code{K} (the number of neighbors). #' The \code{data.frame} contains those columns: -#' \itemize{ +#' \describe{ #' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the #' number of dimensions).} #' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the @@ -2234,7 +1650,7 @@ computePoolSyntheticAncestry <- function(gdsReference, gdsSample, profileID, #' \item{\code{dfAUROC}}{ a \code{data.frame} the summary of the results by #' super-population. The \code{data.frame} contains #' those columns: -#' \itemize{ +#' \describe{ #' \item{\code{pcaD}}{ a \code{numeric} representing the value of \code{D} (the #' number of dimensions).} #' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the @@ -2260,17 +1676,16 @@ computePoolSyntheticAncestry <- function(gdsReference, gdsSample, profileID, #' \code{D} is possible.} #' } #' } -#' #' \item{\code{KNNSample}}{ a \code{list} containing the inferred ancestry #' using different \code{D} and \code{K} values. The \code{list} contains #' those entries: -#' \itemize{ +#' \describe{ #' \item{\code{sample.id}}{ a \code{character} string representing the unique #' identifier of the current profile.} #' \item{\code{matKNN}}{ a \code{data.frame} containing the inferred ancestry #' for different values of \code{K} and \code{D}. The \code{data.frame} #' contains those columns: -#' \itemize{ +#' \describe{ #' \item{\code{sample.id}}{ a \code{character} string representing the unique #' identifier of the current profile.} #' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the @@ -2283,11 +1698,10 @@ computePoolSyntheticAncestry <- function(gdsReference, gdsSample, profileID, #' } #' } #' } -#' #' \item{\code{Ancestry}}{ a \code{data.frame} containing the inferred #' ancestry for the current profile. The \code{data.frame} contains those #' columns: -#' \itemize{ +#' \describe{ #' \item{\code{sample.id}}{ a \code{character} string representing the unique #' identifier of the current profile.} #' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the @@ -2314,11 +1728,14 @@ computePoolSyntheticAncestry <- function(gdsReference, gdsSample, profileID, #' ## Required library #' library(gdsfmt) #' +#' ## Load the known ancestry for the demo 1KG reference profiles +#' data(demoKnownSuperPop1KG) +#' #' ## The Reference GDS file -#' path1KG <- system.file("extdata/example/gdsRef", package="RAIDS") +#' path1KG <- system.file("extdata/tests", package="RAIDS") #' #' ## Open the Reference GDS file -#' gdsRef <- snpgdsOpen(file.path(path1KG, "ex1kg.gds")) +#' gdsRef <- snpgdsOpen(file.path(path1KG, "ex1_good_small_1KG.gds")) #' #' ## Path to the demo synthetic results files #' ## List of the KNN result files from PCA run on synthetic data @@ -2329,10 +1746,6 @@ computePoolSyntheticAncestry <- function(gdsReference, gdsSample, profileID, #' # The name of the synthetic study #' studyID <- "MYDATA.Synthetic" #' -#' ## The known ancestry for the 1KG reference profiles -#' dataDir1KG <- system.file("extdata/demoKNNSynthetic", package="RAIDS") -#' refKnownSuperPop <- readRDS(file.path(dataDir1KG, "knownSuperPop1KG.RDS")) -#' #' ## Path to the demo Profile GDS file is located in this package #' dataDir <- system.file("extdata/demoAncestryCall", package="RAIDS") #' @@ -2346,7 +1759,7 @@ computePoolSyntheticAncestry <- function(gdsReference, gdsSample, profileID, #' gdsProfile=gdsProfile, #' listFiles=listFiles, #' currentProfile=c("ex1"), -#' spRef=refKnownSuperPop, +#' spRef=demoKnownSuperPop1KG, #' studyIDSyn=studyID, np=1L) #' #' ## The ancestry called with the optimal D and K values @@ -2362,11 +1775,8 @@ computePoolSyntheticAncestry <- function(gdsReference, gdsSample, profileID, #' @encoding UTF-8 #' @export computeAncestryFromSyntheticFile <- function(gdsReference, gdsProfile, - listFiles, - currentProfile, - spRef, - studyIDSyn, - np=1L, + listFiles, currentProfile, spRef, + studyIDSyn, np=1L, listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), fieldPopIn1KG="superPop", fieldPopInfAnc="SuperPop", @@ -2375,8 +1785,8 @@ computeAncestryFromSyntheticFile <- function(gdsReference, gdsProfile, algorithm=c("exact", "randomized"), eigenCount=32L, missingRate=NaN, verbose=FALSE) { - - if(is.null(pcaList)) { + ## Set parameters when no values given by user + if (is.null(pcaList)) { pcaList <- seq(2, 15, 1) } @@ -2394,7 +1804,7 @@ computeAncestryFromSyntheticFile <- function(gdsReference, gdsProfile, verbose=verbose) ## Matches a character method against a table of candidate values - algorithm <- arg_match(algorithm, multiple=FALSE) + algorithm <- arg_match(algorithm) ## Merge results from PCA run on synthetic data present in RDS files KNN.list <- list() @@ -2444,12 +1854,12 @@ computeAncestryFromSyntheticFile <- function(gdsReference, gdsProfile, #' @title Run most steps leading to the ancestry inference call on a specific -#' profile +#' exome profile #' #' @description This function runs most steps leading to the ancestry inference -#' call on a specific profile. First, the function creates the Profile GDS file -#' for the specific profile using the information from a RDS Sample -#' description file and the 1KG reference GDS file. +#' call on a specific exome profile. First, the function creates the +#' Profile GDS file for the specific profile using the information from a +#' RDS Sample description file and the Population reference GDS file. #' #' @param pedStudy a \code{data.frame} with those mandatory columns: "Name.ID", #' "Case.ID", "Sample.Type", "Diagnosis", "Source". All columns must be in @@ -2471,16 +1881,20 @@ computeAncestryFromSyntheticFile <- function(gdsReference, gdsProfile, #' directory containing the VCF output of SNP-pileup for each sample. The #' SNP-pileup files must be compressed (gz files) and have the name identifiers #' of the samples. A sample with "Name.ID" identifier would have an -#' associated SNP-pileup file called "Name.ID.txt.gz". +#' associated file called +#' if genoSource is "VCF", then "Name.ID.vcf.gz", +#' if genoSource is "generic", then "Name.ID.generic.txt.gz" +#' if genoSource is "snp-pileup", then "Name.ID.txt.gz". #' #' @param pathOut a \code{character} string representing the path to #' the directory where the output files are created. #' #' @param fileReferenceGDS a \code{character} string representing the file -#' name of the 1KG GDS file. The file must exist. +#' name of the Reference GDS file. The file must exist. #' #' @param fileReferenceAnnotGDS a \code{character} string representing the -#' file name of the 1KG GDS annotation file. The file must exist. +#' file name of the Population Reference GDS Annotation file. The file must +#' exist. #' #' @param chrInfo a \code{vector} of positive \code{integer} values #' representing the length of the chromosomes. See 'details' section. @@ -2488,23 +1902,28 @@ computeAncestryFromSyntheticFile <- function(gdsReference, gdsProfile, #' @param syntheticRefDF a \code{data.frame} containing a subset of #' reference profiles for each sub-population present in the Reference GDS #' file. The \code{data.frame} must have those columns: -#' \itemize{ -#' \item{sample.id} { a \code{character} string representing the sample +#' \describe{ +#' \item{sample.id}{ a \code{character} string representing the sample #' identifier. } -#' \item{pop.group} { a \code{character} string representing the +#' \item{pop.group}{ a \code{character} string representing the #' subcontinental population assigned to the sample. } -#' \item{superPop} { a \code{character} string representing the +#' \item{superPop}{ a \code{character} string representing the #' super-population assigned to the sample. } #' } #' -#' @param genoSource a \code{stirng} with two possible values: -#' snp-pileup and generic. It specify if the genotype files -#' are generate by snp-pileup(Facets) or generic format csv -#' with the column at least the columns: -#' Chromosome,Position,Ref,Alt,Count,File1R,File1A -#' where Count is the deep at the position, -#' FileR is the deep of the reference allele, and -#' File1A is the deep of the specific alternative allele +#' @param genoSource a \code{character} string with two possible values: +#' 'snp-pileup', 'generic' or 'VCF'. It specifies if the genotype files +#' are generated by snp-pileup (Facets) or are a generic format CSV file +#' with at least those columns: +#' 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. +#' The 'Count' is the depth at the specified position; +#' 'FileR' is the depth of the reference allele and +#' 'File1A' is the depth of the specific alternative allele. +#' Finally the file can be a VCF file with at least those genotype +#' fields: GT, AD, DP. +#' +#' @param np a single positive \code{integer} specifying the number of +#' threads to be used. Default: \code{1L}. #' #' @param verbose a \code{logical} indicating if messages should be printed #' to show how the different steps in the function. Default: \code{FALSE}. @@ -2516,12 +1935,12 @@ computeAncestryFromSyntheticFile <- function(gdsReference, gdsProfile, #' #' The runExomeAncestry() function generates 3 types of files #' in the OUTPUT directory. -#' \itemize{ -#' \item{Ancestry Inference}{The ancestry inference CSV file +#' \describe{ +#' \item{Ancestry Inference}{ The ancestry inference CSV file #' (".Ancestry.csv" file)} -#' \item{Inference Informaton}{The inference information RDS file +#' \item{Inference Informaton}{ The inference information RDS file #' (".infoCall.rds" file)} -#' \item{Synthetic Information}{The parameter information RDS files +#' \item{Synthetic Information}{ The parameter information RDS files #' from the synthetic inference ("KNN.synt.*.rds" files in a sub-directory)} #' } #' @@ -2538,18 +1957,16 @@ computeAncestryFromSyntheticFile <- function(gdsReference, gdsProfile, #' @examples #' #' ## Required library for GDS -#' library(gdsfmt) +#' library(SNPRelate) #' #' ## Path to the demo 1KG GDS file is located in this package #' dataDir <- system.file("extdata", package="RAIDS") #' #' ################################################################# -#' ## The path and file name for the PED RDS file -#' ## will the information about the analyzed samples +#' ## Load the information about the profile #' ################################################################# -#' filePED <- file.path(dataDir, "example", "pedEx.rds") -#' ped <- readRDS(filePED) -#' head(ped) +#' data(demoPedigreeEx1) +#' head(demoPedigreeEx1) #' #' ################################################################# #' ## The 1KG GDS file and the 1KG SNV Annotation GDS file @@ -2557,10 +1974,10 @@ computeAncestryFromSyntheticFile <- function(gdsReference, gdsProfile, #' ## Note that the 1KG GDS file used for this example is a #' ## simplified version and CANNOT be used for any real analysis #' ################################################################# -#' path1KG <- file.path(dataDir, "example", "gdsRef") +#' path1KG <- file.path(dataDir, "tests") #' -#' fileReferenceGDS <- file.path(path1KG, "ex1kg.gds") -#' fileAnnotGDS <- file.path(path1KG, "exAnnot1kg.gds") +#' fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") +#' fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") #' #' ################################################################# #' ## The Sample SNP pileup files (one per sample) need @@ -2572,9 +1989,9 @@ computeAncestryFromSyntheticFile <- function(gdsReference, gdsProfile, #' ## The path where the Profile GDS Files (one per sample) #' ## will be created need to be specified. #' ################################################################# -#' pathProfileGDS <- file.path(dataDir, "example", "out.tmp") +#' pathProfileGDS <- file.path(tempdir(), "out.tmp") #' -#' pathOut <- file.path(dataDir, "example", "res.out") +#' pathOut <- file.path(tempdir(), "res.out") #' #' ################################################################# #' ## A data frame containing general information about the study @@ -2595,36 +2012,29 @@ computeAncestryFromSyntheticFile <- function(gdsReference, gdsProfile, #' dataRef <- select1KGPop(gds1KG, nbProfiles=2L) #' closefn.gds(gds1KG) #' -#' ## Chromosome length information -#' ## chr23 is chrX, chr24 is chrY and chrM is 25 -#' chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L, -#' 181538259L, 170805979L, 159345973L, 145138636L, 138394717L, 133797422L, -#' 135086622L, 133275309L, 114364328L, 107043718L, 101991189L, 90338345L, -#' 83257441L, 80373285L, 58617616L, 64444167L, 46709983L, 50818468L, -#' 156040895L, 57227415L, 16569L) -#' -#' ## A formal way to get the chormosome length information -#' ## library(BSgenome.Hsapiens.UCSC.hg38) -#' ## chrInfo <- integer(25L) -#' ## for(i in seq_len(22L)){ chrInfo[i] <- -#' ## length(Hsapiens[[paste0("chr", i)]])} -#' ## chrInfo[23] <- length(Hsapiens[["chrX"]]) -#' ## chrInfo[24] <- length(Hsapiens[["chrY"]]) -#' ## chrInfo[25] <- length(Hsapiens[["chrM"]]) -#' -#' \dontrun{ -#' runExomeAncestry(pedStudy=ped, studyDF=studyDF, -#' pathProfileGDS=pathProfileGDS, -#' pathGeno=pathGeno, -#' pathOut=pathOut, -#' fileReferenceGDS=fileReferenceGDS, -#' fileReferenceAnnotGDS=fileAnnotGDS, -#' chrInfo=chrInfo, -#' syntheticRefDF=dataRef, -#' genoSource="snp-pileup") -#' -#' unlink(pathProfileGDS, recursive=TRUE, force=TRUE) -#' unlink(pathOut, recursive=TRUE, force=TRUE) +#' ## Required library for this example to run correctly +#' if (requireNamespace("GenomeInfoDb", quietly=TRUE) && +#' requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { +#' +#' ## Chromosome length information +#' ## chr23 is chrX, chr24 is chrY and chrM is 25 +#' chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] +#' +#' \donttest{ +#' +#' runExomeAncestry(pedStudy=demoPedigreeEx1, studyDF=studyDF, +#' pathProfileGDS=pathProfileGDS, +#' pathGeno=pathGeno, +#' pathOut=pathOut, +#' fileReferenceGDS=fileReferenceGDS, +#' fileReferenceAnnotGDS=fileAnnotGDS, +#' chrInfo=chrInfo, +#' syntheticRefDF=dataRef, +#' genoSource="snp-pileup") +#' +#' unlink(pathProfileGDS, recursive=TRUE, force=TRUE) +#' unlink(pathOut, recursive=TRUE, force=TRUE) +#' } #' } #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz @@ -2635,10 +2045,11 @@ computeAncestryFromSyntheticFile <- function(gdsReference, gdsProfile, runExomeAncestry <- function(pedStudy, studyDF, pathProfileGDS, pathGeno, pathOut, fileReferenceGDS, fileReferenceAnnotGDS, chrInfo, syntheticRefDF, - genoSource=c("snp-pileup", "generic"), verbose=FALSE) { + genoSource=c("snp-pileup", "generic", "VCF"), np=1L, + verbose=FALSE) { ## Validate parameters - validateRunExomeAncestry(pedStudy=pedStudy, studyDF=studyDF, + validateRunExomeOrRNAAncestry(pedStudy=pedStudy, studyDF=studyDF, pathProfileGDS=pathProfileGDS, pathGeno=pathGeno, pathOut=pathOut, fileReferenceGDS=fileReferenceGDS, fileReferenceAnnotGDS=fileReferenceAnnotGDS, chrInfo=chrInfo, @@ -2646,110 +2057,1164 @@ runExomeAncestry <- function(pedStudy, studyDF, pathProfileGDS, genoSource <- arg_match(genoSource) - listProfiles <- pedStudy[, "Name.ID"] - - createStudy2GDS1KG(pathGeno=pathGeno, pedStudy=pedStudy, - fileNameGDS=fileReferenceGDS, listProfiles=listProfiles, - studyDF=studyDF, pathProfileGDS=pathProfileGDS, genoSource=genoSource, - verbose=verbose) - - ## Open the 1KG GDS file (demo version) - gds1KG <- snpgdsOpen(fileReferenceGDS) - ## Open the 1KG GDS file and 1KG SNV Annotation file - gdsAnnot1KG <- openfn.gds(fileReferenceAnnotGDS) - - listProfileRef <- syntheticRefDF$sample.id - studyDF.syn <- data.frame(study.id=paste0(studyDF$study.id, ".Synthetic"), - study.desc=paste0(studyDF$study.id, " synthetic data"), - study.platform=studyDF$study.platform, stringsAsFactors=FALSE) - - for(i in seq_len(length(listProfiles))) { - pruningSample(gdsReference=gds1KG, currentProfile=listProfiles[i], - studyID=studyDF$study.id, pathProfileGDS=pathProfileGDS) - file.GDSProfile <- file.path(pathProfileGDS, - paste0(listProfiles[i], ".gds")) - add1KG2SampleGDS(gdsReference=gds1KG, fileProfileGDS=file.GDSProfile, - currentProfile=listProfiles[i], - studyID=studyDF$study.id) - addStudy1Kg(gds1KG, file.GDSProfile) - - gdsProfile <- openfn.gds(file.GDSProfile, readonly=FALSE) - - estimateAllelicFraction(gdsReference=gds1KG, gdsProfile=gdsProfile, - currentProfile=listProfiles[i], studyID=studyDF$study.id, - chrInfo=chrInfo, verbose=verbose) - closefn.gds(gdsProfile) - - ## Add information related to the synthetic profiles in Profile GDS file - prepSynthetic(fileProfileGDS=file.GDSProfile, - listSampleRef=listProfileRef, profileID=listProfiles[i], - studyDF=studyDF.syn, prefix="1", verbose=verbose) - - resG <- syntheticGeno(gdsReference=gds1KG, gdsRefAnnot=gdsAnnot1KG, - fileProfileGDS=file.GDSProfile, profileID=listProfiles[i], - listSampleRef=listProfileRef, prefix="1") - - if(! file.exists(pathOut)) { - dir.create(pathOut) - } - spRef <- getRef1KGPop(gds1KG, "superPop") - sampleRM <- splitSelectByPop(syntheticRefDF) - - pathOutProfile <- file.path(pathOut, listProfiles[i]) - if(! file.exists(pathOutProfile)) { - dir.create(pathOutProfile) - } - - ## Open the Profile GDS file - gdsProfile <- snpgdsOpen(file.GDSProfile) - - ## This variable will contain the results from the PCA analyses - ## For each row of the sampleRM matrix - for(j in seq_len(nrow(sampleRM))) { - ## Run a PCA analysis using 1 synthetic profile from each - ## sub-continental ancestry - ## The synthetic profiles are projected on the 1KG PCA space - ## (the reference samples used to generate the synthetic profiles - ## are removed from this PCA) - ## The K-nearest neighbor analysis is done using - ## a range of K and D values - synthKNN <- computePoolSyntheticAncestryGr(gdsProfile=gdsProfile, - sampleRM=sampleRM[j,], studyIDSyn=studyDF.syn$study.id, - np=1L, spRef=spRef, eigenCount=15L, verbose=FALSE) - - ## Results are saved - saveRDS(synthKNN$matKNN, file.path(pathOutProfile, - paste0("KNN.synt.", listProfiles[i], ".", j, ".rds"))) - } - - ## Directory where the KNN results have been saved - pathKNN <- file.path(pathOut, listProfiles[i]) - listFilesName <- dir(file.path(pathKNN), ".rds") - ## List of the KNN result files from PCA on synthetic data - listFiles <- file.path(file.path(pathKNN) , listFilesName) - - resCall <- computeAncestryFromSyntheticFile(gdsReference=gds1KG, - gdsProfile=gdsProfile, listFiles=listFiles, - currentProfile=listProfiles[i], spRef=spRef, - studyIDSyn=studyDF.syn$study.id, np=1L) - - saveRDS(resCall, file.path(pathOut, - paste0(listProfiles[i], ".infoCall", ".rds"))) - - write.csv(x=resCall$Ancestry, file=file.path(pathOut, - paste0(listProfiles[i], ".Ancestry",".csv")), quote=FALSE, - row.names=FALSE) - - ## Close Profile GDS file (important) - closefn.gds(gdsProfile) - } - - ## Close all GDS files - closefn.gds(gds1KG) - closefn.gds(gdsAnnot1KG) + r <- runWrapperAncestry(pedStudy, studyDF, pathProfileGDS, + pathGeno, pathOut, fileReferenceGDS, fileReferenceAnnotGDS, + chrInfo, syntheticRefDF, genoSource, studyType="DNA", np=np, verbose) ## Successful - return(0L) + return(r) } +#' @title Run most steps leading to the ancestry inference call on a specific +#' DNA profile +#' +#' @description This function runs most steps leading to the ancestry inference +#' call on a specific RNA profile. First, the function creates the +#' Profile GDS file for the specific profile using the information from a +#' RDS Sample description file and the Population Reference GDS file. +#' +#' @param profileFile a \code{character} string representing the path and the +#' file name of the genotype file or the bam if genoSource is snp-pileup the +#' fine extension must be .txt.gz, if VCF the extension must be .vcf.gz +#' +#' @param pathProfileGDS a \code{character} string representing the path to +#' the directory where the GDS Profile files will be created. +#' Default: \code{NULL}. +#' +#' @param fileReferenceGDS a \code{character} string representing the file +#' name of the Population Reference GDS file. The file must exist. +#' +#' @param fileReferenceAnnotGDS a \code{character} string representing the +#' file name of the Population Reference GDS Annotation file. The file +#' must exist. +#' +#' @param chrInfo a \code{vector} of positive \code{integer} values +#' representing the length of the chromosomes. See 'details' section. +#' +#' @param syntheticRefDF a \code{data.frame} containing a subset of +#' reference profiles for each sub-population present in the Reference GDS +#' file. The \code{data.frame} must have those columns: +#' \describe{ +#' \item{sample.id}{ a \code{character} string representing the sample +#' identifier. } +#' \item{pop.group}{ a \code{character} string representing the +#' subcontinental population assigned to the sample. } +#' \item{superPop}{ a \code{character} string representing the +#' super-population assigned to the sample. } +#' } +#' +#' @param genoSource a \code{character} string with four possible values: +#' 'snp-pileup', 'generic', 'VCF' or 'bam'. It specifies if the genotype files +#' are generated by snp-pileup (Facets) or are a generic format CSV file +#' with at least those columns: +#' 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. +#' The 'Count' is the depth at the specified position; +#' 'FileR' is the depth of the reference allele and +#' 'File1A' is the depth of the specific alternative allele. +#' Finally the file can be a VCF file with at least those genotype +#' fields: GT, AD, DP. +#' +#' @param np a single positive \code{integer} specifying the number of +#' threads to be used. Default: \code{1L}. +#' +#' @param verbose a \code{logical} indicating if messages should be printed +#' to show how the different steps in the function. Default: \code{FALSE}. +#' +#' @return a \code{list} containing 4 entries: +#' \describe{ +#' \item{\code{pcaSample}}{ a \code{list} containing the information related +#' to the eigenvectors. The \code{list} contains those 3 entries: +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current profile.} +#' \item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric} containing +#' the eigenvectors for the reference profiles.} +#' \item{\code{eigenvector}}{ a \code{matrix} of \code{numeric} containing the +#' eigenvectors for the current profile projected on the PCA from the +#' reference profiles.} +#' } +#' } +#' \item{\code{paraSample}}{ a \code{list} containing the results with +#' different \code{D} and \code{K} values that lead to optimal parameter +#' selection. The \code{list} contains those entries: +#' \describe{ +#' \item{\code{dfPCA}}{ a \code{data.frame} containing statistical results +#' on all combined synthetic results done with a fixed value of \code{D} (the +#' number of dimensions). The \code{data.frame} contains those columns: +#' \describe{ +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions).} +#' \item{\code{median}}{ a \code{numeric} representing the median of the +#' minimum AUROC obtained (within super populations) for all combination of +#' the fixed \code{D} value and all tested \code{K} values. } +#' \item{\code{mad}}{ a \code{numeric} representing the MAD of the minimum +#' AUROC obtained (within super populations) for all combination of the fixed +#' \code{D} value and all tested \code{K} values. } +#' \item{\code{upQuartile}}{ a \code{numeric} representing the upper quartile +#' of the minimum AUROC obtained (within super populations) for all +#' combination of the fixed \code{D} value and all tested \code{K} values. } +#' \item{\code{k}}{ a \code{numeric} representing the optimal \code{K} value +#' (the number of neighbors) for a fixed \code{D} value. } +#' } +#' } +#' \item{\code{dfPop}}{ a \code{data.frame} containing statistical results on +#' all combined synthetic results done with different values of \code{D} (the +#' number of dimensions) and \code{K} (the number of neighbors). +#' The \code{data.frame} contains those columns: +#' \describe{ +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions).} +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors).} +#' \item{\code{AUROC.min}}{ a \code{numeric} representing the minimum accuracy +#' obtained by grouping all the synthetic results by super-populations, for +#' the specified values of \code{D} and \code{K}.} +#' \item{\code{AUROC}}{ a \code{numeric} representing the accuracy obtained +#' by grouping all the synthetic results for the specified values of \code{D} +#' and \code{K}.} +#' \item{\code{Accu.CM}}{ a \code{numeric} representing the value of accuracy +#' of the confusion matrix obtained by grouping all the synthetic results for +#' the specified values of \code{D} and \code{K}.} +#' } +#' } +#' \item{\code{dfAUROC}}{ a \code{data.frame} the summary of the results by +#' super-population. The \code{data.frame} contains +#' those columns: +#' \describe{ +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions).} +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors).} +#' \item{\code{Call}}{ a \code{character} string representing the +#' super-population.} +#' \item{\code{L}}{ a \code{numeric} representing the lower value of the 95% +#' confidence interval for the AUROC obtained for the fixed values of +#' super-population, \code{D} and \code{K}.} +#' \item{\code{AUROC}}{ a \code{numeric} representing the AUROC obtained for the +#' fixed values of super-population, \code{D} and \code{K}.} +#' \item{\code{H}}{ a \code{numeric} representing the higher value of the 95% +#' confidence interval for the AUROC obtained for the fixed values of +#' super-population, \code{D} and \code{K}.} +#' } +#' } +#' \item{\code{D}}{ a \code{numeric} representing the optimal \code{D} value +#' (the number of dimensions) for the specific profile.} +#' \item{\code{K}}{ a \code{numeric} representing the optimal \code{K} value +#' (the number of neighbors) for the specific profile.} +#' \item{\code{listD}}{ a \code{numeric} representing the optimal \code{D} +#' values (the number of dimensions) for the specific profile. More than one +#' \code{D} is possible.} +#' } +#' } +#' \item{\code{KNNSample}}{ a \code{data.frame} containing the inferred ancestry +#' for different values of \code{K} and \code{D}. The \code{data.frame} +#' contains those columns: +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current profile.} +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions) used to infer the ancestry. } +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors) used to infer the ancestry. } +#' \item{\code{SuperPop}}{ a \code{character} string representing the inferred +#' ancestry for the specified \code{D} and \code{K} values.} +#' } +#' } +#' \item{\code{KNNSynthetic}}{ a \code{data.frame} containing the inferred ancestry +#' for each synthetic data for different values of \code{K} and \code{D}. +#' The \code{data.frame} +#' contains those columns: "sample.id", "D", "K", "infer.superPop", "ref.superPop" +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current synthetic data.} +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions) used to infer the ancestry. } +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors) used to infer the ancestry. } +#' \item{\code{infer.superPop}}{ a \code{character} string representing the inferred +#' ancestry for the specified \code{D} and \code{K} values.} +#' \item{\code{ref.superPop}}{ a \code{character} string representing the known +#' ancestry from the reference} +#' } +#' } +#' \item{\code{Ancestry}}{ a \code{data.frame} containing the inferred +#' ancestry for the current profile. The \code{data.frame} contains those +#' columns: +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current profile.} +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions) used to infer the ancestry.} +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors) used to infer the ancestry.} +#' \item{\code{SuperPop}}{ a \code{character} string representing the inferred +#' ancestry.} +#' } +#' } +#' } +#' +#' @references +#' +#' Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, +#' Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution +#' of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. +#' doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. +#' +#' @examples +#' +#' ## Required library for GDS +#' library(SNPRelate) +#' +#' ## Path to the demo 1KG GDS file is located in this package +#' dataDir <- system.file("extdata", package="RAIDS") +#' +#' ################################################################# +#' ## The 1KG GDS file and the 1KG SNV Annotation GDS file +#' ## need to be located in the same directory +#' ## Note that the 1KG GDS file used for this example is a +#' ## simplified version and CANNOT be used for any real analysis +#' ################################################################# +#' path1KG <- file.path(dataDir, "tests") +#' +#' fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") +#' fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") +#' +#' ################################################################# +#' ## The Sample SNP pileup files (one per sample) need +#' ## to be located in the same directory. +#' ################################################################# +#' demoProfileEx1 <- file.path(dataDir, "example", "snpPileup", "ex1.txt.gz") +#' +#' ################################################################# +#' ## The path where the Profile GDS Files (one per sample) +#' ## will be created need to be specified. +#' ################################################################# +#' pathProfileGDS <- file.path(tempdir(), "out.tmp") +#' +#' #################################################################### +#' ## Fix seed to ensure reproducible results +#' #################################################################### +#' set.seed(3043) +#' +#' gds1KG <- snpgdsOpen(fileReferenceGDS) +#' dataRef <- select1KGPop(gds1KG, nbProfiles=2L) +#' closefn.gds(gds1KG) +#' +#' ## Required library for this example to run correctly +#' if (requireNamespace("GenomeInfoDb", quietly=TRUE) && +#' requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { +#' +#' ## Chromosome length information +#' ## chr23 is chrX, chr24 is chrY and chrM is 25 +#' chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] +#' +#' \donttest{ +#' +#' res <- inferAncestry(profileFile=demoProfileEx1, +#' pathProfileGDS=pathProfileGDS, +#' fileReferenceGDS=fileReferenceGDS, +#' fileReferenceAnnotGDS=fileAnnotGDS, +#' chrInfo=chrInfo, +#' syntheticRefDF=dataRef, +#' genoSource="snp-pileup") +#' +#' unlink(pathProfileGDS, recursive=TRUE, force=TRUE) +#' +#' } +#' } +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' @importFrom utils write.csv +#' @importFrom rlang arg_match +#' @encoding UTF-8 +#' @export +inferAncestry <- function(profileFile, pathProfileGDS, + fileReferenceGDS, fileReferenceAnnotGDS, + chrInfo, syntheticRefDF, + genoSource=c("snp-pileup", "generic", "VCF", "bam"), + np=1L, verbose=FALSE) { + + profileBaseName <- basename(profileFile) + pathGeno <- dirname(profileFile) + + genoSource <- arg_match(genoSource) + + ## BAM format is not yet implemented + # if (genoSource == "bam") { + # stop("The bam is not release yet look to get a \'Devel\' version ", + # "or contact us") + # } + + ## Extract the name of the profile(s) + profileName <- gsub("\\.gz$", "", profileBaseName, ignore.case=TRUE) + for (extCur in c( "\\.vcf$", "\\.txt$", "\\.bam", "\\.tsv", "\\.csv")) { + profileName <- gsub(extCur, "", profileName, ignore.case = TRUE) + } + + ## Create required data frames + studyDF <- data.frame(study.id="NotDef", study.desc="NotDef", + study.platform="NotDef", stringsAsFactors=FALSE) + pedStudy <- data.frame(Name.ID=c(profileName), Case.ID=c(profileName), + Sample.Type=c("DNA"), Diagnosis="NotDef", + Source=c("ENotDef"), stringsAsFactors=FALSE) + row.names(pedStudy) <- pedStudy$Name.ID + + ## Validate parameters + validateRunExomeOrRNAAncestry(pedStudy=pedStudy, studyDF=studyDF, + pathProfileGDS=pathProfileGDS, pathGeno=pathGeno, pathOut="./", + fileReferenceGDS=fileReferenceGDS, + fileReferenceAnnotGDS=fileReferenceAnnotGDS, chrInfo=chrInfo, + syntheticRefDF=syntheticRefDF, genoSource=genoSource, verbose=verbose) + + ## Run ancestry inference + if (genoSource %in% c("snp-pileup", "generic", "VCF", "bam")) { + r <- wrapperAncestry(pedStudy, studyDF, pathProfileGDS, + profileFile, fileReferenceGDS, fileReferenceAnnotGDS, + chrInfo, syntheticRefDF, genoSource=genoSource, studyType="LD", np=np, + verbose=verbose) + }else{ + stop(paste0("The format ", genoSource," is not implemented yet\n")) + } + + ## Successful + return(r) +} + +#' @title Run most steps leading to the ancestry inference call on a specific +#' DNA profile (alias for inferAncestry ) +#' +#' @description This function runs most steps leading to the ancestry inference +#' call on a specific RNA profile. First, the function creates the +#' Profile GDS file for the specific profile using the information from a +#' RDS Sample description file and the Population Reference GDS file. +#' +#' @param profileFile a \code{character} string representing the path and the +#' file name of the genotype file or the bam if genoSource is snp-pileup the +#' fine extension must be .txt.gz, if VCF the extension must be .vcf.gz +#' +#' @param pathProfileGDS a \code{character} string representing the path to +#' the directory where the GDS Profile files will be created. +#' Default: \code{NULL}. +#' +#' @param fileReferenceGDS a \code{character} string representing the file +#' name of the Population Reference GDS file. The file must exist. +#' +#' @param fileReferenceAnnotGDS a \code{character} string representing the +#' file name of the Population Reference GDS Annotation file. The file +#' must exist. +#' +#' @param chrInfo a \code{vector} of positive \code{integer} values +#' representing the length of the chromosomes. See 'details' section. +#' +#' @param syntheticRefDF a \code{data.frame} containing a subset of +#' reference profiles for each sub-population present in the Reference GDS +#' file. The \code{data.frame} must have those columns: +#' \describe{ +#' \item{sample.id}{ a \code{character} string representing the sample +#' identifier. } +#' \item{pop.group}{ a \code{character} string representing the +#' subcontinental population assigned to the sample. } +#' \item{superPop}{ a \code{character} string representing the +#' super-population assigned to the sample. } +#' } +#' +#' @param genoSource a \code{character} string with four possible values: +#' 'snp-pileup', 'generic', 'VCF' or 'bam'. It specifies if the genotype files +#' are generated by snp-pileup (Facets) or are a generic format CSV file +#' with at least those columns: +#' 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. +#' The 'Count' is the depth at the specified position; +#' 'FileR' is the depth of the reference allele and +#' 'File1A' is the depth of the specific alternative allele. +#' Finally the file can be a VCF file with at least those genotype +#' fields: GT, AD, DP. +#' +#' @param np a single positive \code{integer} specifying the number of +#' threads to be used. Default: \code{1L}. +#' +#' @param verbose a \code{logical} indicating if messages should be printed +#' to show how the different steps in the function. Default: \code{FALSE}. +#' +#' @return a \code{list} containing 4 entries: +#' \describe{ +#' \item{\code{pcaSample}}{ a \code{list} containing the information related +#' to the eigenvectors. The \code{list} contains those 3 entries: +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current profile.} +#' \item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric} containing +#' the eigenvectors for the reference profiles.} +#' \item{\code{eigenvector}}{ a \code{matrix} of \code{numeric} containing the +#' eigenvectors for the current profile projected on the PCA from the +#' reference profiles.} +#' } +#' } +#' \item{\code{paraSample}}{ a \code{list} containing the results with +#' different \code{D} and \code{K} values that lead to optimal parameter +#' selection. The \code{list} contains those entries: +#' \describe{ +#' \item{\code{dfPCA}}{ a \code{data.frame} containing statistical results +#' on all combined synthetic results done with a fixed value of \code{D} (the +#' number of dimensions). The \code{data.frame} contains those columns: +#' \describe{ +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions).} +#' \item{\code{median}}{ a \code{numeric} representing the median of the +#' minimum AUROC obtained (within super populations) for all combination of +#' the fixed \code{D} value and all tested \code{K} values. } +#' \item{\code{mad}}{ a \code{numeric} representing the MAD of the minimum +#' AUROC obtained (within super populations) for all combination of the fixed +#' \code{D} value and all tested \code{K} values. } +#' \item{\code{upQuartile}}{ a \code{numeric} representing the upper quartile +#' of the minimum AUROC obtained (within super populations) for all +#' combination of the fixed \code{D} value and all tested \code{K} values. } +#' \item{\code{k}}{ a \code{numeric} representing the optimal \code{K} value +#' (the number of neighbors) for a fixed \code{D} value. } +#' } +#' } +#' \item{\code{dfPop}}{ a \code{data.frame} containing statistical results on +#' all combined synthetic results done with different values of \code{D} (the +#' number of dimensions) and \code{K} (the number of neighbors). +#' The \code{data.frame} contains those columns: +#' \describe{ +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions).} +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors).} +#' \item{\code{AUROC.min}}{ a \code{numeric} representing the minimum accuracy +#' obtained by grouping all the synthetic results by super-populations, for +#' the specified values of \code{D} and \code{K}.} +#' \item{\code{AUROC}}{ a \code{numeric} representing the accuracy obtained +#' by grouping all the synthetic results for the specified values of \code{D} +#' and \code{K}.} +#' \item{\code{Accu.CM}}{ a \code{numeric} representing the value of accuracy +#' of the confusion matrix obtained by grouping all the synthetic results for +#' the specified values of \code{D} and \code{K}.} +#' } +#' } +#' \item{\code{dfAUROC}}{ a \code{data.frame} the summary of the results by +#' super-population. The \code{data.frame} contains +#' those columns: +#' \describe{ +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions).} +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors).} +#' \item{\code{Call}}{ a \code{character} string representing the +#' super-population.} +#' \item{\code{L}}{ a \code{numeric} representing the lower value of the 95% +#' confidence interval for the AUROC obtained for the fixed values of +#' super-population, \code{D} and \code{K}.} +#' \item{\code{AUROC}}{ a \code{numeric} representing the AUROC obtained for the +#' fixed values of super-population, \code{D} and \code{K}.} +#' \item{\code{H}}{ a \code{numeric} representing the higher value of the 95% +#' confidence interval for the AUROC obtained for the fixed values of +#' super-population, \code{D} and \code{K}.} +#' } +#' } +#' \item{\code{D}}{ a \code{numeric} representing the optimal \code{D} value +#' (the number of dimensions) for the specific profile.} +#' \item{\code{K}}{ a \code{numeric} representing the optimal \code{K} value +#' (the number of neighbors) for the specific profile.} +#' \item{\code{listD}}{ a \code{numeric} representing the optimal \code{D} +#' values (the number of dimensions) for the specific profile. More than one +#' \code{D} is possible.} +#' } +#' } +#' \item{\code{KNNSample}}{ a \code{data.frame} containing the inferred ancestry +#' for different values of \code{K} and \code{D}. The \code{data.frame} +#' contains those columns: +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current profile.} +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions) used to infer the ancestry. } +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors) used to infer the ancestry. } +#' \item{\code{SuperPop}}{ a \code{character} string representing the inferred +#' ancestry for the specified \code{D} and \code{K} values.} +#' } +#' } +#' \item{\code{KNNSynthetic}}{ a \code{data.frame} containing the inferred ancestry +#' for each synthetic data for different values of \code{K} and \code{D}. +#' The \code{data.frame} +#' contains those columns: "sample.id", "D", "K", "infer.superPop", "ref.superPop" +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current synthetic data.} +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions) used to infer the ancestry. } +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors) used to infer the ancestry. } +#' \item{\code{infer.superPop}}{ a \code{character} string representing the inferred +#' ancestry for the specified \code{D} and \code{K} values.} +#' \item{\code{ref.superPop}}{ a \code{character} string representing the known +#' ancestry from the reference} +#' } +#' } +#' \item{\code{Ancestry}}{ a \code{data.frame} containing the inferred +#' ancestry for the current profile. The \code{data.frame} contains those +#' columns: +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current profile.} +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions) used to infer the ancestry.} +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors) used to infer the ancestry.} +#' \item{\code{SuperPop}}{ a \code{character} string representing the inferred +#' ancestry.} +#' } +#' } +#' } +#' +#' @references +#' +#' Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, +#' Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution +#' of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. +#' doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. +#' +#' @examples +#' +#' ## Required library for GDS +#' library(SNPRelate) +#' +#' ## Path to the demo 1KG GDS file is located in this package +#' dataDir <- system.file("extdata", package="RAIDS") +#' +#' ################################################################# +#' ## The 1KG GDS file and the 1KG SNV Annotation GDS file +#' ## need to be located in the same directory +#' ## Note that the 1KG GDS file used for this example is a +#' ## simplified version and CANNOT be used for any real analysis +#' ################################################################# +#' path1KG <- file.path(dataDir, "tests") +#' +#' fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") +#' fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") +#' +#' ################################################################# +#' ## The Sample SNP pileup files (one per sample) need +#' ## to be located in the same directory. +#' ################################################################# +#' demoProfileEx1 <- file.path(dataDir, "example", "snpPileup", "ex1.txt.gz") +#' +#' ################################################################# +#' ## The path where the Profile GDS Files (one per sample) +#' ## will be created need to be specified. +#' ################################################################# +#' pathProfileGDS <- file.path(tempdir(), "out.tmp") +#' +#' #################################################################### +#' ## Fix seed to ensure reproducible results +#' #################################################################### +#' set.seed(3043) +#' +#' gds1KG <- snpgdsOpen(fileReferenceGDS) +#' dataRef <- select1KGPop(gds1KG, nbProfiles=2L) +#' closefn.gds(gds1KG) +#' +#' ## Required library for this example to run correctly +#' if (requireNamespace("GenomeInfoDb", quietly=TRUE) && +#' requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { +#' +#' ## Chromosome length information +#' ## chr23 is chrX, chr24 is chrY and chrM is 25 +#' chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] +#' +#' \donttest{ +#' +#' res <- inferAncestryDNA(profileFile=demoProfileEx1, +#' pathProfileGDS=pathProfileGDS, +#' fileReferenceGDS=fileReferenceGDS, +#' fileReferenceAnnotGDS=fileAnnotGDS, +#' chrInfo=chrInfo, +#' syntheticRefDF=dataRef, +#' genoSource="snp-pileup") +#' +#' unlink(pathProfileGDS, recursive=TRUE, force=TRUE) +#' +#' } +#' } +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' @importFrom utils write.csv +#' @importFrom rlang arg_match +#' @encoding UTF-8 +#' @export +inferAncestryDNA <- function(profileFile, pathProfileGDS, + fileReferenceGDS, fileReferenceAnnotGDS, + chrInfo, syntheticRefDF, + genoSource=c("snp-pileup", "generic", "VCF", "bam"), + np=1L, verbose=FALSE) { + + return(inferAncestry(profileFile=profileFile, pathProfileGDS=pathProfileGDS, + fileReferenceGDS=fileReferenceGDS, + fileReferenceAnnotGDS=fileReferenceAnnotGDS, + chrInfo=chrInfo, + syntheticRefDF=syntheticRefDF, + genoSource=genoSource, + np=np, verbose=verbose)) +} + + +#' @title Run most steps leading to the ancestry inference call on a specific +#' RNA profile +#' +#' @description This function runs most steps leading to the ancestry inference +#' call on a specific RNA profile. First, the function creates the +#' Profile GDS file for the specific profile using the information from a +#' RDS Sample description file and the Population Reference GDS file. +#' +#' @param pedStudy a \code{data.frame} with those mandatory columns: "Name.ID", +#' "Case.ID", "Sample.Type", "Diagnosis", "Source". All columns must be in +#' \code{character} strings (no factor). The \code{data.frame} +#' must contain the information for all the samples passed in the +#' \code{listSamples} parameter. Only \code{filePedRDS} or \code{pedStudy} +#' can be defined. +#' +#' @param studyDF a \code{data.frame} containing the information about the +#' study associated to the analysed sample(s). The \code{data.frame} must have +#' those 3 columns: "study.id", "study.desc", "study.platform". All columns +#' must be in \code{character} strings (no factor). +#' +#' @param pathProfileGDS a \code{character} string representing the path to +#' the directory where the GDS Profile files will be created. +#' Default: \code{NULL}. +#' +#' @param pathGeno a \code{character} string representing the path to the +#' directory containing the VCF output of SNP-pileup for each sample. The +#' SNP-pileup files must be compressed (gz files) and have the name identifiers +#' of the samples. A sample with "Name.ID" identifier would have an +#' associated file called +#' if genoSource is "VCF", then "Name.ID.vcf.gz", +#' if genoSource is "generic", then "Name.ID.generic.txt.gz" +#' if genoSource is "snp-pileup", then "Name.ID.txt.gz". +#' +#' @param pathOut a \code{character} string representing the path to +#' the directory where the output files are created. +#' +#' @param fileReferenceGDS a \code{character} string representing the file +#' name of the Population Reference GDS file. The file must exist. +#' +#' @param fileReferenceAnnotGDS a \code{character} string representing the +#' file name of the Population Reference GDS Annotation file. The file +#' must exist. +#' +#' @param chrInfo a \code{vector} of positive \code{integer} values +#' representing the length of the chromosomes. See 'details' section. +#' +#' @param syntheticRefDF a \code{data.frame} containing a subset of +#' reference profiles for each sub-population present in the Reference GDS +#' file. The \code{data.frame} must have those columns: +#' \describe{ +#' \item{sample.id}{ a \code{character} string representing the sample +#' identifier. } +#' \item{pop.group}{ a \code{character} string representing the +#' subcontinental population assigned to the sample. } +#' \item{superPop}{ a \code{character} string representing the +#' super-population assigned to the sample. } +#' } +#' +#' @param genoSource a \code{character} string with two possible values: +#' 'snp-pileup', 'generic' or 'VCF'. It specifies if the genotype files +#' are generated by snp-pileup (Facets) or are a generic format CSV file +#' with at least those columns: +#' 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. +#' The 'Count' is the depth at the specified position; +#' 'FileR' is the depth of the reference allele and +#' 'File1A' is the depth of the specific alternative allele. +#' Finally the file can be a VCF file with at least those genotype +#' fields: GT, AD, DP. +#' +#' @param np a single positive \code{integer} specifying the number of +#' threads to be used. Default: \code{1L}. +#' +#' @param blockTypeID a \code{character} string corresponding to the block +#' type used to extract the block identifiers. The block type must be +#' present in the GDS Reference Annotation file. +#' +#' @param verbose a \code{logical} indicating if messages should be printed +#' to show how the different steps in the function. Default: \code{FALSE}. +#' +#' @return The integer \code{0L} when successful. See details section for +#' more information about the generated output files. +#' +#' @details +#' +#' The runExomeAncestry() function generates 3 types of files +#' in the OUTPUT directory. +#' \describe{ +#' \item{Ancestry Inference}{ The ancestry inference CSV file +#' (".Ancestry.csv" file)} +#' \item{Inference Informaton}{ The inference information RDS file +#' (".infoCall.rds" file)} +#' \item{Synthetic Information}{ The parameter information RDS files +#' from the synthetic inference ("KNN.synt.*.rds" files in a sub-directory)} +#' } +#' +#' In addition, a sub-directory (named using the profile ID) is +#' also created. +#' +#' @references +#' +#' Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, +#' Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution +#' of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. +#' doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. +#' +#' @examples +#' +#' ## Required library for GDS +#' library(SNPRelate) +#' +#' ## Path to the demo 1KG GDS file is located in this package +#' dataDir <- system.file("extdata", package="RAIDS") +#' +#' ################################################################# +#' ## Load the information about the profile +#' ################################################################# +#' data(demoPedigreeEx1) +#' head(demoPedigreeEx1) +#' +#' ################################################################# +#' ## The 1KG GDS file and the 1KG SNV Annotation GDS file +#' ## need to be located in the same directory +#' ## Note that the 1KG GDS file used for this example is a +#' ## simplified version and CANNOT be used for any real analysis +#' ################################################################# +#' path1KG <- file.path(dataDir, "tests") +#' +#' fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") +#' fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") +#' +#' ################################################################# +#' ## The Sample SNP pileup files (one per sample) need +#' ## to be located in the same directory. +#' ################################################################# +#' pathGeno <- file.path(dataDir, "example", "snpPileup") +#' +#' ################################################################# +#' ## The path where the Profile GDS Files (one per sample) +#' ## will be created need to be specified. +#' ################################################################# +#' pathProfileGDS <- file.path(tempdir(), "out.tmp") +#' +#' pathOut <- file.path(tempdir(), "res.out") +#' +#' ################################################################# +#' ## A data frame containing general information about the study +#' ## is also required. The data frame must have +#' ## those 3 columns: "studyID", "study.desc", "study.platform" +#' ################################################################# +#' studyDF <- data.frame(study.id="MYDATA", +#' study.desc="Description", +#' study.platform="PLATFORM", +#' stringsAsFactors=FALSE) +#' +#' #################################################################### +#' ## Fix seed to ensure reproducible results +#' #################################################################### +#' set.seed(3043) +#' +#' gds1KG <- snpgdsOpen(fileReferenceGDS) +#' dataRef <- select1KGPop(gds1KG, nbProfiles=2L) +#' closefn.gds(gds1KG) +#' +#' ## Required library for this example to run correctly +#' if (requireNamespace("GenomeInfoDb", quietly=TRUE) && +#' requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { +#' +#' ## Chromosome length information +#' ## chr23 is chrX, chr24 is chrY and chrM is 25 +#' chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] +#' +#' \donttest{ +#' +#' runRNAAncestry(pedStudy=demoPedigreeEx1, studyDF=studyDF, +#' pathProfileGDS=pathProfileGDS, +#' pathGeno=pathGeno, +#' pathOut=pathOut, +#' fileReferenceGDS=fileReferenceGDS, +#' fileReferenceAnnotGDS=fileAnnotGDS, +#' chrInfo=chrInfo, +#' syntheticRefDF=dataRef, +#' blockTypeID="GeneS.Ensembl.Hsapiens.v86", +#' genoSource="snp-pileup") +#' +#' unlink(pathProfileGDS, recursive=TRUE, force=TRUE) +#' unlink(pathOut, recursive=TRUE, force=TRUE) +#' +#' } +#' } +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' @importFrom utils write.csv +#' @importFrom rlang arg_match +#' @encoding UTF-8 +#' @export +runRNAAncestry <- function(pedStudy, studyDF, pathProfileGDS, + pathGeno, pathOut, fileReferenceGDS, fileReferenceAnnotGDS, + chrInfo, syntheticRefDF, + genoSource=c("snp-pileup", "generic", "VCF"), np=1L, + blockTypeID, verbose=FALSE) { + + ## Validate parameters + validateRunExomeOrRNAAncestry(pedStudy=pedStudy, studyDF=studyDF, + pathProfileGDS=pathProfileGDS, pathGeno=pathGeno, pathOut=pathOut, + fileReferenceGDS=fileReferenceGDS, + fileReferenceAnnotGDS=fileReferenceAnnotGDS, chrInfo=chrInfo, + syntheticRefDF=syntheticRefDF, genoSource=genoSource, verbose=verbose) + + genoSource <- arg_match(genoSource) + + r <- runWrapperAncestry(pedStudy, studyDF, pathProfileGDS, + pathGeno, pathOut, fileReferenceGDS, fileReferenceAnnotGDS, + chrInfo, syntheticRefDF, genoSource, studyType="RNA", np=np, + blockTypeID=blockTypeID, verbose) + + ## Successful + return(r) +} + +#' @title Run most steps leading to the ancestry inference call on a specific +#' RNA profile +#' +#' @description This function runs most steps leading to the ancestry inference +#' call on a specific RNA profile. First, the function creates the +#' Profile GDS file for the specific profile using the information from a +#' RDS Sample description file and the Population Reference GDS file. +#' +#' @param profileFile a \code{character} string representing the path and the +#' file name of the genotype file or the bam if genoSource is snp-pileup the +#' fine extension must be .txt.gz, if VCF the extension must be .vcf.gz +#' +#' @param pathProfileGDS a \code{character} string representing the path to +#' the directory where the GDS Profile files will be created. +#' Default: \code{NULL}. +#' +#' @param fileReferenceGDS a \code{character} string representing the file +#' name of the Population Reference GDS file. The file must exist. +#' +#' @param fileReferenceAnnotGDS a \code{character} string representing the +#' file name of the Population Reference GDS Annotation file. The file +#' must exist. +#' +#' @param chrInfo a \code{vector} of positive \code{integer} values +#' representing the length of the chromosomes. See 'details' section. +#' +#' @param syntheticRefDF a \code{data.frame} containing a subset of +#' reference profiles for each sub-population present in the Reference GDS +#' file. The \code{data.frame} must have those columns: +#' \describe{ +#' \item{sample.id}{ a \code{character} string representing the sample +#' identifier. } +#' \item{pop.group}{ a \code{character} string representing the +#' subcontinental population assigned to the sample. } +#' \item{superPop}{ a \code{character} string representing the +#' super-population assigned to the sample. } +#' } +#' +#' @param genoSource a \code{character} string with four possible values: +#' 'snp-pileup', 'generic', 'VCF' or 'bam'. It specifies if the genotype files +#' are generated by snp-pileup (Facets) or are a generic format CSV file +#' with at least those columns: +#' 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. +#' The 'Count' is the depth at the specified position; +#' 'FileR' is the depth of the reference allele and +#' 'File1A' is the depth of the specific alternative allele. +#' Finally the file can be a VCF file with at least those genotype +#' fields: GT, AD, DP. +#' +#' @param np a single positive \code{integer} specifying the number of +#' threads to be used. Default: \code{1L}. +#' +#' @param blockTypeID a \code{character} string corresponding to the block +#' type used to extract the block identifiers. The block type must be +#' present in the GDS Reference Annotation file. +#' +#' @param verbose a \code{logical} indicating if messages should be printed +#' to show how the different steps in the function. Default: \code{FALSE}. +#' +#' @return a \code{list} containing 4 entries: +#' \describe{ +#' \item{\code{pcaSample}}{ a \code{list} containing the information related +#' to the eigenvectors. The \code{list} contains those 3 entries: +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current profile.} +#' \item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric} containing +#' the eigenvectors for the reference profiles.} +#' \item{\code{eigenvector}}{ a \code{matrix} of \code{numeric} containing the +#' eigenvectors for the current profile projected on the PCA from the +#' reference profiles.} +#' } +#' } +#' \item{\code{paraSample}}{ a \code{list} containing the results with +#' different \code{D} and \code{K} values that lead to optimal parameter +#' selection. The \code{list} contains those entries: +#' \describe{ +#' \item{\code{dfPCA}}{ a \code{data.frame} containing statistical results +#' on all combined synthetic results done with a fixed value of \code{D} (the +#' number of dimensions). The \code{data.frame} contains those columns: +#' \describe{ +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions).} +#' \item{\code{median}}{ a \code{numeric} representing the median of the +#' minimum AUROC obtained (within super populations) for all combination of +#' the fixed \code{D} value and all tested \code{K} values. } +#' \item{\code{mad}}{ a \code{numeric} representing the MAD of the minimum +#' AUROC obtained (within super populations) for all combination of the fixed +#' \code{D} value and all tested \code{K} values. } +#' \item{\code{upQuartile}}{ a \code{numeric} representing the upper quartile +#' of the minimum AUROC obtained (within super populations) for all +#' combination of the fixed \code{D} value and all tested \code{K} values. } +#' \item{\code{k}}{ a \code{numeric} representing the optimal \code{K} value +#' (the number of neighbors) for a fixed \code{D} value. } +#' } +#' } +#' \item{\code{dfPop}}{ a \code{data.frame} containing statistical results on +#' all combined synthetic results done with different values of \code{D} (the +#' number of dimensions) and \code{K} (the number of neighbors). +#' The \code{data.frame} contains those columns: +#' \describe{ +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions).} +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors).} +#' \item{\code{AUROC.min}}{ a \code{numeric} representing the minimum accuracy +#' obtained by grouping all the synthetic results by super-populations, for +#' the specified values of \code{D} and \code{K}.} +#' \item{\code{AUROC}}{ a \code{numeric} representing the accuracy obtained +#' by grouping all the synthetic results for the specified values of \code{D} +#' and \code{K}.} +#' \item{\code{Accu.CM}}{ a \code{numeric} representing the value of accuracy +#' of the confusion matrix obtained by grouping all the synthetic results for +#' the specified values of \code{D} and \code{K}.} +#' } +#' } +#' \item{\code{dfAUROC}}{ a \code{data.frame} the summary of the results by +#' super-population. The \code{data.frame} contains +#' those columns: +#' \describe{ +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions).} +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors).} +#' \item{\code{Call}}{ a \code{character} string representing the +#' super-population.} +#' \item{\code{L}}{ a \code{numeric} representing the lower value of the 95% +#' confidence interval for the AUROC obtained for the fixed values of +#' super-population, \code{D} and \code{K}.} +#' \item{\code{AUROC}}{ a \code{numeric} representing the AUROC obtained for +#' the fixed values of super-population, \code{D} and \code{K}.} +#' \item{\code{H}}{ a \code{numeric} representing the higher value of the 95% +#' confidence interval for the AUROC obtained for the fixed values of +#' super-population, \code{D} and \code{K}.} +#' } +#' } +#' \item{\code{D}}{ a \code{numeric} representing the optimal \code{D} value +#' (the number of dimensions) for the specific profile.} +#' \item{\code{K}}{ a \code{numeric} representing the optimal \code{K} value +#' (the number of neighbors) for the specific profile.} +#' \item{\code{listD}}{ a \code{numeric} representing the optimal \code{D} +#' values (the number of dimensions) for the specific profile. More than one +#' \code{D} is possible.} +#' } +#' } +#' \item{\code{KNNSample}}{ a \code{data.frame} containing the inferred +#' ancestry for different values of \code{K} and \code{D}. The +#' \code{data.frame} contains those columns: +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current profile.} +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions) used to infer the ancestry. } +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors) used to infer the ancestry. } +#' \item{\code{SuperPop}}{ a \code{character} string representing the inferred +#' ancestry for the specified \code{D} and \code{K} values.} +#' } +#' } +#' \item{\code{KNNSynthetic}}{ a \code{data.frame} containing the inferred +#' ancestry for each synthetic data for different values of \code{K} and +#' \code{D}. +#' The \code{data.frame} +#' contains those columns: +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current synthetic data.} +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions) used to infer the ancestry. } +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors) used to infer the ancestry. } +#' \item{\code{infer.superPop}}{ a \code{character} string representing the +#' inferred ancestry for the specified \code{D} and \code{K} values.} +#' \item{\code{ref.superPop}}{ a \code{character} string representing the known +#' ancestry from the reference} +#' } +#' } +#' \item{\code{Ancestry}}{ a \code{data.frame} containing the inferred +#' ancestry for the current profile. The \code{data.frame} contains those +#' columns: +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current profile.} +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions) used to infer the ancestry.} +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors) used to infer the ancestry.} +#' \item{\code{SuperPop}}{ a \code{character} string representing the inferred +#' ancestry.} +#' } +#' } +#' } +#' +#' @details +#' +#' The runExomeAncestry() function generates 3 types of files +#' in the OUTPUT directory. +#' \describe{ +#' \item{Ancestry Inference}{ The ancestry inference CSV file +#' (".Ancestry.csv" file)} +#' \item{Inference Informaton}{ The inference information RDS file +#' (".infoCall.rds" file)} +#' \item{Synthetic Information}{ The parameter information RDS files +#' from the synthetic inference ("KNN.synt.*.rds" files in a sub-directory)} +#' } +#' +#' In addition, a sub-directory (named using the profile ID) is +#' also created. +#' +#' @references +#' +#' Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, +#' Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution +#' of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. +#' doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. +#' +#' @examples +#' +#' ## Required library for GDS +#' library(SNPRelate) +#' +#' ## Path to the demo 1KG GDS file is located in this package +#' dataDir <- system.file("extdata", package="RAIDS") +#' +#' +#' ################################################################# +#' ## The 1KG GDS file and the 1KG SNV Annotation GDS file +#' ## need to be located in the same directory +#' ## Note that the 1KG GDS file used for this example is a +#' ## simplified version and CANNOT be used for any real analysis +#' ################################################################# +#' path1KG <- file.path(dataDir, "tests") +#' +#' fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") +#' fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") +#' +#' ################################################################# +#' ## The Sample SNP pileup files (one per sample) need +#' ## to be located in the same directory. +#' ################################################################# +#' demoProfileEx1 <- file.path(dataDir, "example", "snpPileup", "ex1.txt.gz") +#' +#' ################################################################# +#' ## The path where the Profile GDS Files (one per sample) +#' ## will be created need to be specified. +#' ################################################################# +#' pathProfileGDS <- file.path(tempdir(), "out.tmp") +#' +#' #################################################################### +#' ## Fix seed to ensure reproducible results +#' #################################################################### +#' set.seed(3043) +#' +#' gds1KG <- snpgdsOpen(fileReferenceGDS) +#' dataRef <- select1KGPop(gds1KG, nbProfiles=2L) +#' closefn.gds(gds1KG) +#' +#' ## Required library for this example to run correctly +#' if (requireNamespace("GenomeInfoDb", quietly=TRUE) && +#' requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { +#' +#' ## Chromosome length information +#' ## chr23 is chrX, chr24 is chrY and chrM is 25 +#' chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] +#' +#' \donttest{ +#' +#' res <- inferAncestryGeneAware(profileFile=demoProfileEx1, +#' pathProfileGDS=pathProfileGDS, +#' fileReferenceGDS=fileReferenceGDS, +#' fileReferenceAnnotGDS=fileAnnotGDS, +#' chrInfo=chrInfo, +#' syntheticRefDF=dataRef, +#' blockTypeID="GeneS.Ensembl.Hsapiens.v86", +#' genoSource="snp-pileup") +#' +#' unlink(pathProfileGDS, recursive=TRUE, force=TRUE) +#' +#' } +#' } +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' @importFrom utils write.csv +#' @importFrom rlang arg_match +#' @encoding UTF-8 +#' @export +inferAncestryGeneAware <- function(profileFile, pathProfileGDS, + fileReferenceGDS, fileReferenceAnnotGDS, + chrInfo, syntheticRefDF, + genoSource=c("snp-pileup", "generic", "VCF", "bam"), np=1L, + blockTypeID, verbose=FALSE) { + + profileBaseName <- basename(profileFile) + pathGeno <- dirname(profileFile) + + genoSource <- arg_match(genoSource) + + # if(genoSource == "bam"){ + # stop("The bam is not release yet look to get a \'Devel\' version ", + # "or contact us") + # } + + profileName <- gsub("\\.gz$", "", profileBaseName, ignore.case = TRUE) + for(extCur in c( "\\.vcf$", "\\.txt$", "\\.bam", "\\.tsv", "\\.csv")){ + profileName <- gsub(extCur, "", profileName, ignore.case = TRUE) + } + #profileName <- "profile" + studyDF <- data.frame(study.id="NotDef", + study.desc="NotDef", + study.platform="NotDef", + stringsAsFactors=FALSE) + pedStudy <- data.frame(Name.ID=c(profileName), + Case.ID=c(profileName), + Sample.Type=c("RNA"), + Diagnosis="NotDef", + Source=c("NotDef"), + stringsAsFactors=FALSE) + row.names(pedStudy) <- pedStudy$Name.ID + + ## Validate parameters + validateRunExomeOrRNAAncestry(pedStudy=pedStudy, studyDF=studyDF, + pathProfileGDS=pathProfileGDS, pathGeno=pathGeno, pathOut="./", + fileReferenceGDS=fileReferenceGDS, + fileReferenceAnnotGDS=fileReferenceAnnotGDS, chrInfo=chrInfo, + syntheticRefDF=syntheticRefDF, genoSource=genoSource, verbose=verbose) + + + if(genoSource %in% c("snp-pileup", "generic", "VCF", "bam")){ + + r <- wrapperAncestry(pedStudy, studyDF, pathProfileGDS, + profileFile, fileReferenceGDS, fileReferenceAnnotGDS, + chrInfo, syntheticRefDF, genoSource=genoSource, studyType="GeneAware", np=np, + blockTypeID=blockTypeID, verbose=verbose) + }else{ + stop(paste0("The format ", genoSource," is not implemented yet\n")) + } + ## Successful + return(r) +} diff --git a/R/processStudy_internal.R b/R/processStudy_internal.R index c69ef1173..e9a2799b5 100644 --- a/R/processStudy_internal.R +++ b/R/processStudy_internal.R @@ -3,8 +3,8 @@ #' @description This function validates the input parameters for the #' \code{\link{pruningSample}} function. #' -#' @param gdsReference an object of class \link[gdsfmt]{gds.class} (a GDS file), the -#' 1 KG GDS file. +#' @param gdsReference an object of class \link[gdsfmt]{gds.class} +#' (a GDS file), the Population Reference GDS file. #' #' @param method a \code{character} string that represents the method that will #' be used to calculate the linkage disequilibrium in the @@ -68,11 +68,15 @@ #' #' @examples #' +#' ## Required library +#' library(gdsfmt) +#' #' ## Directory where demo GDS files are located #' dataDir <- system.file("extdata", package="RAIDS") #' #' ## The 1KG GDS file (opened) -#' gds1KG <- openfn.gds(file.path(dataDir, "gds1KG.gds"), readonly=TRUE) +#' gds1KG <- openfn.gds(file.path(dataDir, +#' "PopulationReferenceDemo.gds"), readonly=TRUE) #' #' ## The validation should be successful #' RAIDS:::validatePruningSample(gdsReference=gds1KG, method="corr", @@ -160,11 +164,12 @@ validatePruningSample <- function(gdsReference, method, currentProfile, studyID, #' opened Profile GDS file. #' #' @param sampleRM a \code{vector} of \code{character} strings representing -#' the identifiers of the 1KG reference samples that should not be used to -#' create the reference PCA. +#' the identifiers of the population reference samples that should not +#' be used to create the reference PCA. #' #' @param spRef a \code{vector} of \code{character} strings representing the -#' known super population ancestry for the 1KG profiles. The 1KG profile +#' known super population ancestry for the population reference profiles. +#' The population reference profile #' identifiers are used as names for the \code{vector}. #' #' @param studyIDSyn a \code{character} string corresponding to the study @@ -224,7 +229,7 @@ validatePruningSample <- function(gdsReference, method, currentProfile, studyID, #' gdsSample <- openfn.gds(file.path(dataDir, #' "GDS_Sample_with_study_demo.gds"), readonly=TRUE) #' -#' ## The known super population ancestry for the 1KG profiles +#' ## The known super population ancestry for the population reference profiles #' spRef <- c("EUR", "SAS", "EAS", "EUR", "AFR") #' names(spRef) <- c("HG00100", "HG00101", "HG00102", "HG00103", "HG00104") #' @@ -311,7 +316,7 @@ validateComputePoolSyntheticAncestryGr <- function(gdsProfile, sampleRM, #' \code{\link{estimateAllelicFraction}} function. #' #' @param gdsReference an object of class \code{\link[gdsfmt]{gds.class}} -#' (a GDS file), the 1KG GDS file. +#' (a GDS file), the Population Reference GDS file. #' #' @param gdsProfile an object of class \code{\link[gdsfmt]{gds.class}} #' (a GDS file), the Profile GDS file. @@ -333,7 +338,8 @@ validateComputePoolSyntheticAncestryGr <- function(gdsProfile, sampleRM, #' @param minCov a single positive \code{integer} representing the minimum #' required coverage. #' -#' @param minProb a single \code{numeric} between 0 and 1 representing TODO. +#' @param minProb a single positive \code{numeric} between 0 and 1 that +#' represents the probability that the genotype is correct. #' #' @param eProb a single \code{numeric} between 0 and 1 representing the #' probability of sequencing error. @@ -351,7 +357,7 @@ validateComputePoolSyntheticAncestryGr <- function(gdsProfile, sampleRM, #' log score, that the SNVs in a gene are allelic fraction different from 0.5. #' #' @param gdsRefAnnot an object of class \code{\link[gdsfmt]{gds.class}} -#' (a GDS file), the1 1KG SNV Annotation GDS file. +#' (a GDS file), the1 Population Reference SNV Annotation GDS file. #' This parameter is RNA specific. #' #' @param blockID a \code{character} string corresponding to the block @@ -364,37 +370,42 @@ validateComputePoolSyntheticAncestryGr <- function(gdsProfile, sampleRM, #' #' @examples #' +#' +#' ## Required library +#' library(gdsfmt) +#' #' ## Directory where demo GDS files are located #' dataDir <- system.file("extdata", package="RAIDS") #' -#' ## The 1KG GDS file (opened) -#' gds1KG <- openfn.gds(file.path(dataDir, "gds1KG.gds"), readonly=TRUE) +#' ## The 1KG Population Reference GDS Demo file (opened) +#' gds1KG <- openfn.gds(file.path(dataDir, +#' "PopulationReferenceDemo.gds"), readonly=TRUE) #' #' ## The GDS Sample (opened) #' gdsSample <- openfn.gds(file.path(dataDir, #' "GDS_Sample_with_study_demo.gds"), readonly=TRUE) #' -#' ## Get chromosome length information -#' ## Information from BSgenome.Hsapiens.UCSC.hg38 package version 1.4.4 -#' ## Order by chromosomes 1 to 25 -#' ## chr23 is chrX, chr24 is chrY and chrM is 25 -#' chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L, -#' 181538259L, 170805979L, 159345973L, 145138636L, 138394717L, 133797422L, -#' 135086622L, 133275309L, 114364328L, 107043718L, 101991189L, 90338345L, -#' 83257441L, 80373285L, 58617616L, 64444167L, 46709983L, 50818468L, -#' 156040895L, 57227415L, 16569L) +#' ## Required library for this example to run correctly +#' if (requireNamespace("GenomeInfoDb", quietly=TRUE) && +#' requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { #' -#' ## The validation should be successful -#' RAIDS:::validateEstimateAllelicFraction(gdsReference=gds1KG, -#' gdsProfile=gdsSample, -#' currentProfile="Sample01", studyID="Synthetic", chrInfo=chrInfo, -#' studyType="DNA", minCov=10L, minProb=0.03, eProb=0.002, cutOffLOH=10, -#' cutOffHomoScore=11, wAR=2, cutOffAR=10, gdsRefAnnot=gds1KG, -#' blockID="1", verbose=FALSE) +#' ## Chromosome length information +#' ## chr23 is chrX, chr24 is chrY and chrM is 25 +#' chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] #' -#' ## All GDS file must be closed -#' closefn.gds(gdsfile=gds1KG) -#' closefn.gds(gdsfile=gdsSample) +#' ## The validation should be successful +#' RAIDS:::validateEstimateAllelicFraction(gdsReference=gds1KG, +#' gdsProfile=gdsSample, +#' currentProfile="Sample01", studyID="Synthetic", chrInfo=chrInfo, +#' studyType="DNA", minCov=10L, minProb=0.03, eProb=0.002, cutOffLOH=10, +#' cutOffHomoScore=11, wAR=2, cutOffAR=10, gdsRefAnnot=gds1KG, +#' blockID="1", verbose=FALSE) +#' +#' ## All GDS file must be closed +#' closefn.gds(gdsfile=gds1KG) +#' closefn.gds(gdsfile=gdsSample) +#' +#' } #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom S4Vectors isSingleNumber @@ -476,7 +487,7 @@ validateEstimateAllelicFraction <- function(gdsReference, gdsProfile, #' can be defined. #' #' @param fileNameGDS a \code{character} string representing the file name of -#' the 1KG GDS file. The file must exist. +#' the Population Reference GDS file. The file must exist. #' #' @param batch a single positive \code{integer} representing the current #' identifier for the batch. Beware, this field is not stored anymore. @@ -505,7 +516,8 @@ validateEstimateAllelicFraction <- function(gdsReference, gdsProfile, #' ## Path to the demo pedigree file is located in this package #' dataDir <- system.file("extdata", package="RAIDS") #' -#' gds1KG <- file.path(dataDir, "gds1KG.gds") +#' ## Demo 1KG Population Reference GDS file +#' gds1KG <- file.path(dataDir, "PopulationReferenceDemo.gds") #' #' ## The data.frame containing the information about the study #' ## The 3 mandatory columns: "study.id", "study.desc", "study.platform" @@ -547,7 +559,7 @@ validateCreateStudy2GDS1KG <- function(pathGeno, pedStudy, fileNameGDS, batch, ## The fileNameGDS must be a character string and the file must exists if (!(is.character(fileNameGDS) && (file.exists(fileNameGDS)))) { stop("The \'fileNameGDS\' must be a character string representing ", - "the GDS 1KG file. The file must exist.") + "the Population Reference GDS file. The file must exist.") } ## The batch must be a single numeric @@ -559,16 +571,150 @@ validateCreateStudy2GDS1KG <- function(pathGeno, pedStudy, fileNameGDS, batch, validateStudyDataFrameParameter(studyDF=studyDF) ## The listProfiles must be a vector of character string - if (!(is.character(listProfiles) || is.null(listProfiles))) { + if (!(is.character(listProfiles) || is.null(listProfiles))) { # stop("The \'listProfiles\' must be a vector ", "of character strings (1 entry or more) or NULL.") } + ## The pathProfileGDS must be a character string if (!is.character(pathProfileGDS)) { stop("The \'pathProfileGDS\' must be a character string representing", " the path where the Profile GDS files will be generated.") } + if(is.character(listProfiles)){ + for(profileCur in listProfiles){ + if(file.exists(file.path(pathProfileGDS, paste0(profileCur, ".gds")))){ + stop(paste0("The gds file for ", profileCur, " already exist.")) + } + } + } + ## The genoSource must be a character string + if(!(is.character(genoSource))) { + stop("The \'genoSource\' parameter must be a character string.") + } + + ## The verbose parameter must be a logical + validateLogical(logical=verbose, "verbose") + + return(0L) +} + +#' @title Validate input parameters for createProfile() function +#' +#' @description This function validates the input parameters for the +#' \code{\link{createStudy2GDS1KG}} function. +#' +#' @param pedStudy a \code{data.frame} with those mandatory columns: "Name.ID", +#' "Case.ID", "Sample.Type", "Diagnosis", "Source". All columns must be in +#' \code{character} strings (no factor). The \code{data.frame} +#' must contain the information for all the samples passed in the +#' \code{listSamples} parameter. Only \code{filePedRDS} or \code{pedStudy} +#' can be defined. +#' +#' @param fileNameGDS a \code{character} string representing the file name of +#' the Population Reference GDS file. The file must exist. +#' +#' @param batch a single positive \code{integer} representing the current +#' identifier for the batch. Beware, this field is not stored anymore. +#' +#' @param studyDF a \code{data.frame} containing the information about the +#' study associated to the analysed sample(s). The \code{data.frame} must have +#' those 3 columns: "study.id", "study.desc", "study.platform". All columns +#' must be in \code{character} strings (no factor). +#' +#' @param listProfiles a \code{vector} of \code{character} string corresponding +#' to the profile identifiers that will have a GDS Sample file created. The +#' profile identifiers must be present in the "Name.ID" column of the RDS file +#' passed to the \code{filePedRDS} parameter. +#' If \code{NULL}, all profiles in the \code{filePedRDS} are selected. +#' +#' @param pathProfileGDS a \code{character} string representing the path to +#' the directory where the Profile GDS files will be created. +#' +#' @param verbose a \code{logical} indicating if message information should be +#' printed. +#' +#' @return The function returns \code{0L} when successful. +#' +#' @examples +#' +#' ## Path to the demo pedigree file is located in this package +#' dataDir <- system.file("extdata", package="RAIDS") +#' +#' ## Demo 1KG Population Reference GDS file +#' gds1KG <- file.path(dataDir, "PopulationReferenceDemo.gds") +#' +#' ## The data.frame containing the information about the study +#' ## The 3 mandatory columns: "study.id", "study.desc", "study.platform" +#' ## The entries should be strings, not factors (stringsAsFactors=FALSE) +#' studyInfo <- data.frame(study.id="Pancreatic.WES", +#' study.desc="Pancreatic study", +#' study.platform="WES", +#' stringsAsFactors=FALSE) +#' +#' ## PED Study +#' ped <- data.frame(Name.ID=c("Sample_01", "Sample_02"), +#' Case.ID=c("TCGA-H01", "TCGA-H02"), +#' Sample.Type=c("DNA", "DNA"), +#' Diagnosis=c("Cancer", "Cancer"), Source=c("TCGA", "TCGA")) +#' +#' ## The validation should be successful +#' RAIDS:::validateCreateStudy2GDS1KG(pathGeno=dataDir, pedStudy=ped, +#' fileNameGDS=gds1KG, batch=1, studyDF=studyInfo, +#' listProfiles=c("Sample_01", "Sample_02"), +#' pathProfileGDS=dataDir, +#' genoSource="snp-pileup", verbose=TRUE) +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' @importFrom S4Vectors isSingleNumber +#' @encoding UTF-8 +#' @keywords internal +validatecreateProfile <- function(pedStudy, + fileNameGDS, batch, + studyDF, listProfiles, pathProfileGDS, + genoSource, verbose) { + # profileFile, profileName, + # filePedRDS=NULL, pedStudy=NULL, fileNameGDS, + # batch=1, studyDF, listProfiles=NULL, + # pathProfileGDS=NULL, + # genoSource=c("snp-pileup", "generic", "VCF", "bam"), + # paramProfile=list(ScanBamParam=NULL, + # PileupParam=NULL, + # yieldSize=10000000), + # verbose=FALSE + ## The pathGeno must be a existing directory + + + ## The PED study must have the mandatory columns + validatePEDStudyParameter(pedStudy=pedStudy) + + ## The fileNameGDS must be a character string and the file must exists + if (!(is.character(fileNameGDS) && (file.exists(fileNameGDS)))) { + stop("The \'fileNameGDS\' must be a character string representing ", + "the Population Reference GDS file. The file must exist.") + } + + ## The batch must be a single numeric + if(!(isSingleNumber(batch))) { + stop("The \'batch\' must be a single integer.") + } + + ## The Study DF must have the mandatory columns + validateStudyDataFrameParameter(studyDF=studyDF) + + ## The listProfiles must be a vector of character string + if (!(is.character(listProfiles) || is.null(listProfiles))) { # + stop("The \'listProfiles\' must be a vector ", + "of character strings (1 entry or more) or NULL.") + } + + + ## The pathProfileGDS must be a character string + if (!is.character(pathProfileGDS)) { + stop("The \'pathProfileGDS\' must be a character string representing", + " the path where the Profile GDS files will be generated.") + } ## The genoSource must be a character string if(!(is.character(genoSource))) { @@ -589,7 +735,7 @@ validateCreateStudy2GDS1KG <- function(pathGeno, pedStudy, fileNameGDS, batch, #' \code{\link{computeAncestryFromSyntheticFile}} function. #' #' @param gdsReference an object of class \link[gdsfmt]{gds.class} (a GDS -#' file), the opened 1KG GDS file. +#' file), the opened Population Reference GDS file. #' #' @param gdsProfile an object of class \code{\link[gdsfmt]{gds.class}} #' (a GDS file), the opened Profile GDS file. @@ -603,7 +749,8 @@ validateCreateStudy2GDS1KG <- function(pathGeno, pedStudy, fileNameGDS, batch, #' identifier of the current profile on which ancestry will be inferred. #' #' @param spRef a \code{vector} of \code{character} strings representing the -#' known super population ancestry for the 1KG profiles. The 1KG profile +#' known super population ancestry for the 1KG profiles. The Population +#' Reference profile #' identifiers are used as names for the \code{vector}. #' #' @param studyIDSyn a \code{character} string corresponding to the study @@ -657,11 +804,15 @@ validateCreateStudy2GDS1KG <- function(pathGeno, pedStudy, fileNameGDS, batch, #' #' @examples #' +#' ## Required library +#' library(gdsfmt) +#' #' ## Directory where demo GDS files are located #' dataDir <- system.file("extdata", package="RAIDS") #' -#' ## The 1KG GDS file (opened) -#' gds1KG <- openfn.gds(file.path(dataDir, "gds1KG.gds"), readonly=TRUE) +#' ## The 1KG Population Reference GDS demo file (opened) +#' gds1KG <- openfn.gds(file.path(dataDir, +#' "PopulationReferenceDemo.gds"), readonly=TRUE) #' #' ## The Profile GDS (opened) #' gdsSample <- openfn.gds(file.path(dataDir, @@ -847,131 +998,14 @@ validateComputePCARefSample <- function(gdsProfile, currentProfile, studyIDRef, } -#' @title Validate input parameters for appendStudy2GDS1KG() function -#' -#' @description This function validates the input parameters for the -#' \code{\link{appendStudy2GDS1KG}} function. -#' -#' @param pathGeno a \code{character} string representing the path to the -#' directory containing the output of SNP-pileup, a VCF Sample file, for -#' each sample. -#' -#' @param filePedRDS a \code{character} string representing the path to the -#' RDS file that contains the information about the sample to analyse. -#' -#' @param fileNameGDS a \code{character} string representing the file name of -#' the 1KG GDS file. The file must exist. -#' -#' @param batch a single positive \code{integer} representing the current -#' identifier for the batch. Beware, this field is not stored anymore. -#' -#' @param studyDF a \code{data.frame} containing the information about the -#' study associated to the analysed sample(s). The \code{data.frame} must have -#' those 3 columns: "studyID", "study.desc", "study.platform". All columns -#' must be in \code{character} strings. -#' -#' @param listSamples a \code{vector} of \code{character} string corresponding -#' to the sample identifiers that will have a GDS Sample file created. The -#' sample identifiers must be present in the "Name.ID" column of the RDS file -#' passed to the \code{filePedRDS} parameter. -#' If \code{NULL}, all samples in the \code{filePedRDS} are selected. -#' -#' @param pathProfileGDS a \code{character} string representing the path to -#' the directory where the GDS Sample files will be created. -#' -#' @param genoSource a \code{character} string with two possible values: -#' 'snp-pileup' or 'generic'. It specifies if the genotype files -#' are generate by snp-pileup (Facets) or generic format csv. -#' -#' @param verbose a \code{logical} indicating if message information should be -#' printed. -#' -#' @return The function returns \code{0L} when successful. -#' -#' @examples -#' -#' ## Path to the demo pedigree file is located in this package -#' dataDir <- system.file("extdata", package="RAIDS") -#' -#' gds1KG <- file.path(dataDir, "1KG_Demo.gds") -#' ped <- file.path(dataDir, "unrelatedPatientsInfo_Demo.rds") -#' -#' ## The data.frame containing the information about the study -#' ## The 3 mandatory columns: "studyID", "study.desc", "study.platform" -#' ## The entries should be strings, not factors (stringsAsFactors=FALSE) -#' studyInfo <- data.frame(study.id="Pancreatic.WES", -#' study.desc="Pancreatic study", -#' study.platform="WES", -#' stringsAsFactors=FALSE) -#' -#' ## The validatiion should be successful -#' RAIDS:::validateAppendStudy2GDS1KG(pathGeno=dataDir, -#' filePedRDS=ped, fileNameGDS=gds1KG, -#' batch=1L, studyDF=studyInfo, listSamples=c("HC01", "HC02"), -#' pathProfileGDS=dataDir, genoSource="snp-pileup", verbose=TRUE) -#' -#' -#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @importFrom S4Vectors isSingleNumber -#' @encoding UTF-8 -#' @keywords internal -validateAppendStudy2GDS1KG <- function(pathGeno, filePedRDS, fileNameGDS, - batch, studyDF, listSamples, pathProfileGDS, - genoSource, verbose) { - - ## The pathGeno must be a character string and the path must exists - if (!(is.character(pathGeno) && (dir.exists(pathGeno)))) { - stop("The \'pathGeno\' must be a character string representing ", - "a path. The path must exist.") - } - - ## The filePedRDS must be a character string and the file must exists - if (!(is.character(filePedRDS) && (file.exists(filePedRDS)))) { - stop("The \'filePedRDS\' must be a character string representing ", - "the RDS Sample information file. The file must exist.") - } - - ## The fileNameGDS must be a character string and the file must exists - if (!(is.character(fileNameGDS) && (file.exists(fileNameGDS)))) { - stop("The \'fileNameGDS\' must be a character string representing ", - "the GDS 1KG file. The file must exist.") - } - - ## The batch must be a single numeric - if (!(isSingleNumber(batch))) { - stop("The \'batch\' must be a single integer.") - } - - if (!(is.data.frame(studyDF) && all(c("study.id", "study.desc", - "study.platform") %in% colnames(studyDF)))) { - stop("The \'studyDF\' must be a data.frame and contain those 3 ", - "columns: \'study.id\', \'study.desc\' and \'study.platform\'.") - } - - ## The listSamples must be a vector of character string - if (!(is.character(listSamples) || is.null(listSamples))) { - stop("The \'listSamples\' must be a vector ", - "of character strings (1 entry or more) or NULL.") - } - - ## The genoSource must be a character string - if (!is.character(genoSource)) { - stop("The \'genoSource\' parameter must be a character string.") - } - - ## The verbose parameter must be a logical - validateLogical(logical=verbose, name="verbose") - - return(0L) -} - #' @title Validate input parameters for add1KG2SampleGDS() function #' #' @description This function validates the input parameters for the #' \code{\link{add1KG2SampleGDS}} function. #' #' @param gdsReference an object of class -#' \link[gdsfmt]{gds.class} (a GDS file), the opened 1KG GDS file. +#' \link[gdsfmt]{gds.class} (a GDS file), the opened Population Reference +#' GDS file. #' #' @param gdsProfileFile a \code{character} string representing the path and #' file name of the Profile GDS file. The Profile GDS file must exist. @@ -986,11 +1020,15 @@ validateAppendStudy2GDS1KG <- function(pathGeno, filePedRDS, fileNameGDS, #' #' @examples #' +#' ## Required library +#' library(gdsfmt) +#' #' ## Path to the demo pedigree file is located in this package #' dataDir <- system.file("extdata", package="RAIDS") #' -#' ## The 1KG GDS file (opened) -#' gds1KG <- openfn.gds(file.path(dataDir, "gds1KG.gds"), readonly=TRUE) +#' ## The 1KG Population Reference GDS demo file (opened) +#' gds1KG <- openfn.gds(file.path(dataDir, +#' "PopulationReferenceDemo.gds"), readonly=TRUE) #' #' ## The validatiion should be successful #' RAIDS:::validateAdd1KG2SampleGDS(gdsReference=gds1KG, @@ -1003,8 +1041,8 @@ validateAppendStudy2GDS1KG <- function(pathGeno, filePedRDS, fileNameGDS, #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @encoding UTF-8 #' @keywords internal -validateAdd1KG2SampleGDS <- function(gdsReference, gdsProfileFile, currentProfile, - studyID) { +validateAdd1KG2SampleGDS <- function(gdsReference, gdsProfileFile, + currentProfile, studyID) { ## The gds must be an object of class "gds.class" validateGDSClass(gds=gdsReference, name="gdsReference") @@ -1064,21 +1102,22 @@ validateAdd1KG2SampleGDS <- function(gdsReference, gdsProfileFile, currentProfil #' the directory where the output files are created. #' #' @param fileReferenceGDS a \code{character} string representing the file -#' name of the 1KG GDS file. The file must exist. +#' name of the Population Reference GDS file. The file must exist. #' #' @param fileReferenceAnnotGDS a \code{character} string representing the -#' file name of the 1KG GDS annotation file. The file must exist. +#' file name of the Population Reference GDS annotation file. +#' The file must exist. #' #' @param chrInfo a \code{vector} of positive \code{integer} values #' representing the length of the chromosomes. See 'details' section. #' #' @param syntheticRefDF a \code{data.frame} containing those columns: -#' \itemize{ -#' \item{sample.id} { a \code{character} string representing the sample +#' \describe{ +#' \item{sample.id}{ a \code{character} string representing the sample #' identifier. } -#' \item{pop.group} { a \code{character} string representing the +#' \item{pop.group}{ a \code{character} string representing the #' subcontinental population assigned to the sample. } -#' \item{superPop} { a \code{character} string representing the +#' \item{superPop}{ a \code{character} string representing the #' super-population assigned to the sample. } #' } #' @@ -1101,9 +1140,10 @@ validateAdd1KG2SampleGDS <- function(gdsReference, gdsProfileFile, currentProfil #' study.platform = "PLATFORM", #' stringsAsFactors = FALSE) #' -#' gds1KG <- file.path(dataDir, "gds1KG.gds") +#' ## Population Reference GDS demo file +#' gdsRef <- file.path(dataDir, "PopulationReferenceDemo.gds") #' -#' gdsAnnot1KG <- file.path(dataDir, "gdsAnnot1KG.gds") +#' gdsAnnotRef <- file.path(dataDir, "PopulationReferenceSNVAnnotationDemo.gds") #' #' ## Pedigree Study data frame #' ped <- data.frame(Name.ID=c("Sample_01", "Sample_02"), @@ -1111,30 +1151,32 @@ validateAdd1KG2SampleGDS <- function(gdsReference, gdsProfileFile, currentProfil #' Sample.Type=c("DNA", "DNA"), #' Diagnosis=c("Cancer", "Cancer"), Source=c("TCGA", "TCGA")) #' -#' ## Chromosome length information -#' ## chr23 is chrX, chr24 is chrY and chrM is 25 -#' chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L, -#' 181538259L, 170805979L, 159345973L, 145138636L, 138394717L, 133797422L, -#' 135086622L, 133275309L, 114364328L, 107043718L, 101991189L, 90338345L, -#' 83257441L, 80373285L, 58617616L, 64444167L, 46709983L, 50818468L, -#' 156040895L, 57227415L, 16569L) +#' ## Required library for this example to run correctly +#' if (requireNamespace("GenomeInfoDb", quietly=TRUE) && +#' requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { #' -#' ## Profiles used for synthetic data set -#' syntheticRefDF <- data.frame(sample.id=c("HG00150", "HG00138", "HG00330", -#' "HG00275"), pop.group=c("GBR", "GBR","FIN", "FIN"), -#' superPop=c("EUR", "EUR", "EUR", "EUR"), stringsAsFactors=FALSE) +#' ## Chromosome length information +#' ## chr23 is chrX, chr24 is chrY and chrM is 25 +#' chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] #' -#' ## Returns OL when all parameters are valid -#' RAIDS:::validateRunExomeAncestry(pedStudy=ped, studyDF=study, -#' pathProfileGDS=dataDir, pathGeno=dataDir, pathOut=pathOut, -#' fileReferenceGDS=gds1KG, fileReferenceAnnotGDS=gdsAnnot1KG, -#' chrInfo=chrInfo, syntheticRefDF=syntheticRefDF, genoSource="snp-pileup", -#' verbose=FALSE) +#' ## Profiles used for synthetic data set +#' syntheticRefDF <- data.frame(sample.id=c("HG00150", "HG00138", "HG00330", +#' "HG00275"), pop.group=c("GBR", "GBR","FIN", "FIN"), +#' superPop=c("EUR", "EUR", "EUR", "EUR"), stringsAsFactors=FALSE) +#' +#' ## Returns OL when all parameters are valid +#' RAIDS:::validateRunExomeOrRNAAncestry(pedStudy=ped, studyDF=study, +#' pathProfileGDS=dataDir, pathGeno=dataDir, pathOut=pathOut, +#' fileReferenceGDS=gdsRef, fileReferenceAnnotGDS=gdsAnnotRef, +#' chrInfo=chrInfo, syntheticRefDF=syntheticRefDF, +#' genoSource="snp-pileup", verbose=FALSE) +#' +#' } #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @encoding UTF-8 #' @keywords internal -validateRunExomeAncestry <- function(pedStudy, studyDF, pathProfileGDS, +validateRunExomeOrRNAAncestry <- function(pedStudy, studyDF, pathProfileGDS, pathGeno, pathOut, fileReferenceGDS, fileReferenceAnnotGDS, chrInfo, syntheticRefDF, genoSource, verbose) { @@ -1341,7 +1383,7 @@ validateStudyDataFrameParameter <- function(studyDF) { #' #' ## Path to the demo 1KG GDS file is located in this package #' dataDir <- system.file("extdata", package="RAIDS") -#' fileReferenceGDS <- file.path(dataDir, "1KG_Demo.gds") +#' fileReferenceGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") #' gds1KG <- snpgdsOpen(fileReferenceGDS) #' #' ## Path to demo Profile GDS file @@ -1375,187 +1417,31 @@ validateAddStudy1Kg <- function(gdsReference, fileProfileGDS, verbose) { } -#' @title Validate the input parameters for computePoolSyntheticAncestry() -#' function -#' -#' @description The function validates the input parameters for the -#' \code{\link{computePoolSyntheticAncestry}} function. -#' When a parameter is not as expected, an error message is generated. -#' -#' @param gdsReference an object of class \link[gdsfmt]{gds.class} -#' (a GDS file), the opened 1KG GDS file. -#' -#' @param profileGDS an object of class \link[gdsfmt]{gds.class} (a GDS file), -#' an opened Profile GDS file. + +#' @title Validate that the Profile GDS file exists for the specified profile #' -#' @param profileID a single \code{character} string representing the -#' profile identifier. +#' @description The function validates that the Profile GDS file associated +#' to a profile identifier exists in the specified directory. #' -#' @param dataRef a \code{data.frame} TODO +#' @param pathProfile a \code{character} string representing the directory +#' where the Profile GDS files will be created. The directory must exist. #' -#' @param spRef TODO +#' @param profile a \code{character} string +#' corresponding to the profile identifier. A Profile GDS file +#' corresponding to the profile identifier must exist and be located in the +#' \code{pathProfile} directory. #' -#' @param studyIDSyn a \code{character} string corresponding to the study -#' identifier. The study identifier must be present in the GDS Sample file. +#' @return a \code{character} string representing the path to the existing +#' Profile GDS file. #' -#' @param np a single positive \code{integer} representing the number of -#' threads. Default: \code{1L}. +#' @examples #' -#' @param listCatPop a \code{vector} of \code{character} string -#' representing the list of possible ancestry assignations. Default: -#' \code{("EAS", "EUR", "AFR", "AMR", "SAS")}. +#' ## Path to the demo 1KG GDS file is located in this package +#' dataDir <- system.file("extdata/tests", package="RAIDS") #' -#' @param fieldPopIn1KG a \code{character} string representing the TODO -#' -#' @param fieldPopInfAnc a \code{character} string representing the name of -#' the column that will contain the inferred ancestry for the specified -#' dataset. Default: \code{"SuperPop"}. -#' -#' @param kList a \code{vector} of \code{integer} representing the list of -#' values tested for the _K_ parameter. The _K_ parameter represents the -#' number of neighbors used in the K-nearest neighbors analysis. If -#' \code{NULL}, the value \code{seq(2,15,1)} is assigned. -#' Default: \code{seq(2,15,1)}. -#' -#' @param pcaList a \code{vector} of \code{integer} representing the list of -#' values tested for the _D_ parameter. The _D_ parameter represents the -#' number of dimensions used in the PCA analysis. If \code{NULL}, -#' the value \code{seq(2,15,1)} is assigned. -#' Default: \code{seq(2,15,1)}. -#' -#' @param algorithm a \code{character} string representing the algorithm used -#' to calculate the PCA. The 2 choices are "exact" (traditional exact -#' calculation) and "randomized" (fast PCA with randomized algorithm -#' introduced in Galinsky et al. 2016). Default: \code{"exact"}. -#' -#' @param eigen.cnt a single \code{integer} indicating the number of -#' eigenvectors that will be in the output of the \link[SNPRelate]{snpgdsPCA} -#' function; if 'eigen.cnt' <= 0, then all eigenvectors are returned. -#' -#' @param missing.rate a \code{numeric} value representing the threshold -#' missing rate at with the SNVs are discarded; the SNVs are retained in the -#' \link[SNPRelate]{snpgdsPCA} -#' with "<= missing.rate" only; if \code{NaN}, no missing threshold. -#' -#' -#' @return The integer \code{0L} when successful. -#' -#' @examples -#' -#' ## Path to the demo 1KG GDS file is located in this package -#' dataDir <- system.file("extdata/tests", package="RAIDS") -#' fileGDS <- file.path(dataDir, "ex1_good_small_1KG_GDS.gds") -#' fileProfileGDS <- file.path(dataDir, "ex1_demo.gds") -#' -#' ## Open GDS files -#' gds1KG <- snpgdsOpen(fileGDS) -#' gdsProfile <- openfn.gds(fileProfileGDS) -#' -#' dataRef <- data.frame(test=c(1,2), stringAsFactro=FALSE) -#' -#' ## The function returns 0L when all parameters are valid -#' RAIDS:::validateComputePoolSyntheticAncestry(gdsReference=gds1KG, -#' profileGDS=gdsProfile, profileID="SampleID", -#' dataRef=dataRef, spRef=NULL, studyIDSyn="MyStudy", -#' np=1L, listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), -#' fieldPopIn1KG="SuperPop", fieldPopInfAnc="SuperPop", -#' kList=seq(2,15,1), pcaList=seq(2,15,1), -#' algorithm="exact", eigenCount=32L, missingRate=0.025) -#' -#' ## Close GDS files (it is important to always close the GDS files) -#' closefn.gds(gds1KG) -#' closefn.gds(gdsProfile) -#' -#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @importFrom S4Vectors isSingleNumber -#' @encoding UTF-8 -#' @keywords internal -validateComputePoolSyntheticAncestry <- function(gdsReference, profileGDS, - profileID, dataRef, spRef, studyIDSyn, np, - listCatPop, fieldPopIn1KG, fieldPopInfAnc, - kList, pcaList, algorithm, eigenCount, missingRate) { - - ## The gds and profileGDS must be objects of class "gds.class" - validateGDSClass(gds=gdsReference, "gdsReference") - validateGDSClass(gds=profileGDS, "profileGDS") - - ## The profileID must be one character string - if (!(is.character(profileID) && length(profileID) == 1)) { - stop("The \'profileID\' parameter must be a character string.") - } - - ## The dataRef must be an data.frame object - if (!is.data.frame(dataRef)) { - stop("The \'dataRef\' must be a data.frame object.") - } - - ## The studyIDSyn must be a character string - if (!(is.character(studyIDSyn) && length(studyIDSyn) == 1)) { - stop("The \'studyIDSyn\' parameter must be a character string.") - } - - ## The listCatPop must be a character string vector - if (!(is.character(listCatPop) && is.vector(listCatPop))) { - stop("The \'listCatPop\' parameter must be a vector of ", - "character strings.") - } - - ## The population name in 1KG must be a character string - if (!(is.character(fieldPopIn1KG) && length(fieldPopIn1KG) == 1)) { - stop("The \'fieldPopIn1KG\' parameter must be a character string.") - } - - ## The population inferred must be a character string - if (!(is.character(fieldPopInfAnc) && length(fieldPopInfAnc) == 1)) { - stop("The \'fieldPopInfAnc\' parameter must be a character string.") - } - - ## The parameters must be vectors of positive integers - validatePositiveIntegerVector(kList, "kList") - validatePositiveIntegerVector(pcaList, "pcaList") - - ## The algorithm must be a character string - if (!(is.character(algorithm) && length(algorithm) == 1)) { - stop("The \'algorithm\' parameter must be a character string.") - } - - ## The eigenCount must be a single integer - if (!(isSingleNumber(eigenCount))) { - stop("The \'eigenCount\' parameter must be a single integer.") - } - - ## The missingRate must be a numeric of NaN - if (!(isSingleNumber(missingRate) || is.nan(missingRate))) { - stop("The \'missingRate\' parameter must be a single numeric or NaN.") - } - - return(0L) -} - -#' @title Validate that the Profile GDS file exists for the specified profile -#' -#' @description The function validates that the Profile GDS file associated -#' to a profile identifier exists in the specified directory. -#' -#' @param pathProfile a \code{character} string representing the directory -#' where the Profile GDS files will be created. The directory must exist. -#' -#' @param currentProfile a \code{character} string -#' corresponding to the profile identifier. A Profile GDS file -#' corresponding to the profile identifier must exist and be located in the -#' \code{pathProfile} directory. -#' -#' @return a \code{character} string representing the path to the existing -#' Profile GDS file. -#' -#' @examples -#' -#' ## Path to the demo 1KG GDS file is located in this package -#' dataDir <- system.file("extdata/tests", package="RAIDS") -#' -#' ## The function returns the path to the existing Profile GDS file -#' RAIDS:::validateProfileGDSExist(pathProfile=dataDir, -#' profile="ex1_demo") +#' ## The function returns the path to the existing Profile GDS file +#' RAIDS:::validateProfileGDSExist(pathProfile=dataDir, +#' profile="ex1_demo") #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @encoding UTF-8 @@ -1602,6 +1488,9 @@ validateProfileGDSExist <- function(pathProfile, profile) { #' #' @examples #' +#' ## Loading demo PCA on subset of 1KG reference dataset +#' data(demoPCA1KG) +#' #' ## Path to the demo GDS file is located in this package #' dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") #' fileProfileGDS <- file.path(dataDir, "ex1.gds") @@ -1609,11 +1498,9 @@ validateProfileGDSExist <- function(pathProfile, profile) { #' ## Open GDS files #' gdsProfile <- openfn.gds(fileProfileGDS) #' -#' pca <- readRDS(file.path(dataDir, "pca1KG.RDS")) -#' #' ## The function returns 0L when all parameters are valid #' RAIDS:::validateComputePCAMultiSynthetic(gdsProfile=gdsProfile, -#' listPCA=pca, sampleRef=c("HG00246", "HG00325"), +#' listPCA=demoPCA1KG, sampleRef=c("HG00246", "HG00325"), #' studyIDSyn="MyStudy", verbose=FALSE) #' #' ## Close GDS file (it is important to always close the GDS files) @@ -1693,24 +1580,25 @@ validateComputePCAMultiSynthetic <- function(gdsProfile, listPCA, sampleRef, #' #' @examples #' +#' ## Load the demo PCA on the synthetic profiles projected on the +#' ## demo 1KG reference PCA +#' data(demoPCASyntheticProfiles) +#' +#' ## Load the known ancestry for the demo 1KG reference profiles +#' data(demoKnownSuperPop1KG) #' #' ## Path to the demo GDS file is located in this package #' dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") #' fileProfileGDS <- file.path(dataDir, "ex1.gds") #' -#' pcaSynthetic <- readRDS(file.path(dataDir, "pcaSynthetic.RDS")) -#' -#' ## The known ancestry for the 1KG reference profiles -#' refKnownSuperPop <- readRDS(file.path(dataDir, "knownSuperPop1KG.RDS")) -#' #' ## Open GDS files #' gdsProfile <- openfn.gds(fileProfileGDS) #' #' ## The function returns 0L when all parameters are valid #' RAIDS:::validateComputeKNNRefSynthetic(gdsProfile=gdsProfile, -#' listEigenvector=pcaSynthetic, +#' listEigenvector=demoPCASyntheticProfiles, #' listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), -#' studyIDSyn="MyStudy", spRef=refKnownSuperPop, +#' studyIDSyn="MyStudy", spRef=demoKnownSuperPop1KG, #' fieldPopInfAnc="Superpop", kList=c(10, 11, 12), #' pcaList=c(13, 14, 15)) #' @@ -1805,20 +1693,20 @@ validateComputeKNNRefSynthetic <- function(gdsProfile, listEigenvector, #' #' @examples #' +#' ## Load the demo PCA on the synthetic profiles projected on the +#' ## demo 1KG reference PCA +#' data(demoPCASyntheticProfiles) #' -#' ## Path to the demo GDS file is located in this package -#' dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") +#' ## Load the known ancestry for the demo 1KG reference profiles +#' data(demoKnownSuperPop1KG) #' -#' pcaSynthetic <- readRDS(file.path(dataDir, "pcaSynthetic.RDS")) +#' pcaSynthetic <- demoPCASyntheticProfiles #' pcaSynthetic$sample.id <- pcaSynthetic$sample.id[1] #' -#' ## The known ancestry for the 1KG reference profiles -#' refKnownSuperPop <- readRDS(file.path(dataDir, "knownSuperPop1KG.RDS")) -#' #' ## The function returns 0L when all parameters are valid #' RAIDS:::validateComputeKNNRefSample(listEigenvector=pcaSynthetic, #' listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), -#' spRef=refKnownSuperPop, fieldPopInfAnc="Superpop", +#' spRef=demoKnownSuperPop1KG, fieldPopInfAnc="Superpop", #' kList=c(10, 11, 12), pcaList=c(13, 14, 15)) #' #' @@ -1912,7 +1800,7 @@ validateComputeKNNRefSample <- function(listEigenvector, listCatPop, spRef, #' printed. #' #' @return a \code{list} containing 2 entries: -#' \itemize{ +#' \describe{ #' \item{pruned}{ a \code{vector} of SNV identifiers specifying selected SNVs #' for the PCA analysis.} #' \item{pca.unrel}{ a \code{snpgdsPCAClass} object containing the eigenvalues @@ -1928,8 +1816,11 @@ validateComputeKNNRefSample <- function(listEigenvector, listCatPop, spRef, #' #' @examples #' -#' ## Path to the demo Profile GDS file is located in this package -#' dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") +#' ## Required library +#' library(SNPRelate) +#' +#' ## Load the known ancestry for the demo 1KG reference profiles +#' data(demoKnownSuperPop1KG) #' #' # The name of the synthetic study #' studyID <- "MYDATA.Synthetic" @@ -1942,8 +1833,8 @@ validateComputeKNNRefSample <- function(listEigenvector, listCatPop, spRef, #' "NA12751", "NA19107", "NA18548", "NA19075", "NA19475", "NA19712", #' "NA19731", "NA20528", "NA20908") #' -#' ## The known ancestry for the 1KG reference profiles -#' refKnownSuperPop <- readRDS(file.path(dataDir, "knownSuperPop1KG.RDS")) +#' ## Path to the demo Profile GDS file is located in this package +#' dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") #' #' ## Open the Profile GDS file #' gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) @@ -1951,7 +1842,7 @@ validateComputeKNNRefSample <- function(listEigenvector, listCatPop, spRef, #' ## Compute PCA for the 1KG reference profiles excluding #' ## the profiles used to generate the synthetic profiles #' results <- RAIDS:::computePCARefRMMulti(gdsProfile=gdsProfile, -#' refProfileIDs=names(refKnownSuperPop), listRM=samplesRM, np=1L, +#' refProfileIDs=names(demoKnownSuperPop1KG), listRM=samplesRM, np=1L, #' algorithm="exact", eigenCount=32L, missingRate=0.025, verbose=FALSE) #' #' ## The PCA on the pruned SNVs data set for selected profiles @@ -1989,388 +1880,1825 @@ computePCARefRMMulti <- function(gdsProfile, refProfileIDs, listRM, np=1L, } -#' @title Deprecated -#' -#' @description Deprecated -#' -#' @param listEigenvector TODO see return of computePCAsynthetic -#' -#' @param sample.ref TODO -#' -#' @param study.annot a \code{data.frame} with one entry from study.annot in -#' the gds +#' @title Compile all the inferred ancestry results done on the +#' synthetic profiles for different D and K values in the objective of +#' selecting the optimal D and K values for a specific profile #' -#' @param spRef TODO +#' @description The function calculates the accuracy of the inferred ancestry +#' called done on the synthetic profiles for different D and K values. The +#' accuracy is also calculated for each super-population used to generate +#' the synthetic profiles. The known ancestry from the reference profiles +#' used to generate the synthetic profiles is required to calculate the +#' accuracy. #' -#' @param kList a \code{vector} of \code{integer} representing the list of -#' values tested for the K parameter. The K parameter represents the -#' number of neighbors used in the K-nearest neighbors analysis. If -#' \code{NULL}, the value \code{seq_len(15)} is assigned. -#' Default: \code{seq_len(15)}. +#' @param matKNN a \code{data.frame} containing the inferred ancestry for the +#' synthetic profiles for different _K_ and _D_ values. The \code{data.frame} +#' must contained those columns: "sample.id", "D", "K" and the fourth column +#' name must correspond to the \code{predCall} argument. #' -#' @param pcaList TODO array of the pca dimension possible values +#' @param pedCall a \code{data.frame} containing the information about +#' the super-population information from the 1KG GDS file +#' for profiles used to generate the synthetic profiles. The \code{data.frame} +#' must contained a column named as the \code{refCall} argument. #' -#' @return A \code{list} TODO with the sample.id and eigenvectors -#' and a table with KNN callfor different K and pca dimension. +#' @param refCall a \code{character} string representing the name of the +#' column that contains the known ancestry for the reference profiles in +#' the Reference GDS file. #' -#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @importFrom gdsfmt add.gdsn index.gdsn -#' @importFrom SNPRelate snpgdsPCA snpgdsPCASampLoading snpgdsPCASampLoading -#' @importFrom class knn -#' @encoding UTF-8 -#' @keywords internal -computeKNNSuperPoprSynthetic <- function(listEigenvector, sample.ref, - study.annot, spRef, kList=seq_len(15), - pcaList=seq(2, 15, 1)) { - - ## The number of rows in study.annot must be one. - if(nrow(study.annot) != 1) { - stop("Number of samples in study.annot not equal to 1\n") - } - - if(is.null(kList)){ - kList <- seq_len(15) #c(seq_len(14), seq(15,100, by=5)) - } - if(is.null(pcaList)){ - pcaList <- seq(2, 15, 1) - } - - resMat <- data.frame(sample.id=rep(listEigenvector$sample.id, - length(pcaList) * length(kList)), - D=rep(0,length(pcaList) * length(kList)), - K=rep(0,length(pcaList) * length(kList)), - SuperPop=character(length(pcaList) * length(kList)), - stringsAsFactors=FALSE) - - listSuperPop <- c("EAS", "EUR", "AFR", "AMR", "SAS") - - #curPCA <- listPCA.Samples[[sample.id[sample.pos]]] - eigenvect <- rbind(listEigenvector$eigenvector.ref, - listEigenvector$eigenvector) - - rownames(eigenvect) <- c(sample.ref[which(sample.ref != - study.annot$case.id[1])], - listEigenvector$sample.id) - - totR <- 1 - for(pcaD in pcaList) { - for(kV in seq_len(length(kList))) { - dCur <- paste0("d", pcaD) - kCur <- paste0("k", kList[kV]) - resMat[totR,c("D", "K")] <- c(pcaD, kList[kV]) - - pcaND <- eigenvect[ ,seq_len(pcaD)] - y_pred <- knn(train=pcaND[rownames(eigenvect)[-1*nrow(eigenvect)],], - test=pcaND[rownames(eigenvect)[nrow(eigenvect)],, - drop=FALSE], - cl=factor(spRef[rownames(eigenvect)[-1*nrow(eigenvect)]], - levels=listSuperPop, labels=listSuperPop), - k=kList[kV], - prob=FALSE) - - resMat[totR, paste0("SuperPop")] <- listSuperPop[as.integer(y_pred)] - - totR <- totR + 1 - } # end k - } # end pca Dim - listKNN <- list(sample.id=listEigenvector$sample.id, - sample1Kg=study.annot$case.id[1], - sp=spRef[study.annot$case.id[1]], matKNN=resMat) - - return(listKNN) -} - - -#' @title Deprecated +#' @param predCall a \code{character} string representing the name of +#' the column that contains the inferred ancestry for the specified +#' profiles. The column must be present in the \code{matKNN} \code{data.frame} +#' argument. #' -#' @description Deprecated +#' @param listCall a \code{vector} of \code{character} strings representing +#' the list of possible ancestry assignations. #' -#' @param listEigenvector TODO see return of computePCARefSample +#' @param kList a \code{vector} of \code{integer} representing the list of +#' values tested for the _K_ parameter. The _K_ parameter represents the +#' number of neighbors used in the K-nearest neighbor analysis. +#' Default: \code{seq(3,15,1)}. #' -#' @param sample.ref TODO +#' @param pcaList a \code{vector} of \code{integer} representing the list of +#' values tested for the _D_ parameter. The _D_ parameter represents the +#' number of dimensions used in the PCA analysis. +#' Default: \code{seq(2,15,1)}. #' -#' @param study.annot a \code{data.frame} with one entry from study.annot in -#' the gds +#' @return a \code{list} containing 5 entries: +#' \describe{ +#' \item{\code{dfPCA}}{ a \code{data.frame} containing statistical results +#' on all combined synthetic results done with a fixed value of \code{D} (the +#' number of dimensions). The \code{data.frame} contains those columns: +#' \describe{ +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions).} +#' \item{\code{median}}{ a \code{numeric} representing the median of the +#' minimum AUROC obtained (within super populations) for all combination of +#' the fixed \code{D} value and all tested \code{K} values. } +#' \item{\code{mad}}{ a \code{numeric} representing the MAD of the minimum +#' AUROC obtained (within super populations) for all combination of the fixed +#' \code{D} value and all tested \code{K} values. } +#' \item{\code{upQuartile}}{ a \code{numeric} representing the upper quartile +#' of the minimum AUROC obtained (within super populations) for all +#' combination of the fixed \code{D} value and all tested \code{K} values. } +#' \item{\code{k}}{ a \code{numeric} representing the optimal \code{K} value +#' (the number of neighbors) for a fixed \code{D} value. } +#' } +#' } +#' \item{\code{dfPop}}{ a \code{data.frame} containing statistical results on +#' all combined synthetic results done with different values of \code{D} (the +#' number of dimensions) and \code{K} (the number of neighbors). +#' The \code{data.frame} contains those columns: +#' \describe{ +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions).} +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors).} +#' \item{\code{AUROC.min}}{ a \code{numeric} representing the minimum accuracy +#' obtained by grouping all the synthetic results by super-populations, for +#' the specified values of \code{D} and \code{K}.} +#' \item{\code{AUROC}}{ a \code{numeric} representing the accuracy obtained +#' by grouping all the synthetic results for the specified values of \code{D} +#' and \code{K}.} +#' \item{\code{Accu.CM}}{ a \code{numeric} representing the value of accuracy +#' of the confusion matrix obtained by grouping all the synthetic results for +#' the specified values of \code{D} and \code{K}.} +#' } +#' } +#' \item{\code{D}}{ a \code{numeric} representing the optimal \code{D} value +#' (the number of dimensions) for the specific profile.} +#' \item{\code{K}}{ a \code{numeric} representing the optimal \code{K} value +#' (the number of neighbors) for the specific profile.} +#' \item{\code{listD}}{ a \code{numeric} representing the optimal \code{D} +#' values (the number of dimensions) for the specific profile. More than one +#' \code{D} is possible.} +#' } #' -#' @param spRef TODO +#' @examples #' -#' @param kList a \code{vector} of \code{integer} representing the list of -#' values tested for the _K_ parameter. The _K_ parameter represents the -#' number of neighbors used in the K-nearest neighbor analysis. If \code{NULL}, -#' the value \code{seq_len(15)} is assigned. -#' Default: \code{seq_len(15)}. +#' ## Loading demo dataset containing pedigree information for synthetic +#' ## profiles and known ancestry of the profiles used to generate the +#' ## synthetic profiles +#' data(pedSynthetic) #' -#' @param pcaList TODO array of the pca dimension possible values +#' ## Loading demo dataset containing the inferred ancestry results +#' ## for the synthetic data +#' data(matKNNSynthetic) #' -#' @return A \code{list} TODO with the sample.id and eigenvectors -#' and a table with KNN callfor different K and pca dimension. +#' ## Compile all the results for ancestry inference done on the +#' ## synthetic profiles for different D and K values +#' ## Select the optimal D and K values +#' results <- RAIDS:::selParaPCAUpQuartile(matKNN=matKNNSynthetic, +#' pedCall=pedSynthetic, refCall="superPop", predCall="SuperPop", +#' listCall=c("EAS", "EUR", "AFR", "AMR", "SAS"), kList=seq(3,15,1), +#' pcaList=seq(2,15,1)) +#' results$D +#' results$K #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @importFrom gdsfmt add.gdsn index.gdsn -#' @importFrom SNPRelate snpgdsPCA snpgdsPCASampLoading snpgdsPCASampLoading -#' @importFrom class knn +#' @importFrom stats mad median quantile #' @encoding UTF-8 #' @keywords internal -computeKNNSuperPopSample <- function(gdsSample, listEigenvector, name.id, - spRef, studyIDRef="Ref.1KG", - kList=seq_len(15), pcaList=2:15) { +selParaPCAUpQuartile <- function(matKNN, pedCall, refCall, + predCall, listCall, kList=seq(3,15,1), + pcaList=seq(2,15,1)) { - if(is.null(kList)) { - kList <- seq_len(15)#c(seq_len(14), seq(15,100, by=5)) - } - if(is.null(pcaList)) { - pcaList <- 2:15 - } - if(length(name.id) != 1) { - stop("Number of sample in study.annot not equal to 1\n") + if (min(kList) < 3) { + warning("A K smaller than 3 could not give robust results.\n") } - study.annot.all <- read.gdsn(index.gdsn(gdsSample, "study.annot")) - - sample.ref <- study.annot.all[which(study.annot.all$study.id == - studyIDRef), "data.id"] - - resMat <- data.frame(sample.id=rep(listEigenvector$sample.id, - length(pcaList) * length(kList)), - D=rep(0,length(pcaList) * length(kList)), - K=rep(0,length(pcaList) * length(kList)), - SuperPop=character(length(pcaList) * length(kList)), - stringsAsFactors=FALSE) - - listSuperPop <- c("EAS", "EUR", "AFR", "AMR", "SAS") - - eigenvect <- rbind(listEigenvector$eigenvector.ref, - listEigenvector$eigenvector) - - rownames(eigenvect) <- c(sample.ref, listEigenvector$sample.id) - - totR <- 1 - for(pcaD in pcaList) { - for(kV in seq_len(length(kList))) { - dCur <- paste0("d", pcaD) - kCur <- paste0("k", kList[kV]) - resMat[totR,c("D", "K")] <- c(pcaD, kList[kV]) - - pcaND <- eigenvect[ ,seq_len(pcaD)] - y_pred <- knn(train=pcaND[rownames(eigenvect)[-1*nrow(eigenvect)],], - test=pcaND[rownames(eigenvect)[nrow(eigenvect)],, drop=FALSE], - cl=factor(spRef[rownames(eigenvect)[-1*nrow(eigenvect)]], - levels=listSuperPop, labels=listSuperPop), - k=kList[kV],prob=FALSE) - - resMat[totR, paste0("SuperPop")] <- listSuperPop[as.integer(y_pred)] - - totR <- totR + 1 - } # end k - } # end pca Dim - listKNN <- list(sample.id=listEigenvector$sample.id, matKNN=resMat) - - return(listKNN) -} - + tableSyn <- list() + tableCall <- list() + tableAUROC <- list() + i <- 1 -#' @title Deprecated - Project patients onto existing principal component -#' axes (PCA) -#' -#' @description This function calculates the patient eigenvectors using -#' the specified SNP loadings. Deprecated -#' -#' @param gds an object of class -#' \code{\link[SNPRelate:SNPGDSFileClass]{SNPRelate::SNPGDSFileClass}}, a SNP -#' GDS file. + ## Loop on all PCA dimension values + for (D in pcaList) { + matKNNCurD <- matKNN[which(matKNN$D == D), ] + listTMP <- list() + listTMP.AUROC <- list() + j <- 1 + ## Loop on all k neighbor values + for (K in kList) { + matKNNCur <- matKNNCurD[which(matKNNCurD$K == K), ] + ## Calculate accuracy for fixed D and K values + res <- computeSyntheticConfMat(matKNN=matKNNCur, + matKNNAncestryColumn=predCall, pedCall=pedCall, + pedCallAncestryColumn=refCall, listCall=listCall) + resROC <- computeSyntheticROC(matKNN=matKNNCur, + matKNNAncestryColumn=predCall, pedCall=pedCall, + pedCallAncestryColumn=refCall, listCall=listCall) + + df <- data.frame(D=D, K=K, AUROC.min=min(resROC$matAUROC.Call$AUC), + AUROC=resROC$matAUROC.All$ROC.AUC, + Accu.CM=res$matAccuracy$Accu.CM) + + listTMP[[j]] <- df + listTMP.AUROC[[j]] <- resROC$matAUROC.Call + j <- j + 1 + } + df <- do.call(rbind, listTMP) + + tableCall[[i]] <- df + tableAUROC[[i]] <- do.call(rbind, listTMP.AUROC) + maxAUROC <- max(df[df$K %in% kList, "AUROC.min"]) + kMax <- df[df$K %in% kList & abs(df$AUROC.min-maxAUROC) < 1e-3, "K"] + kV <- kMax[(length(kMax) + length(kMax)%%2)/2] + dfPCA <- data.frame(D=D, + median=median(df[df$K %in% kList, "AUROC.min"]), + mad=mad(df[df$K %in% kList, "AUROC.min"]), + upQuartile=quantile(df[df$K %in% kList, "AUROC.min"], 0.75), K=kV) + tableSyn[[i]] <- dfPCA + i <- i + 1 + } + + dfPCA <- do.call(rbind, tableSyn) + dfCall <- do.call(rbind, tableCall) + dfAUROC <- do.call(rbind, tableAUROC) + selD <- dfPCA$D[which.max(dfPCA$upQuartile)] + selK <- dfPCA$K[which.max(dfPCA$upQuartile)] + tmp <- max(dfPCA$upQuartile) + listD <- dfPCA$D[which(abs(dfPCA$upQuartile - tmp) < 1e-3)] + + res <- list(dfPCA=dfPCA, dfPop=dfCall, dfAUROC=dfAUROC, + D=selD, K=selK, listD=listD) + return(res) +} + +#' @title Run most steps leading to the ancestry inference call on a +#' specific profile (RNA or DNA) +#' +#' @description This function runs most steps leading to the ancestry inference +#' call on a specific profile. First, the function creates the Profile GDS file +#' for the specific profile using the information from a RDS Sample +#' description file and the Population reference GDS file. +#' +#' @param gdsReference an object of class \code{\link[gdsfmt]{gds.class}} +#' (a GDS file), the opened Population Reference GDS file. +#' +#' @param gdsRefAnnot an object of class \code{\link[gdsfmt]{gds.class}} +#' (a GDS file), the opened Population Reference SNV Annotation GDS file. +#' This parameter is RNA specific. +#' +#' @param studyDF a \code{data.frame} containing the information about the +#' study associated to the analysed sample(s). The \code{data.frame} must have +#' those 3 columns: "study.id", "study.desc", "study.platform". All columns +#' must be in \code{character} strings (no factor). +#' +#' @param currentProfile a \code{character} string representing the profile +#' identifier. +#' +#' @param pathProfileGDS a \code{character} string representing the path to +#' the directory where the GDS Profile files will be created. +#' Default: \code{NULL}. +#' +#' @param pathOut a \code{character} string representing the path to +#' the directory where the output files are created. +#' +#' @param chrInfo a \code{vector} of positive \code{integer} values +#' representing the length of the chromosomes. See 'details' section. +#' +#' @param syntheticRefDF a \code{data.frame} containing a subset of +#' reference profiles for each sub-population present in the Reference GDS +#' file. The \code{data.frame} must have those columns: +#' \describe{ +#' \item{sample.id}{ a \code{character} string representing the sample +#' identifier. } +#' \item{pop.group}{ a \code{character} string representing the +#' subcontinental population assigned to the sample. } +#' \item{superPop}{ a \code{character} string representing the +#' super-population assigned to the sample. } +#' } +#' +#' @param studyDFSyn a \code{data.frame} containing the information about the +#' synthetic data to the analysed sample(s). The \code{data.frame} must have +#' those 3 columns: "study.id", "study.desc", "study.platform". All columns +#' must be in \code{character} strings (no factor). +#' +#' @param listProfileRef a \code{vector} of \code{character} string +#' representing the +#' identifiers of the selected 1KG profiles that will be used as reference to +#' generate the synthetic profiles. +#' +#' @param studyType a \code{character} string representing the type of study. +#' The possible choices are: "DNA" and "RNA". The type of study affects the +#' way the estimation of the allelic fraction is done. Default: \code{"DNA"}. +#' +#' @param np a single positive \code{integer} specifying the number of +#' threads to be used. Default: \code{1L}. +#' +#' @param blockTypeID a \code{character} string corresponding to the block +#' type used to extract the block identifiers. The block type must be +#' present in the GDS Reference Annotation file. +#' +#' @param verbose a \code{logical} indicating if messages should be printed +#' to show how the different steps in the function. Default: \code{FALSE}. +#' +#' @return The integer \code{0L} when successful. See details section for +#' more information about the generated output files. +#' +#' @details +#' +#' The runWrapperAncestry() function generates 3 types of files +#' in the \code{pathOut} directory: +#' \describe{ +#' \item{Ancestry Inference}{ The ancestry inference CSV file +#' (".Ancestry.csv" file)} +#' \item{Inference Informaton}{ The inference information RDS file +#' (".infoCall.rds" file)} +#' \item{Synthetic Information}{ The parameter information RDS files +#' from the synthetic inference ("KNN.synt.*.rds" files in a sub-directory)} +#' } +#' +#' In addition, a sub-directory (named using the profile ID) is +#' also created. +#' +#' @references +#' +#' Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, +#' Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution +#' of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. +#' doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. +#' +#' @examples +#' +#' ## Required library for GDS +#' library(SNPRelate) +#' +#' ## Path to the demo 1KG GDS file is located in this package +#' dataDir <- system.file("extdata", package="RAIDS") +#' +#' ################################################################# +#' ## Load the information about the profile +#' ################################################################# +#' data(demoPedigreeEx1) +#' head(demoPedigreeEx1) +#' +#' ################################################################# +#' ## The 1KG GDS file and the 1KG SNV Annotation GDS file +#' ## need to be located in the same directory +#' ## Note that the 1KG GDS file used for this example is a +#' ## simplified version and CANNOT be used for any real analysis +#' ################################################################# +#' path1KG <- file.path(dataDir, "tests") +#' +#' fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") +#' fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") +#' +#' ################################################################# +#' ## The Sample SNP pileup files (one per sample) need +#' ## to be located in the same directory. +#' ################################################################# +#' pathGeno <- file.path(dataDir, "example", "snpPileup") +#' +#' ################################################################# +#' ## The path where the Profile GDS Files (one per sample) +#' ## will be created need to be specified. +#' ################################################################# +#' +#' pathProfileGDS <- file.path(tempdir(), "outTest.tmp") +#' +#' pathOut <- file.path(tempdir(), "resTest.out") +#' +#' ################################################################# +#' ## A data frame containing general information about the study +#' ## is also required. The data frame must have +#' ## those 3 columns: "studyID", "study.desc", "study.platform" +#' ################################################################# +#' studyDF <- data.frame(study.id="MYDATA", +#' study.desc="Description", +#' study.platform="PLATFORM", +#' stringsAsFactors=FALSE) +#' +#' #################################################################### +#' ## Fix seed to ensure reproducible results +#' #################################################################### +#' set.seed(3043) +#' +#' gdsReference <- snpgdsOpen(fileReferenceGDS) +#' dataRef <- select1KGPop(gdsReference, nbProfiles=2L) +#' closefn.gds(gdsReference) +#' +#' ## Required library for this example to run correctly +#' if (requireNamespace("GenomeInfoDb", quietly=TRUE) && +#' requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { +#' +#' ## Chromosome length information +#' ## chr23 is chrX, chr24 is chrY and chrM is 25 +#' chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] +#' +#' studyDFSyn <- data.frame(study.id=paste0(studyDF$study.id, ".Synthetic"), +#' study.desc=paste0(studyDF$study.id, " synthetic data"), +#' study.platform=studyDF$study.platform, stringsAsFactors=FALSE) +#' +#' listProfileRef <- dataRef$sample.id +#' profileFile <- file.path(pathProfileGDS, "ex1.gds") +#' +#' \dontrun{ +#' +#' dir.create(pathProfileGDS) +#' dir.create(pathOut) +#' file.copy(file.path(dataDir, "tests", "ex1_demo.gds"), profileFile) +#' +#' gdsReference <- snpgdsOpen(fileReferenceGDS) +#' gdsRefAnnot <- openfn.gds(fileAnnotGDS) +#' +#' RAIDS:::runProfileAncestry(gdsReference=gdsReference, +#' gdsRefAnnot=gdsRefAnnot, +#' studyDF=studyDF, currentProfile=ped[1,"Name.ID"], +#' pathProfileGDS=pathProfileGDS, +#' pathOut=pathOut, +#' chrInfo=chrInfo, +#' syntheticRefDF=dataRef, +#' studyDFSyn=studyDFSyn, +#' listProfileRef=listProfileRef, +#' studyType="DNA") +#' +#' closefn.gds(gdsReference) +#' closefn.gds(gdsRefAnnot) +#' +#' unlink(pathProfileGDS, recursive=TRUE, force=TRUE) +#' unlink(pathOut, recursive=TRUE, force=TRUE) +#' +#' } +#' } +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' @importFrom utils write.csv +#' @importFrom rlang arg_match +#' @encoding UTF-8 +#' @keywords internal +runProfileAncestry <- function(gdsReference, gdsRefAnnot, studyDF, + currentProfile, pathProfileGDS, pathOut, chrInfo, syntheticRefDF, + studyDFSyn, listProfileRef, studyType=c("DNA", "RNA"), + np=1L, blockTypeID=NULL, verbose=FALSE) { + + studyType <- arg_match(studyType) + + pruningSample(gdsReference=gdsReference, currentProfile=currentProfile, + studyID=studyDF$study.id, pathProfileGDS=pathProfileGDS, np=np) + + fileGDSProfile <- file.path(pathProfileGDS, + paste0(currentProfile, ".gds")) + add1KG2SampleGDS(gdsReference=gdsReference, fileProfileGDS=fileGDSProfile, + currentProfile=currentProfile, studyID=studyDF$study.id) + + addStudy1Kg(gdsReference, fileGDSProfile) + + gdsProfile <- openfn.gds(fileGDSProfile, readonly=FALSE) + + estimateAllelicFraction(gdsReference=gdsReference, gdsProfile=gdsProfile, + currentProfile=currentProfile, studyID=studyDF$study.id, + chrInfo=chrInfo, studyType=studyType, gdsRefAnnot=gdsRefAnnot, + blockID=blockTypeID, verbose=verbose) + closefn.gds(gdsProfile) + + ## Add information related to the synthetic profiles in Profile GDS file + prepSynthetic(fileProfileGDS=fileGDSProfile, + listSampleRef=listProfileRef, profileID=currentProfile, + studyDF=studyDFSyn, prefix="1", verbose=verbose) + + resG <- syntheticGeno(gdsReference=gdsReference, gdsRefAnnot=gdsRefAnnot, + fileProfileGDS=fileGDSProfile, profileID=currentProfile, + listSampleRef=listProfileRef, prefix="1") + + if(! file.exists(pathOut)) { + dir.create(pathOut) + } + spRef <- getRef1KGPop(gdsReference, "superPop") + sampleRM <- splitSelectByPop(syntheticRefDF) + + pathOutProfile <- file.path(pathOut, currentProfile) + if(! file.exists(pathOutProfile)) { + dir.create(pathOutProfile) + } + + ## Open the Profile GDS file + gdsProfile <- snpgdsOpen(fileGDSProfile) + + + ## This variable will contain the results from the PCA analyses + ## For each row of the sampleRM matrix + apply(t(t(seq_len(nrow(sampleRM)))), 1, FUN=function(x, sampleRM, + gdsProfile, studyDFSyn, spRef, + pathOutProfile, currentProfile) { + synthKNN <- computePoolSyntheticAncestryGr(gdsProfile=gdsProfile, + sampleRM=sampleRM[x,], + studyIDSyn=studyDFSyn$study.id, + np=np, spRef=spRef, eigenCount=15L, + verbose=verbose) + + ## Results are saved + saveRDS(synthKNN$matKNN, file.path(pathOutProfile, + paste0("KNN.synt.", currentProfile, ".", x, ".rds"))) + return(NULL) + }, sampleRM=sampleRM, gdsProfile=gdsProfile, + studyDFSyn=studyDFSyn, spRef=spRef, pathOutProfile=pathOutProfile, + currentProfile=currentProfile) + + + ## Directory where the KNN results have been saved + pathKNN <- file.path(pathOut, currentProfile) + listFilesName <- dir(file.path(pathKNN), ".rds") + ## List of the KNN result files from PCA on synthetic data + listFiles <- file.path(file.path(pathKNN) , listFilesName) + + resCall <- computeAncestryFromSyntheticFile(gdsReference=gdsReference, + gdsProfile=gdsProfile, listFiles=listFiles, + currentProfile=currentProfile, spRef=spRef, + studyIDSyn=studyDFSyn$study.id, np=np) + + saveRDS(resCall, file.path(pathOut, + paste0(currentProfile, ".infoCall", ".rds"))) + + write.csv(x=resCall$Ancestry, file=file.path(pathOut, + paste0(currentProfile, ".Ancestry",".csv")), quote=FALSE, + row.names=FALSE) + + ## Close Profile GDS file (important) + closefn.gds(gdsProfile) + + return(0L) +} + +#' @title Run most steps leading to the ancestry inference call on a +#' specific profile (LD or geneAware) +#' +#' @description This function runs most steps leading to the ancestry inference +#' call on a specific profile. First, the function creates the Profile GDS file +#' for the specific profile using the information from a RDS Sample +#' description file and the Population reference GDS file. +#' +#' @param gdsReference an object of class \code{\link[gdsfmt]{gds.class}} +#' (a GDS file), the opened Population Reference GDS file. +#' +#' @param gdsRefAnnot an object of class \code{\link[gdsfmt]{gds.class}} +#' (a GDS file), the opened Population Reference SNV Annotation GDS file. +#' This parameter is RNA specific. +#' +#' @param studyDF a \code{data.frame} containing the information about the +#' study associated to the analysed sample(s). The \code{data.frame} must have +#' those 3 columns: "study.id", "study.desc", "study.platform". All columns +#' must be in \code{character} strings (no factor). +#' +#' @param currentProfile a \code{character} string representing the profile +#' identifier. +#' +#' @param pathProfileGDS a \code{character} string representing the path to +#' the directory where the GDS Profile files will be created. +#' Default: \code{NULL}. +#' +#' @param chrInfo a \code{vector} of positive \code{integer} values +#' representing the length of the chromosomes. See 'details' section. +#' +#' @param syntheticRefDF a \code{data.frame} containing a subset of +#' reference profiles for each sub-population present in the Reference GDS +#' file. The \code{data.frame} must have those columns: +#' \describe{ +#' \item{sample.id}{ a \code{character} string representing the sample +#' identifier. } +#' \item{pop.group}{ a \code{character} string representing the +#' subcontinental population assigned to the sample. } +#' \item{superPop}{ a \code{character} string representing the +#' super-population assigned to the sample. } +#' } +#' +#' @param studyDFSyn a \code{data.frame} containing the information about the +#' synthetic data to the analysed sample(s). The \code{data.frame} must have +#' those 3 columns: "study.id", "study.desc", "study.platform". All columns +#' must be in \code{character} strings (no factor). +#' +#' @param listProfileRef a \code{vector} of \code{character} string +#' representing the +#' identifiers of the selected 1KG profiles that will be used as reference to +#' generate the synthetic profiles. +#' +#' @param studyType a \code{character} string representing the type of study. +#' The possible choices are: "LD" and "GeneAware". The type of study affects the +#' way the estimation of the allelic fraction is done. Default: \code{"LD"}. +#' +#' @param np a single positive \code{integer} specifying the number of +#' threads to be used. Default: \code{1L}. +#' +#' @param blockTypeID a \code{character} string corresponding to the block +#' type used to extract the block identifiers. The block type must be +#' present in the GDS Reference Annotation file. +#' +#' @param verbose a \code{logical} indicating if messages should be printed +#' to show how the different steps in the function. Default: \code{FALSE}. +#' +#' @return a \code{list} containing 4 entries: +#' \describe{ +#' \item{\code{pcaSample}}{ a \code{list} containing the information related +#' to the eigenvectors. The \code{list} contains those 3 entries: +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current profile.} +#' \item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric} containing +#' the eigenvectors for the reference profiles.} +#' \item{\code{eigenvector}}{ a \code{matrix} of \code{numeric} containing the +#' eigenvectors for the current profile projected on the PCA from the +#' reference profiles.} +#' } +#' } +#' \item{\code{paraSample}}{ a \code{list} containing the results with +#' different \code{D} and \code{K} values that lead to optimal parameter +#' selection. The \code{list} contains those entries: +#' \describe{ +#' \item{\code{dfPCA}}{ a \code{data.frame} containing statistical results +#' on all combined synthetic results done with a fixed value of \code{D} (the +#' number of dimensions). The \code{data.frame} contains those columns: +#' \describe{ +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions).} +#' \item{\code{median}}{ a \code{numeric} representing the median of the +#' minimum AUROC obtained (within super populations) for all combination of +#' the fixed \code{D} value and all tested \code{K} values. } +#' \item{\code{mad}}{ a \code{numeric} representing the MAD of the minimum +#' AUROC obtained (within super populations) for all combination of the fixed +#' \code{D} value and all tested \code{K} values. } +#' \item{\code{upQuartile}}{ a \code{numeric} representing the upper quartile +#' of the minimum AUROC obtained (within super populations) for all +#' combination of the fixed \code{D} value and all tested \code{K} values. } +#' \item{\code{k}}{ a \code{numeric} representing the optimal \code{K} value +#' (the number of neighbors) for a fixed \code{D} value. } +#' } +#' } +#' \item{\code{dfPop}}{ a \code{data.frame} containing statistical results on +#' all combined synthetic results done with different values of \code{D} (the +#' number of dimensions) and \code{K} (the number of neighbors). +#' The \code{data.frame} contains those columns: +#' \describe{ +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions).} +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors).} +#' \item{\code{AUROC.min}}{ a \code{numeric} representing the minimum accuracy +#' obtained by grouping all the synthetic results by super-populations, for +#' the specified values of \code{D} and \code{K}.} +#' \item{\code{AUROC}}{ a \code{numeric} representing the accuracy obtained +#' by grouping all the synthetic results for the specified values of \code{D} +#' and \code{K}.} +#' \item{\code{Accu.CM}}{ a \code{numeric} representing the value of accuracy +#' of the confusion matrix obtained by grouping all the synthetic results for +#' the specified values of \code{D} and \code{K}.} +#' } +#' } +#' \item{\code{dfAUROC}}{ a \code{data.frame} the summary of the results by +#' super-population. The \code{data.frame} contains +#' those columns: +#' \describe{ +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions).} +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors).} +#' \item{\code{Call}}{ a \code{character} string representing the +#' super-population.} +#' \item{\code{L}}{ a \code{numeric} representing the lower value of the 95% +#' confidence interval for the AUROC obtained for the fixed values of +#' super-population, \code{D} and \code{K}.} +#' \item{\code{AUROC}}{ a \code{numeric} representing the AUROC obtained for the +#' fixed values of super-population, \code{D} and \code{K}.} +#' \item{\code{H}}{ a \code{numeric} representing the higher value of the 95% +#' confidence interval for the AUROC obtained for the fixed values of +#' super-population, \code{D} and \code{K}.} +#' } +#' } +#' \item{\code{D}}{ a \code{numeric} representing the optimal \code{D} value +#' (the number of dimensions) for the specific profile.} +#' \item{\code{K}}{ a \code{numeric} representing the optimal \code{K} value +#' (the number of neighbors) for the specific profile.} +#' \item{\code{listD}}{ a \code{numeric} representing the optimal \code{D} +#' values (the number of dimensions) for the specific profile. More than one +#' \code{D} is possible.} +#' } +#' } +#' \item{\code{KNNSample}}{ a \code{data.frame} containing the inferred ancestry +#' for different values of \code{K} and \code{D}. The \code{data.frame} +#' contains those columns: +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current profile.} +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions) used to infer the ancestry. } +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors) used to infer the ancestry. } +#' \item{\code{SuperPop}}{ a \code{character} string representing the inferred +#' ancestry for the specified \code{D} and \code{K} values.} +#' } +#' } +#' \item{\code{KNNSynthetic}}{ a \code{data.frame} containing the inferred ancestry +#' for each synthetic data for different values of \code{K} and \code{D}. +#' The \code{data.frame} +#' contains those columns: "sample.id", "D", "K", "infer.superPop", "ref.superPop" +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current synthetic data.} +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions) used to infer the ancestry. } +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors) used to infer the ancestry. } +#' \item{\code{infer.superPop}}{ a \code{character} string representing the inferred +#' ancestry for the specified \code{D} and \code{K} values.} +#' \item{\code{ref.superPop}}{ a \code{character} string representing the known +#' ancestry from the reference} +#' } +#' } +#' \item{\code{Ancestry}}{ a \code{data.frame} containing the inferred +#' ancestry for the current profile. The \code{data.frame} contains those +#' columns: +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current profile.} +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions) used to infer the ancestry.} +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors) used to infer the ancestry.} +#' \item{\code{SuperPop}}{ a \code{character} string representing the inferred +#' ancestry.} +#' } +#' } +#' } +#' +#' @references +#' +#' Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, +#' Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution +#' of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. +#' doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. +#' +#' @examples +#' +#' ## Required library for GDS +#' library(SNPRelate) +#' +#' ## Path to the demo 1KG GDS file is located in this package +#' dataDir <- system.file("extdata", package="RAIDS") +#' +#' ################################################################# +#' ## Load the information about the profile +#' ################################################################# +#' data(demoPedigreeEx1) +#' head(demoPedigreeEx1) +#' +#' ################################################################# +#' ## The 1KG GDS file and the 1KG SNV Annotation GDS file +#' ## need to be located in the same directory +#' ## Note that the 1KG GDS file used for this example is a +#' ## simplified version and CANNOT be used for any real analysis +#' ################################################################# +#' path1KG <- file.path(dataDir, "tests") +#' +#' fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") +#' fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") +#' +#' ################################################################# +#' ## The Sample SNP pileup files (one per sample) need +#' ## to be located in the same directory. +#' ################################################################# +#' pathGeno <- file.path(dataDir, "example", "snpPileup") +#' +#' ################################################################# +#' ## The path where the Profile GDS Files (one per sample) +#' ## will be created need to be specified. +#' ################################################################# +#' +#' pathProfileGDS <- file.path(tempdir(), "outTest.tmp") +#' +#' +#' ################################################################# +#' ## A data frame containing general information about the study +#' ## is also required. The data frame must have +#' ## those 3 columns: "studyID", "study.desc", "study.platform" +#' ################################################################# +#' studyDF <- data.frame(study.id="MYDATA", +#' study.desc="Description", +#' study.platform="PLATFORM", +#' stringsAsFactors=FALSE) +#' +#' #################################################################### +#' ## Fix seed to ensure reproducible results +#' #################################################################### +#' set.seed(3043) +#' +#' dataRef <- select1KGPopForSynthetic(fileReferenceGDS, nbProfiles=2L) +#' +#' ## Required library for this example to run correctly +#' if (requireNamespace("GenomeInfoDb", quietly=TRUE) && +#' requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { +#' +#' ## Chromosome length information +#' ## chr23 is chrX, chr24 is chrY and chrM is 25 +#' chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] +#' +#' studyDFSyn <- data.frame(study.id=paste0(studyDF$study.id, ".Synthetic"), +#' study.desc=paste0(studyDF$study.id, " synthetic data"), +#' study.platform=studyDF$study.platform, stringsAsFactors=FALSE) +#' +#' listProfileRef <- dataRef$sample.id +#' profileFile <- file.path(pathProfileGDS, "ex1.gds") +#' +#' \dontrun{ +#' +#' dir.create(pathProfileGDS) +#' file.copy(file.path(dataDir, "tests", "ex1_demo.gds"), profileFile) +#' +#' gdsReference <- snpgdsOpen(fileReferenceGDS) +#' gdsRefAnnot <- openfn.gds(fileAnnotGDS) +#' +#' res <- RAIDS:::profileAncestry(gdsReference=gdsReference, +#' gdsRefAnnot=gdsRefAnnot, +#' studyDF=studyDF, currentProfile=demoPedigreeEx1[1,"Name.ID"], +#' pathProfileGDS=pathProfileGDS, +#' chrInfo=chrInfo, +#' syntheticRefDF=dataRef, +#' studyDFSyn=studyDFSyn, +#' listProfileRef=listProfileRef, +#' studyType="LD") +#' +#' closefn.gds(gdsReference) +#' closefn.gds(gdsRefAnnot) +#' +#' +#' } +#' } +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' @importFrom utils write.csv +#' @importFrom rlang arg_match +#' @encoding UTF-8 +#' @keywords internal +profileAncestry <- function(gdsReference, gdsRefAnnot, studyDF, + currentProfile, pathProfileGDS, chrInfo, syntheticRefDF, + studyDFSyn, listProfileRef, studyType=c("LD", "GeneAware"), + np=1L, blockTypeID=NULL, verbose=FALSE) { + # This part can be share with runProfileAncestry + studyType <- arg_match(studyType) + if(verbose){ + message("pruningSample start ", Sys.time()) + } + pruningSample(gdsReference=gdsReference, currentProfile=currentProfile, + studyID=studyDF$study.id, pathProfileGDS=pathProfileGDS, np=np) + if(verbose){ + message("pruningSample end ", Sys.time()) + } + fileGDSProfile <- file.path(pathProfileGDS, + paste0(currentProfile, ".gds")) + + add1KG2SampleGDS(gdsReference=gdsReference, fileProfileGDS=fileGDSProfile, + currentProfile=currentProfile, studyID=studyDF$study.id) + if(verbose){ + message("add1KG start ", Sys.time()) + } + addStudy1Kg(gdsReference, fileGDSProfile) + + gdsProfile <- openfn.gds(fileGDSProfile, readonly=FALSE) + # Change for the old studyType + studyTypeLeg <- ifelse(studyType=="LD", "DNA", "RNA") + estimateAllelicFraction(gdsReference=gdsReference, gdsProfile=gdsProfile, + currentProfile=currentProfile, studyID=studyDF$study.id, + chrInfo=chrInfo, studyType=studyTypeLeg, gdsRefAnnot=gdsRefAnnot, + blockID=blockTypeID, verbose=verbose) + closefn.gds(gdsProfile) + + ## Add information related to the synthetic profiles in Profile GDS file + prepSynthetic(fileProfileGDS=fileGDSProfile, + listSampleRef=listProfileRef, profileID=currentProfile, + studyDF=studyDFSyn, prefix="1", verbose=verbose) + if(verbose){ + message("syntheticGeno start ", Sys.time()) + } + resG <- syntheticGeno(gdsReference=gdsReference, gdsRefAnnot=gdsRefAnnot, + fileProfileGDS=fileGDSProfile, profileID=currentProfile, + listSampleRef=listProfileRef, prefix="1") + + # if(! file.exists(pathOut)) { + # dir.create(pathOut) + # } + # + spRef <- getRef1KGPop(gdsReference, "superPop") + sampleRM <- splitSelectByPop(syntheticRefDF) + # + # pathOutProfile <- file.path(pathOut, currentProfile) + # if(! file.exists(pathOutProfile)) { + # dir.create(pathOutProfile) + # } + + ##### End share with runProfileAncestry + + ## Open the Profile GDS file + gdsProfile <- snpgdsOpen(fileGDSProfile) + + if(verbose){ + message("SyntheticAncestry start ", Sys.time()) + } + ## This variable will contain the results from the PCA analyses + ## For each row of the sampleRM matrix + resSyn <- lapply(seq_len(nrow(sampleRM)), FUN=function(x, sampleRM, + gdsProfile, studyDFSyn, spRef, currentProfile) { + synthKNN <- computePoolSyntheticAncestryGr(gdsProfile=gdsProfile, + sampleRM=sampleRM[x,], + studyIDSyn=studyDFSyn$study.id, + np=np, spRef=spRef, eigenCount=15L, + verbose=verbose) + ## Results are saved + # saveRDS(synthKNN$matKNN, file.path(pathOutProfile, + # paste0("KNN.synt.", currentProfile, ".", x, ".rds"))) + return(synthKNN$matKNN) + }, sampleRM=sampleRM, gdsProfile=gdsProfile, studyDFSyn=studyDFSyn, + spRef=spRef, currentProfile=currentProfile) + + if(verbose){ + message("SyntheticAncestry end ", Sys.time()) + } + resSyn <- do.call(rbind, resSyn) + ## Extract the super-population information from the 1KG GDS file + ## for profiles associated to the synthetic study + pedSyn <- prepPedSynthetic1KG(gdsReference=gdsReference, + gdsSample=gdsProfile, studyID=studyDFSyn$study.id, popName="superPop") + + + # idCur <- matrix(unlist(strsplit(resSyn$sample.id, "\\.")), nr=4) + # resKNN$superPopObs <- df[idCur[3,], 3] + # ## Directory where the KNN results have been saved + # pathKNN <- file.path(pathOut, currentProfile) + # listFilesName <- dir(file.path(pathKNN), ".rds") + # ## List of the KNN result files from PCA on synthetic data + # listFiles <- file.path(file.path(pathKNN) , listFilesName) + + resCall <- computeAncestryFromSynthetic(gdsReference=gdsReference, + gdsProfile=gdsProfile, syntheticKNN=resSyn, + pedSyn=pedSyn, currentProfile=currentProfile, spRef=spRef, + studyIDSyn=studyDFSyn$study.id, np=np) + if(verbose){ + message("Ancestry end ", Sys.time()) + } + # saveRDS(resCall, file.path(pathOut, + # paste0(currentProfile, ".infoCall", ".rds"))) + # + # write.csv(x=resCall$Ancestry, file=file.path(pathOut, + # paste0(currentProfile, ".Ancestry",".csv")), quote=FALSE, + # row.names=FALSE) + + ## Close Profile GDS file (important) + closefn.gds(gdsProfile) + resSyn[[paste0("ref.superPop")]] <- pedSyn[resSyn$sample.id, "superPop"] + + colnames(resSyn) <- c("sample.id", "D", "K", "infer.superPop", + "ref.superPop") + + res <- list(pcaSample=resCall$pcaSample, # PCA of the profile + 1KG + paraSample=resCall$paraSample, # Result of the parameter selection + KNNSample=resCall$KNNSample$matKNN, # KNN for the profile + KNNSynthetic=resSyn, # KNN results for synthetic data + Ancestry=resCall$Ancestry) # the ancestry call fo the profile + + return(res) +} + +#' @title Select the optimal K and D parameters using the synthetic data and +#' infer the ancestry of a specific profile +#' +#' @description The function select the optimal K and D parameters for a +#' specific profile. The results on the synthetic data are used for the +#' parameter selection. Once the optimal parameters are selected, the +#' ancestry is inferred for the specific profile. +#' +#' @param gdsReference an object of class \link[gdsfmt]{gds.class} (a GDS +#' file), the opened 1KG GDS file. +#' +#' @param gdsProfile an object of class \code{\link[gdsfmt]{gds.class}} +#' (a GDS file), the opened Profile GDS file. +#' +#' @param syntheticKNN a \code{vector} of \code{character} strings representing +#' the name of files that contain the results of ancestry inference done on +#' the synthetic profiles for multiple values of _D_ and _K_. The files must +#' exist. +#' +#' @param pedSyn a \code{data.frame} containing the columns extracted from the +#' GDS Sample 'study.annot' node with a extra column named as the 'popName' +#' parameter that has been extracted from the 1KG GDS 'sample.annot' node. +#' +#' @param currentProfile a \code{character} string representing the profile +#' identifier of the current profile on which ancestry will be inferred. +#' +#' @param spRef a \code{vector} of \code{character} strings representing the +#' known super population ancestry for the 1KG profiles. The 1KG profile +#' identifiers are used as names for the \code{vector}. +#' +#' @param studyIDSyn a \code{character} string corresponding to the study +#' identifier. The study identifier must be present in the GDS Sample file. +#' +#' @param np a single positive \code{integer} representing the number of +#' threads. Default: \code{1L}. +#' +#' @param listCatPop a \code{vector} of \code{character} string +#' representing the list of possible ancestry assignations. Default: +#' \code{("EAS", "EUR", "AFR", "AMR", "SAS")}. +#' +#' @param fieldPopIn1KG a \code{character} string representing the name of the +#' column that contains the known ancestry for the reference profiles in +#' the Reference GDS file. +#' +#' @param fieldPopInfAnc a \code{character} string representing the name of +#' the column that will contain the inferred ancestry for the specified +#' profiles. Default: \code{"SuperPop"}. +#' +#' @param kList a \code{vector} of \code{integer} representing the list of +#' values tested for the _K_ parameter. The _K_ parameter represents the +#' number of neighbors used in the K-nearest neighbor analysis. If \code{NULL}, +#' the value \code{seq(2,15,1)} is assigned. +#' Default: \code{seq(2,15,1)}. +#' +#' @param pcaList a \code{vector} of \code{integer} representing the list of +#' values tested for the _D_ parameter. The _D_ parameter represents the +#' number of dimensions used in the PCA analysis. If \code{NULL}, +#' the value \code{seq(2,15,1)} is assigned. +#' Default: \code{seq(2,15,1)}. +#' +#' @param algorithm a \code{character} string representing the algorithm used +#' to calculate the PCA. The 2 choices are "exact" (traditional exact +#' calculation) and "randomized" (fast PCA with randomized algorithm +#' introduced in Galinsky et al. 2016). Default: \code{"exact"}. +#' +#' @param eigenCount a single \code{integer} indicating the number of +#' eigenvectors that will be in the output of the \link[SNPRelate]{snpgdsPCA} +#' function; if 'eigenCount' <= 0, then all eigenvectors are returned. +#' Default: \code{32L}. +#' +#' @param missingRate a \code{numeric} value representing the threshold +#' missing rate at with the SNVs are discarded; the SNVs are retained in the +#' \link[SNPRelate]{snpgdsPCA} +#' with "<= missingRate" only; if \code{NaN}, no missing threshold. +#' Default: \code{NaN}. +#' +#' @param verbose a \code{logical} indicating if messages should be printed +#' to show how the different steps in the function. Default: \code{FALSE}. +#' +#' @return a \code{list} containing 4 entries: +#' \describe{ +#' \item{\code{pcaSample}}{ a \code{list} containing the information related +#' to the eigenvectors. The \code{list} contains those 3 entries: +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current profile.} +#' \item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric} containing +#' the eigenvectors for the reference profiles.} +#' \item{\code{eigenvector}}{ a \code{matrix} of \code{numeric} containing the +#' eigenvectors for the current profile projected on the PCA from the +#' reference profiles.} +#' } +#' } +#' \item{\code{paraSample}}{ a \code{list} containing the results with +#' different \code{D} and \code{K} values that lead to optimal parameter +#' selection. The \code{list} contains those entries: +#' \describe{ +#' \item{\code{dfPCA}}{ a \code{data.frame} containing statistical results +#' on all combined synthetic results done with a fixed value of \code{D} (the +#' number of dimensions). The \code{data.frame} contains those columns: +#' \describe{ +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions).} +#' \item{\code{median}}{ a \code{numeric} representing the median of the +#' minimum AUROC obtained (within super populations) for all combination of +#' the fixed \code{D} value and all tested \code{K} values. } +#' \item{\code{mad}}{ a \code{numeric} representing the MAD of the minimum +#' AUROC obtained (within super populations) for all combination of the fixed +#' \code{D} value and all tested \code{K} values. } +#' \item{\code{upQuartile}}{ a \code{numeric} representing the upper quartile +#' of the minimum AUROC obtained (within super populations) for all +#' combination of the fixed \code{D} value and all tested \code{K} values. } +#' \item{\code{k}}{ a \code{numeric} representing the optimal \code{K} value +#' (the number of neighbors) for a fixed \code{D} value. } +#' } +#' } +#' \item{\code{dfPop}}{ a \code{data.frame} containing statistical results on +#' all combined synthetic results done with different values of \code{D} (the +#' number of dimensions) and \code{K} (the number of neighbors). +#' The \code{data.frame} contains those columns: +#' \describe{ +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions).} +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors).} +#' \item{\code{AUROC.min}}{ a \code{numeric} representing the minimum accuracy +#' obtained by grouping all the synthetic results by super-populations, for +#' the specified values of \code{D} and \code{K}.} +#' \item{\code{AUROC}}{ a \code{numeric} representing the accuracy obtained +#' by grouping all the synthetic results for the specified values of \code{D} +#' and \code{K}.} +#' \item{\code{Accu.CM}}{ a \code{numeric} representing the value of accuracy +#' of the confusion matrix obtained by grouping all the synthetic results for +#' the specified values of \code{D} and \code{K}.} +#' } +#' } +#' \item{\code{dfAUROC}}{ a \code{data.frame} the summary of the results by +#' super-population. The \code{data.frame} contains +#' those columns: +#' \describe{ +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions).} +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors).} +#' \item{\code{Call}}{ a \code{character} string representing the +#' super-population.} +#' \item{\code{L}}{ a \code{numeric} representing the lower value of the 95% +#' confidence interval for the AUROC obtained for the fixed values of +#' super-population, \code{D} and \code{K}.} +#' \item{\code{AUROC}}{ a \code{numeric} representing the AUROC obtained for the +#' fixed values of super-population, \code{D} and \code{K}.} +#' \item{\code{H}}{ a \code{numeric} representing the higher value of the 95% +#' confidence interval for the AUROC obtained for the fixed values of +#' super-population, \code{D} and \code{K}.} +#' } +#' } +#' \item{\code{D}}{ a \code{numeric} representing the optimal \code{D} value +#' (the number of dimensions) for the specific profile.} +#' \item{\code{K}}{ a \code{numeric} representing the optimal \code{K} value +#' (the number of neighbors) for the specific profile.} +#' \item{\code{listD}}{ a \code{numeric} representing the optimal \code{D} +#' values (the number of dimensions) for the specific profile. More than one +#' \code{D} is possible.} +#' } +#' } +#' \item{\code{KNNSample}}{ a \code{list} containing the inferred ancestry +#' using different \code{D} and \code{K} values. The \code{list} contains +#' those entries: +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current profile.} +#' \item{\code{matKNN}}{ a \code{data.frame} containing the inferred ancestry +#' for different values of \code{K} and \code{D}. The \code{data.frame} +#' contains those columns: +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current profile.} +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions) used to infer the ancestry. } +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors) used to infer the ancestry. } +#' \item{\code{SuperPop}}{ a \code{character} string representing the inferred +#' ancestry for the specified \code{D} and \code{K} values.} +#' } +#' } +#' } +#' } +#' \item{\code{Ancestry}}{ a \code{data.frame} containing the inferred +#' ancestry for the current profile. The \code{data.frame} contains those +#' columns: +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current profile.} +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions) used to infer the ancestry.} +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors) used to infer the ancestry.} +#' \item{\code{SuperPop}}{ a \code{character} string representing the inferred +#' ancestry.} +#' } +#' } +#' } +#' +#' +#' @references +#' +#' Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, +#' Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution +#' of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. +#' doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. +#' +#' @examples +#' +#' +#' ## Required library +#' library(gdsfmt) +#' +#' ## Load the known ancestry for the demo 1KG reference profiles +#' data(demoKnownSuperPop1KG) +#' +#' ## The Reference GDS file +#' path1KG <- system.file("extdata/tests", package="RAIDS") +#' +#' ## Open the Reference GDS file +#' gdsRef <- snpgdsOpen(file.path(path1KG, "ex1_good_small_1KG.gds")) +#' +#' ## Path to the demo synthetic results files +#' ## List of the KNN result files from PCA run on synthetic data +#' dataDirRes <- system.file("extdata/demoAncestryCall/ex1", package="RAIDS") +#' listFilesName <- dir(file.path(dataDirRes), ".rds") +#' listFiles <- file.path(file.path(dataDirRes) , listFilesName) +#' syntheticKNN <- lapply(listFiles, FUN=function(x){return(readRDS(x))}) +#' syntheticKNN <- do.call(rbind, syntheticKNN) +#' +#' # The name of the synthetic study +#' studyID <- "MYDATA.Synthetic" +#' +#' ## Path to the demo Profile GDS file is located in this package +#' dataDir <- system.file("extdata/demoAncestryCall", package="RAIDS") +#' +#' ## Open the Profile GDS file +#' gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) +#' \dontrun{ +#' pedSyn <- RAIDS:::prepPedSynthetic1KG(gdsReference=gdsRef, +#' gdsSample=gdsProfile, studyID=studyID, popName="superPop") +#' +#' ## Run the ancestry inference on one profile called 'ex1' +#' ## The values of K and D used for the inference are selected using the +#' ## synthetic results listFiles=listFiles, +#' resCall <- RAIDS:::computeAncestryFromSynthetic(gdsReference=gdsRef, +#' gdsProfile=gdsProfile, +#' syntheticKNN = syntheticKNN, +#' pedSyn = pedSyn, +#' currentProfile=c("ex1"), +#' spRef=demoKnownSuperPop1KG, +#' studyIDSyn=studyID, np=1L) +#' +#' ## The ancestry called with the optimal D and K values +#' resCall$Ancestry +#' } +#' ## Close the GDS files (important) +#' closefn.gds(gdsProfile) +#' closefn.gds(gdsRef) +#' +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' @importFrom rlang arg_match +#' @encoding UTF-8 +#' @export +computeAncestryFromSynthetic <- function(gdsReference, gdsProfile, + syntheticKNN, + pedSyn, + currentProfile, + spRef, + studyIDSyn, + np=1L, + listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), + fieldPopIn1KG="superPop", + fieldPopInfAnc="SuperPop", + kList=seq(2, 15, 1), + pcaList=seq(2, 15, 1), + algorithm=c("exact", "randomized"), + eigenCount=32L, + missingRate=NaN, verbose=FALSE) { + + if(is.null(pcaList)) { + pcaList <- seq(2, 15, 1) + } + + if (is.null(kList)) { + kList <- seq(2, 15, 1) + } + + ## Validate input parameters + # validateComputeAncestryFromSynthetic(gdsReference=gdsReference, + # gdsProfile=gdsProfile, syntheticKNN=syntheticKNN, + # pedSyn=pedSyn, + # currentProfile=currentProfile, spRef=spRef, studyIDSyn=studyIDSyn, + # np=np, listCatPop=listCatPop, fieldPopIn1KG=fieldPopIn1KG, + # fieldPopInfAnc=fieldPopInfAnc, kList=kList, pcaList=pcaList, + # algorithm=algorithm, eigenCount=eigenCount, missingRate=missingRate, + # verbose=verbose) + + ## Matches a character method against a table of candidate values + algorithm <- arg_match(algorithm) + + + + ## Compile all the inferred ancestry results for different values of + ## D and K to select the optimal parameters + listParaSample <- selParaPCAUpQuartile(matKNN=syntheticKNN, + pedCall=pedSyn, refCall=fieldPopIn1KG, predCall=fieldPopInfAnc, + listCall=listCatPop) + + ## Project profile on the PCA created with the reference profiles + listPCAProfile <- computePCARefSample(gdsProfile=gdsProfile, + currentProfile=currentProfile, studyIDRef="Ref.1KG", np=np, + algorithm=algorithm, eigenCount=eigenCount, missingRate=missingRate, + verbose=verbose) + + ## Run a k-nearest neighbors analysis on one specific profile + listKNNSample <- computeKNNRefSample(listEigenvector=listPCAProfile, + listCatPop=listCatPop, spRef=spRef, fieldPopInfAnc=fieldPopInfAnc, + kList=kList, pcaList=pcaList) + + ## The ancestry call for the current profile + resCall <- listKNNSample$matKNN[ + which(listKNNSample$matKNN$D == listParaSample$D & + listKNNSample$matKNN$K == listParaSample$K ),] + colnames(listParaSample$dfAUROC) <- c("D", "K", "Call", "L", "AUROC", "H") + res <- list(pcaSample=listPCAProfile, # PCA of the profile + 1KG + paraSample=listParaSample, # Result of the parameter selection + KNNSample=listKNNSample, # KNN for the profile + Ancestry=resCall) # the ancestry call fo the profile + + return(res) +} + + +#' @title Run most steps leading to the ancestry inference call +#' on a specific profile (RNA or DNA) +#' +#' @description This function runs most steps leading to the ancestry inference +#' call on a specific profile. First, the function creates the Profile GDS file +#' for the specific profile using the information from a RDS Sample +#' description file and the Population reference GDS file. +#' +#' @param pedStudy a \code{data.frame} with those mandatory columns: "Name.ID", +#' "Case.ID", "Sample.Type", "Diagnosis", "Source". All columns must be in +#' \code{character} strings (no factor). The \code{data.frame} +#' must contain the information for all the samples passed in the +#' \code{listSamples} parameter. Only \code{filePedRDS} or \code{pedStudy} +#' can be defined. +#' +#' @param studyDF a \code{data.frame} containing the information about the +#' study associated to the analysed sample(s). The \code{data.frame} must have +#' those 3 columns: "study.id", "study.desc", "study.platform". All columns +#' must be in \code{character} strings (no factor). +#' +#' @param pathProfileGDS a \code{character} string representing the path to +#' the directory where the GDS Profile files will be created. +#' Default: \code{NULL}. +#' +#' @param pathGeno a \code{character} string representing the path to the +#' directory containing the VCF output of SNP-pileup for each sample. The +#' SNP-pileup files must be compressed (gz files) and have the name identifiers +#' of the samples. A sample with "Name.ID" identifier would have an +#' associated file called +#' if genoSource is "VCF", then "Name.ID.vcf.gz", +#' if genoSource is "generic", then "Name.ID.generic.txt.gz" +#' if genoSource is "snp-pileup", then "Name.ID.txt.gz". +#' +#' @param pathOut a \code{character} string representing the path to +#' the directory where the output files are created. +#' +#' @param fileReferenceGDS a \code{character} string representing the file +#' name of the Reference GDS file. The file must exist. +#' +#' @param fileReferenceAnnotGDS a \code{character} string representing the +#' file name of the Reference GDS Annotation file. The file must exist. +#' +#' @param chrInfo a \code{vector} of positive \code{integer} values +#' representing the length of the chromosomes. See 'details' section. +#' +#' @param syntheticRefDF a \code{data.frame} containing a subset of +#' reference profiles for each sub-population present in the Reference GDS +#' file. The \code{data.frame} must have those columns: +#' \describe{ +#' \item{sample.id}{ a \code{character} string representing the sample +#' identifier. } +#' \item{pop.group}{ a \code{character} string representing the +#' subcontinental population assigned to the sample. } +#' \item{superPop}{ a \code{character} string representing the +#' super-population assigned to the sample. } +#' } +#' +#' @param studyType a \code{character} string representing the type of study. +#' The possible choices are: "DNA" and "RNA". The type of study affects the +#' way the estimation of the allelic fraction is done. Default: \code{"DNA"}. +#' +#' @param genoSource a \code{character} string with two possible values: +#' 'snp-pileup', 'generic' or 'VCF'. It specifies if the genotype files +#' are generated by snp-pileup (Facets) or are a generic format CSV file +#' with at least those columns: +#' 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. +#' The 'Count' is the depth at the specified position; +#' 'FileR' is the depth of the reference allele and +#' 'File1A' is the depth of the specific alternative allele. +#' Finally the file can be a VCF file with at least those genotype +#' fields: GT, AD, DP. +#' +#' @param np a single positive \code{integer} specifying the number of +#' threads to be used. Default: \code{1L}. +#' +#' @param blockTypeID a \code{character} string corresponding to the block +#' type used to extract the block identifiers. The block type must be +#' present in the GDS Reference Annotation file. +#' +#' @param verbose a \code{logical} indicating if messages should be printed +#' to show how the different steps in the function. Default: \code{FALSE}. +#' +#' @return The integer \code{0L} when successful. See details section for +#' more information about the generated output files. +#' +#' @details +#' +#' The runWrapperAncestry() function generates 3 types of files +#' in the \code{pathOut} directory. +#' \describe{ +#' \item{Ancestry Inference}{ The ancestry inference CSV file +#' (".Ancestry.csv" file)} +#' \item{Inference Informaton}{ The inference information RDS file +#' (".infoCall.rds" file)} +#' \item{Synthetic Information}{ The parameter information RDS files +#' from the synthetic inference ("KNN.synt.*.rds" files in a sub-directory)} +#' } +#' +#' In addition, a sub-directory (named using the profile ID) is +#' also created. +#' +#' @references +#' +#' Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, +#' Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution +#' of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. +#' doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. +#' +#' @examples +#' +#' ## Required library for GDS +#' library(SNPRelate) +#' +#' ## Path to the demo 1KG GDS file is located in this package +#' dataDir <- system.file("extdata", package="RAIDS") +#' +#' ################################################################# +#' ## Load the information about the profile +#' ################################################################# +#' data(demoPedigreeEx1) +#' head(demoPedigreeEx1) +#' +#' ################################################################# +#' ## The 1KG GDS file and the 1KG SNV Annotation GDS file +#' ## need to be located in the same directory +#' ## Note that the 1KG GDS file used for this example is a +#' ## simplified version and CANNOT be used for any real analysis +#' ################################################################# +#' path1KG <- file.path(dataDir, "tests") +#' +#' fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") +#' fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") +#' +#' ################################################################# +#' ## The Sample SNP pileup files (one per sample) need +#' ## to be located in the same directory. +#' ################################################################# +#' pathGeno <- file.path(dataDir, "example", "snpPileup") +#' +#' ################################################################# +#' ## The path where the Profile GDS Files (one per sample) +#' ## will be created need to be specified. +#' ################################################################# +#' pathProfileGDS <- file.path(tempdir(), "out.tmp") +#' +#' pathOut <- file.path(tempdir(), "res.out") +#' +#' ################################################################# +#' ## A data frame containing general information about the study +#' ## is also required. The data frame must have +#' ## those 3 columns: "studyID", "study.desc", "study.platform" +#' ################################################################# +#' studyDF <- data.frame(study.id="MYDATA", +#' study.desc="Description", +#' study.platform="PLATFORM", +#' stringsAsFactors=FALSE) +#' +#' #################################################################### +#' ## Fix seed to ensure reproducible results +#' #################################################################### +#' set.seed(3043) +#' +#' gds1KG <- snpgdsOpen(fileReferenceGDS) +#' dataRef <- select1KGPop(gds1KG, nbProfiles=2L) +#' closefn.gds(gds1KG) +#' +#' ## Required library for this example to run correctly +#' if (requireNamespace("GenomeInfoDb", quietly=TRUE) && +#' requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { +#' +#' ## Chromosome length information +#' ## chr23 is chrX, chr24 is chrY and chrM is 25 +#' chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] +#' +#' \dontrun{ +#' +#' RAIDS:::runWrapperAncestry(pedStudy=demoPedigreeEx1, studyDF=studyDF, +#' pathProfileGDS=pathProfileGDS, +#' pathGeno=pathGeno, pathOut=pathOut, +#' fileReferenceGDS=fileReferenceGDS, +#' fileReferenceAnnotGDS=fileAnnotGDS, +#' chrInfo=chrInfo, syntheticRefDF=dataRef, +#' studyType="DNA", genoSource="snp-pileup") +#' +#' unlink(pathProfileGDS, recursive=TRUE, force=TRUE) +#' unlink(pathOut, recursive=TRUE, force=TRUE) +#' +#' } +#' } +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' @importFrom utils write.csv +#' @importFrom rlang arg_match +#' @encoding UTF-8 +#' @keywords internal +runWrapperAncestry <- function(pedStudy, studyDF, pathProfileGDS, + pathGeno, pathOut, fileReferenceGDS, fileReferenceAnnotGDS, + chrInfo, syntheticRefDF, + genoSource=c("snp-pileup", "generic", "VCF"), + studyType=c("DNA", "RNA"), np=1L, blockTypeID=NULL, + verbose=FALSE) { + + genoSource <- arg_match(genoSource) + + listProfiles <- pedStudy[, "Name.ID"] + + createStudy2GDS1KG(pathGeno=pathGeno, pedStudy=pedStudy, + fileNameGDS=fileReferenceGDS, listProfiles=listProfiles, + studyDF=studyDF, pathProfileGDS=pathProfileGDS, genoSource=genoSource, + verbose=verbose) + + ## Open the 1KG GDS file (demo version) + gdsReference <- snpgdsOpen(fileReferenceGDS) + ## Open the 1KG GDS file and 1KG SNV Annotation file + gdsRefAnnot <- openfn.gds(fileReferenceAnnotGDS) + + listProfileRef <- syntheticRefDF$sample.id + studyDFSyn <- data.frame(study.id=paste0(studyDF$study.id, ".Synthetic"), + study.desc=paste0(studyDF$study.id, " synthetic data"), + study.platform=studyDF$study.platform, stringsAsFactors=FALSE) + + apply(pedStudy[,"Name.ID", drop=FALSE],1,FUN=function(x,gdsReference, + gdsRefAnnot, studyDF,pathProfileGDS, + pathOut, chrInfo, syntheticRefDF, studyDFSyn, + listProfileRef, studyType, verbose) { + runProfileAncestry(gdsReference, gdsRefAnnot, studyDF, + currentProfile=x, pathProfileGDS, pathOut, chrInfo, + syntheticRefDF, studyDFSyn, listProfileRef, + studyType, np=np, blockTypeID=blockTypeID, verbose=verbose) + return(NULL) + }, gdsReference=gdsReference, gdsRefAnnot=gdsRefAnnot, + studyDF=studyDF, pathProfileGDS=pathProfileGDS, pathOut=pathOut, + chrInfo=chrInfo, syntheticRefDF=syntheticRefDF, + listProfileRef=listProfileRef, + studyDFSyn=studyDFSyn, studyType=studyType, verbose=verbose) + + ## Close all GDS files + closefn.gds(gdsReference) + closefn.gds(gdsRefAnnot) + + ## Successful + return(0L) +} + +#' @title Create the Profile GDS file(s) for one or multiple specific profiles +#' using the information from a RDS Sample description file and the 1KG +#' GDS file +#' +#' @description The function uses the information for the Reference GDS file +#' and the RDS Sample Description file to create the Profile GDS file. One +#' Profile GDS file is created per profile. One Profile GDS file will be +#' created for each entry present in the \code{listProfiles} parameter. +#' +#' @param profileFile a \code{character} string representing the path to the +#' file: with genotype and the allele information of the profile A profile would have an +#' associated file called +#' if genoSource is "VCF", then "*vcf.gz", +#' if genoSource is "generic", then "*.txt.gz" +#' if genoSource is "snp-pileup", then "*.txt.gz". +#' if genoSource is "bam", then "*.bam" and "*.bai". +#' +#' @param profileName a \code{character} string representing the the profile Name.ID +#' +#' @param filePedRDS a \code{character} string representing the path to the +#' RDS file that contains the information about the sample to analyse. +#' The RDS file must +#' include a \code{data.frame} with those mandatory columns: "Name.ID", +#' "Case.ID", "Sample.Type", "Diagnosis", "Source". All columns must be in +#' \code{character} strings. The \code{data.frame} +#' must contain the information for all the samples passed in the +#' \code{listSamples} parameter. Only \code{filePedRDS} or \code{pedStudy} +#' can be defined. +#' +#' @param pedStudy a \code{data.frame} with those mandatory columns: "Name.ID", +#' "Case.ID", "Sample.Type", "Diagnosis", "Source". All columns must be in +#' \code{character} strings (no factor). The \code{data.frame} +#' must contain the information for all the samples passed in the +#' \code{listSamples} parameter. Only \code{filePedRDS} or \code{pedStudy} +#' can be defined. #' -#' @param pathProfileGDS the path of an object of class \code{gds} related to -#' the sample +#' @param fileNameGDS a \code{character} string representing the file name of +#' the Reference GDS file. The file must exist. #' -#' @param listSamples a \code{vector} of string representing the samples for -#' which compute the PCA. +#' @param batch a single positive \code{integer} representing the current +#' identifier for the batch. Beware, this field is not stored anymore. +#' Default: \code{1}. #' -#' @param np a single positive \code{integer} representing the number of -#' threads. Default: \code{1L}. +#' @param studyDF a \code{data.frame} containing the information about the +#' study associated to the analysed sample(s). The \code{data.frame} must have +#' those 3 columns: "study.id", "study.desc", "study.platform". All columns +#' must be in \code{character} strings (no factor). #' -#' @return The integer \code{0L} when successful. +#' @param listProfiles a \code{vector} of \code{character} string corresponding +#' to the profile identifiers that will have a Profile GDS file created. The +#' profile identifiers must be present in the "Name.ID" column of the Profile +#' RDS file passed to the \code{filePedRDS} parameter. +#' If \code{NULL}, all profiles present in the \code{filePedRDS} are selected. +#' Default: \code{NULL}. #' -#' @details +#' @param pathProfileGDS a \code{character} string representing the path to +#' the directory where the Profile GDS files will be created. +#' Default: \code{NULL}. +#' +#' @param genoSource a \code{character} string with two possible values: +#' 'snp-pileup', 'generic' or 'VCF'. It specifies if the genotype files +#' are generated by snp-pileup (Facets) or are a generic format CSV file +#' with at least those columns: +#' 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. +#' The 'Count' is the depth at the specified position; +#' 'FileR' is the depth of the reference allele and +#' 'File1A' is the depth of the specific alternative allele. +#' Finally the file can be a VCF file with at least those genotype +#' fields: GT, AD, DP. +#' +#' @param verbose a \code{logical} indicating if message information should be +#' printed. Default: \code{FALSE}. +#' +#' @return The function returns \code{0L} when successful. +#' +#' @examples +#' +#' ## Path to the demo 1KG GDS file is located in this package +#' dataDir <- system.file("extdata/tests", package="RAIDS") +#' fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") +#' +#' ## The data.frame containing the information about the study +#' ## The 3 mandatory columns: "study.id", "study.desc", "study.platform" +#' ## The entries should be strings, not factors (stringsAsFactors=FALSE) +#' studyDF <- data.frame(study.id = "MYDATA", +#' study.desc = "Description", +#' study.platform = "PLATFORM", +#' stringsAsFactors = FALSE) #' -#' More information about the method used to calculate the patient eigenvectors -#' can be found at the Bioconductor SNPRelate website: -#' https://bioconductor.org/packages/SNPRelate/ +#' ## The data.frame containing the information about the samples +#' ## The entries should be strings, not factors (stringsAsFactors=FALSE) +#' samplePED <- data.frame(Name.ID=c("ex1"), +#' Case.ID=c("Patient_h11"), +#' Diagnosis=rep("Cancer"), +#' Sample.Type=c("Primary Tumor"), +#' Source=c("Databank B"), stringsAsFactors=FALSE, +#' drop=FALSE) +#' rownames(samplePED) <- samplePED$Name.ID +#' +#' ## Create the Profile GDS File for samples in 'listSamples' vector +#' ## (in this case, samples "ex1") +#' ## The Profile GDS file is created in the pathProfileGDS directory +#' result <- RAIDS:::createProfile(profileFile=file.path(dataDir, "ex1.txt.gz"), +#' profileName="ex1", +#' pedStudy=samplePED, fileNameGDS=fileGDS, +#' studyDF=studyDF, listProfiles=c("ex1"), +#' pathProfileGDS=tempdir(), +#' genoSource="snp-pileup", +#' verbose=FALSE) +#' +#' ## The function returns OL when successful +#' result +#' +#' ## The Profile GDS file 'ex1.gds' has been created in the +#' ## specified directory +#' list.files(tempdir()) +#' +#' ## Remove Profile GDS file (created for demo purpose) +#' unlink(file.path(tempdir(), "ex1.gds"), force=TRUE) #' #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @importFrom SNPRelate snpgdsPCASampLoading +#' @importFrom gdsfmt createfn.gds put.attr.gdsn closefn.gds read.gdsn #' @importFrom S4Vectors isSingleNumber +#' @importFrom rlang arg_match #' @encoding UTF-8 #' @keywords internal -computePCAForSamples <- function(gds, pathProfileGDS, listSamples, np=1L) { - - ## Validate that np is a single positive integer - if(! (isSingleNumber(np) && np > 0)) { - stop("The \'np\' parameter must be a single positive integer.") - } - for(i in seq_len(length(listSamples)) ){ - - gdsSample <- openfn.gds(file.path(pathProfileGDS, - paste0(listSamples[i], ".gds"))) - study.annot <- read.gdsn(index.gdsn(gdsSample, "study.annot")) - - if(length(which(study.annot$study.id == "Ref.1KG")) == 0) { - stop("The study Ref.1KG is not define you must run the ", - "function addStudy1Kg \n") +createProfile <- function(profileFile, profileName, + filePedRDS=NULL, pedStudy=NULL, fileNameGDS, + batch=1, studyDF, listProfiles=NULL, + pathProfileGDS=NULL, + genoSource=c("snp-pileup", "generic", "VCF", "bam"), + paramProfile=list(ScanBamParam=NULL, + PileupParam=NULL, + yieldSize=10000000), + verbose=FALSE) { + + ## When filePedRDS is defined and pedStudy is null + if (!(is.null(filePedRDS)) && is.null(pedStudy)) { + ## The filePedRDS must be a character string and the file must exists + if (!(is.character(filePedRDS) && (file.exists(filePedRDS)))) { + stop("The \'filePedRDS\' must be a character string representing", + " the RDS Sample information file. The file must exist.") } - - sample.Unrel.All <- study.annot$data.id[study.annot$study.id == - "Ref.1KG"] - - listPCA <- computePrunedPCARef(gdsSample, sample.Unrel.All, np) - - listPCA[["samp.load"]] <- projectSample2PCA(gdsSample, listPCA, - listSamples[i], np) - closefn.gds(gdsSample) - - saveRDS(listPCA, file.path(pathProfileGDS, paste0(listSamples[i], - ".pca.pruned.rds"))) - - } - + ## Open the RDS Sample information file + pedStudy <- readRDS(file=filePedRDS) + } else if (!(is.null(filePedRDS) || is.null(pedStudy))) { + stop("Both \'filePedRDS\' and \'pedStudy\' parameters cannot be ", + "defined at the same time.") + } else if (is.null(filePedRDS) && is.null(pedStudy)) { + stop("One of the parameter \'fineNamePED\' of \'pedStudy\' must ", + "be defined.") + } + + ## Validate input parameters + validatecreateProfile( pedStudy=pedStudy, + fileNameGDS=fileNameGDS, batch=batch, studyDF=studyDF, + listProfiles=listProfiles, pathProfileGDS=pathProfileGDS, + genoSource=genoSource, verbose=verbose) + + genoSource <- arg_match(genoSource) + + ## Read the Reference GDS file + gdsReference <- snpgdsOpen(filename=fileNameGDS) + + ## Extract the chromosome and position information for all SNPs in 1KG GDS + + listPos <- NULL + if(genoSource == "bam"){ + alDf <- read.gdsn(index.gdsn(gdsReference, "snp.allele")) + alDf <- matrix(unlist(strsplit(alDf,"\\/")),nrow=2) + listPos <- data.frame(chr = read.gdsn(index.gdsn(gdsReference, "snp.chromosome")), + start = read.gdsn(index.gdsn(gdsReference, "snp.position")), + REF = alDf[1,], + ALT = alDf[2,], + stringsAsFactors = FALSE + ) + # listChr <- unique(listPos$chr) + # We can optimize + # listPos <- lapply(listChr, + # FUN=function(x, varDf){ + # return(varDf[which(varDf$chr == x),]) + # }, + # varDf=listPos) + # names(listPos) <- paste0("chr", listChr) + rm(alDf) + } else{ + listPos <- data.frame(snp.chromosome=read.gdsn(index.gdsn(node=gdsReference, "snp.chromosome")), + snp.position=read.gdsn(index.gdsn(node=gdsReference, "snp.position"))) + } + ## Create a data.frame containing the information + + # Need to reformat for bam in varDf + + if(verbose) { + message("Start ", Sys.time()) + message("Sample info DONE ", Sys.time()) + } + + generateProfileGDS(profileFile=profileFile, profileName=profileName, + listPos=listPos, offset=-1, minCov=10, + minProb=0.999, seqError=0.001, dfPedProfile=pedStudy, batch=batch, + studyDF=studyDF, pathProfileGDS=pathProfileGDS, + genoSource=genoSource, paramProfileGDS=paramProfile, verbose=verbose) + + if(verbose) { + message("Genotype DONE ", Sys.time()) + } + + ## Close 1KG GDS file + closefn.gds(gdsReference) + + ## Return successful code return(0L) } -#' @title Deprecated Function -#' -#' @description Deprecated -#' -#' @param gdsSample an object of class \code{gds} opened related to -#' the sample -#' -#' @param pruned TODO +#' @title Run most steps leading to the ancestry inference call +#' on a specific profile (RNA or DNA) #' -#' @param sample.id TODO +#' @description This function runs most steps leading to the ancestry inference +#' call on a specific profile. First, the function creates the Profile GDS file +#' for the specific profile using the information from a RDS Sample +#' description file and the Population reference GDS file. #' -#' @param sample.ref TODO -#' -#' @param study.annot a \code{data.frame} with one entry from study.annot in -#' the gds -#' -#' @param algorithm a \code{character} string representing the algorithm used -#' to calculate the PCA. The 2 choices are "exact" (traditional exact -#' calculation) and "randomized" (fast PCA with randomized algorithm -#' introduced in Galinsky et al. 2016). Default: \code{"exact"}. +#' @param pedStudy a \code{data.frame} with those mandatory columns: "Name.ID", +#' "Case.ID", "Sample.Type", "Diagnosis", "Source". All columns must be in +#' \code{character} strings (no factor). The \code{data.frame} +#' must contain the information for all the samples passed in the +#' \code{listSamples} parameter. Only \code{filePedRDS} or \code{pedStudy} +#' can be defined. #' -#' @param eigen.cnt a single \code{integer} indicating the number of -#' eigenvectors that will be in the output of the \link[SNPRelate]{snpgdsPCA} -#' function; if 'eigen.cnt' <= 0, then all eigenvectors are returned. -#' Default: \code{32L}. +#' @param studyDF a \code{data.frame} containing the information about the +#' study associated to the analysed sample(s). The \code{data.frame} must have +#' those 3 columns: "study.id", "study.desc", "study.platform". All columns +#' must be in \code{character} strings (no factor). #' -#' @return A \code{list} TODO with the sample.id and eigenvectors. +#' @param pathProfileGDS a \code{character} string representing the path to +#' the directory where the GDS Profile files will be created. +#' Default: \code{NULL}. #' -#' @references +#' @param pathGeno a \code{character} string representing the path to the +#' directory containing the VCF output of SNP-pileup for each sample. The +#' SNP-pileup files must be compressed (gz files) and have the name identifiers +#' of the samples. A sample with "Name.ID" identifier would have an +#' associated file called +#' if genoSource is "VCF", then "Name.ID.vcf.gz", +#' if genoSource is "generic", then "Name.ID.generic.txt.gz" +#' if genoSource is "snp-pileup", then "Name.ID.txt.gz". #' -#' Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, -#' Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution -#' of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. -#' doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. +#' @param fileReferenceGDS a \code{character} string representing the file +#' name of the Reference GDS file. The file must exist. #' -#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @importFrom gdsfmt add.gdsn index.gdsn -#' @importFrom SNPRelate snpgdsPCA snpgdsPCASampLoading snpgdsPCASampLoading -#' @encoding UTF-8 -#' @keywords internal -computePCAsynthetic <- function(gdsSample, pruned, sample.id, - sample.ref, study.annot, - algorithm="exact", eigen.cnt=32L) { - - if(nrow(study.annot) != 1) { - stop("Number of sample in study.annot not equal to 1\n") - } - - sample.pos <- which(sample.id == study.annot$data.id[1]) - sample.Unrel <- sample.ref[which(sample.ref != study.annot$case.id[1])] - - g <- read.gdsn(index.gdsn(gdsSample, "genotype"), - start=c(1, sample.pos), count=c(-1, 1)) - - listPCA <- list() - - listPCA[["pruned"]] <- pruned[which(g != 3)] - rm(g) - - listPCA[["pca.unrel"]] <- snpgdsPCA(gdsSample, - sample.id=sample.Unrel, - snp.id=listPCA[["pruned"]], - num.thread=1, - algorithm=algorithm, - eigen.cnt=eigen.cnt, - verbose=TRUE) - - listPCA[["snp.load"]] <- snpgdsPCASNPLoading(listPCA[["pca.unrel"]], - gdsobj=gdsSample, - num.thread=1, - verbose=TRUE) - - listPCA[["samp.load"]] <- snpgdsPCASampLoading(listPCA[["snp.load"]], - gdsobj=gdsSample, - sample.id=sample.id[sample.pos], - num.thread=1, verbose=TRUE) - - listRes <- list(sample.id=sample.id[sample.pos], - eigenvector.ref=listPCA[["pca.unrel"]]$eigenvect, - eigenvector=listPCA[["samp.load"]]$eigenvect) - return(listRes) -} - - -#' @title Compile all the inferred ancestry results done on the -#' synthetic profiles for different D and K values in the objective of -#' selecting the optimal D and K values for a specific profile +#' @param fileReferenceAnnotGDS a \code{character} string representing the +#' file name of the Reference GDS Annotation file. The file must exist. #' -#' @description The function calculates the accuracy of the inferred ancestry -#' called done on the synthetic profiles for different D and K values. The -#' accuracy is also calculated for each super-population used to generate -#' the synthetic profiles. The known ancestry from the reference profiles -#' used to generate the synthetic profiles is required to calculate the -#' accuracy. +#' @param chrInfo a \code{vector} of positive \code{integer} values +#' representing the length of the chromosomes. See 'details' section. #' -#' @param matKNN a \code{data.frame} containing the inferred ancestry for the -#' synthetic profiles for different _K_ and _D_ values. The \code{data.frame} -#' must contained those columns: "sample.id", "D", "K" and the fourth column -#' name must correspond to the \code{predCall} argument. +#' @param syntheticRefDF a \code{data.frame} containing a subset of +#' reference profiles for each sub-population present in the Reference GDS +#' file. The \code{data.frame} must have those columns: +#' \describe{ +#' \item{sample.id}{ a \code{character} string representing the sample +#' identifier. } +#' \item{pop.group}{ a \code{character} string representing the +#' subcontinental population assigned to the sample. } +#' \item{superPop}{ a \code{character} string representing the +#' super-population assigned to the sample. } +#' } #' -#' @param pedCall a \code{data.frame} containing the information about -#' the super-population information from the 1KG GDS file -#' for profiles used to generate the synthetic profiles. The \code{data.frame} -#' must contained a column named as the \code{refCall} argument. +#' @param studyType a \code{character} string representing the type of study. +#' The possible choices are: "DNA" and "RNA". The type of study affects the +#' way the estimation of the allelic fraction is done. Default: \code{"DNA"}. #' -#' @param refCall a \code{character} string representing the name of the -#' column that contains the known ancestry for the reference profiles in -#' the Reference GDS file. +#' @param genoSource a \code{character} string with two possible values: +#' 'snp-pileup', 'generic' or 'VCF', "bam". It specifies if the genotype files +#' are generated by snp-pileup (Facets) or are a generic format CSV file +#' with at least those columns: +#' 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. +#' The 'Count' is the depth at the specified position; +#' 'FileR' is the depth of the reference allele and +#' 'File1A' is the depth of the specific alternative allele. +#' Finally the file can be a VCF file with at least those genotype +#' fields: GT, AD, DP. #' -#' @param predCall a \code{character} string representing the name of -#' the column that contains the inferred ancestry for the specified -#' profiles. The column must be present in the \code{matKNN} \code{data.frame} -#' argument. +#' @param np a single positive \code{integer} specifying the number of +#' threads to be used. Default: \code{1L}. #' -#' @param listCall a \code{vector} of \code{character} strings representing -#' the list of possible ancestry assignations. +#' @param blockTypeID a \code{character} string corresponding to the block +#' type used to extract the block identifiers. The block type must be +#' present in the GDS Reference Annotation file. #' -#' @param kList a \code{vector} of \code{integer} representing the list of -#' values tested for the _K_ parameter. The _K_ parameter represents the -#' number of neighbors used in the K-nearest neighbor analysis. -#' Default: \code{seq(3,15,1)}. +#' @param paramAncestry a \code{list} parameters ... #' -#' @param pcaList a \code{vector} of \code{integer} representing the list of -#' values tested for the _D_ parameter. The _D_ parameter represents the -#' number of dimensions used in the PCA analysis. -#' Default: \code{seq(2,15,1)}. +#' @param verbose a \code{logical} indicating if messages should be printed +#' to show how the different steps in the function. Default: \code{FALSE}. #' -#' @return a \code{list} containing 5 entries: -#' \itemize{ -#' \item{\code{dfPCA}} { a \code{data.frame} containing statistical results +#' @return a \code{list} containing 4 entries: +#' \describe{ +#' \item{\code{pcaSample}}{ a \code{list} containing the information related +#' to the eigenvectors. The \code{list} contains those 3 entries: +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current profile.} +#' \item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric} containing +#' the eigenvectors for the reference profiles.} +#' \item{\code{eigenvector}}{ a \code{matrix} of \code{numeric} containing the +#' eigenvectors for the current profile projected on the PCA from the +#' reference profiles.} +#' } +#' } +#' \item{\code{paraSample}}{ a \code{list} containing the results with +#' different \code{D} and \code{K} values that lead to optimal parameter +#' selection. The \code{list} contains those entries: +#' \describe{ +#' \item{\code{dfPCA}}{ a \code{data.frame} containing statistical results #' on all combined synthetic results done with a fixed value of \code{D} (the #' number of dimensions). The \code{data.frame} contains those columns: -#' \itemize{ +#' \describe{ #' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the #' number of dimensions).} #' \item{\code{median}}{ a \code{numeric} representing the median of the @@ -2386,11 +3714,11 @@ computePCAsynthetic <- function(gdsSample, pruned, sample.id, #' (the number of neighbors) for a fixed \code{D} value. } #' } #' } -#' \item{\code{dfPop}} { a \code{data.frame} containing statistical results on +#' \item{\code{dfPop}}{ a \code{data.frame} containing statistical results on #' all combined synthetic results done with different values of \code{D} (the #' number of dimensions) and \code{K} (the number of neighbors). #' The \code{data.frame} contains those columns: -#' \itemize{ +#' \describe{ #' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the #' number of dimensions).} #' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the @@ -2406,105 +3734,242 @@ computePCAsynthetic <- function(gdsSample, pruned, sample.id, #' the specified values of \code{D} and \code{K}.} #' } #' } -#' \item{\code{D}} { a \code{numeric} representing the optimal \code{D} value +#' \item{\code{dfAUROC}}{ a \code{data.frame} the summary of the results by +#' super-population. The \code{data.frame} contains +#' those columns: +#' \describe{ +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions).} +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors).} +#' \item{\code{Call}}{ a \code{character} string representing the +#' super-population.} +#' \item{\code{L}}{ a \code{numeric} representing the lower value of the 95% +#' confidence interval for the AUROC obtained for the fixed values of +#' super-population, \code{D} and \code{K}.} +#' \item{\code{AUROC}}{ a \code{numeric} representing the AUROC obtained for the +#' fixed values of super-population, \code{D} and \code{K}.} +#' \item{\code{H}}{ a \code{numeric} representing the higher value of the 95% +#' confidence interval for the AUROC obtained for the fixed values of +#' super-population, \code{D} and \code{K}.} +#' } +#' } +#' \item{\code{D}}{ a \code{numeric} representing the optimal \code{D} value #' (the number of dimensions) for the specific profile.} -#' \item{\code{K}} { a \code{numeric} representing the optimal \code{K} value +#' \item{\code{K}}{ a \code{numeric} representing the optimal \code{K} value #' (the number of neighbors) for the specific profile.} -#' \item{\code{listD}} { a \code{numeric} representing the optimal \code{D} +#' \item{\code{listD}}{ a \code{numeric} representing the optimal \code{D} #' values (the number of dimensions) for the specific profile. More than one #' \code{D} is possible.} #' } +#' } +#' \item{\code{KNNSample}}{ a \code{data.frame} containing the inferred ancestry +#' for different values of \code{K} and \code{D}. The \code{data.frame} +#' contains those columns: +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current profile.} +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions) used to infer the ancestry. } +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors) used to infer the ancestry. } +#' \item{\code{SuperPop}}{ a \code{character} string representing the inferred +#' ancestry for the specified \code{D} and \code{K} values.} +#' } +#' } +#' \item{\code{KNNSynthetic}}{ a \code{data.frame} containing the inferred ancestry +#' for each synthetic data for different values of \code{K} and \code{D}. +#' The \code{data.frame} +#' contains those columns: "sample.id", "D", "K", "infer.superPop", "ref.superPop" +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current synthetic data.} +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions) used to infer the ancestry. } +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors) used to infer the ancestry. } +#' \item{\code{infer.superPop}}{ a \code{character} string representing the inferred +#' ancestry for the specified \code{D} and \code{K} values.} +#' \item{\code{ref.superPop}}{ a \code{character} string representing the known +#' ancestry from the reference} +#' } +#' } +#' \item{\code{Ancestry}}{ a \code{data.frame} containing the inferred +#' ancestry for the current profile. The \code{data.frame} contains those +#' columns: +#' \describe{ +#' \item{\code{sample.id}}{ a \code{character} string representing the unique +#' identifier of the current profile.} +#' \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +#' number of dimensions) used to infer the ancestry.} +#' \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +#' number of neighbors) used to infer the ancestry.} +#' \item{\code{SuperPop}}{ a \code{character} string representing the inferred +#' ancestry.} +#' } +#' } +#' } +#' +#' @references +#' +#' Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, +#' Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution +#' of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. +#' doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. #' #' @examples #' -#' dataDirRes <- system.file("extdata/demoAncestryCall", package="RAIDS") +#' ## Required library for GDS +#' library(SNPRelate) #' -#' ## The inferred ancestry results for the synthetic data using different -#' ## values of D and K -#' matKNN <- readRDS(file.path(dataDirRes, "matKNN.RDS")) +#' ## Path to the demo 1KG GDS file is located in this package +#' dataDir <- system.file("extdata", package="RAIDS") #' -#' ## The known ancestry from the reference profiles used to generate the -#' ## synthetic profiles -#' syntheticInfo <- readRDS(file.path(dataDirRes, "pedSyn.RDS")) +#' ################################################################# +#' ## Load the information about the profile +#' ################################################################# +#' data(demoPedigreeEx1) +#' head(demoPedigreeEx1) +#' +#' ################################################################# +#' ## The 1KG GDS file and the 1KG SNV Annotation GDS file +#' ## need to be located in the same directory +#' ## Note that the 1KG GDS file used for this example is a +#' ## simplified version and CANNOT be used for any real analysis +#' ################################################################# +#' path1KG <- file.path(dataDir, "tests") +#' +#' fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") +#' fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") +#' +#' ################################################################# +#' ## The Sample SNP pileup files (one per sample) need +#' ## to be located in the same directory. +#' ################################################################# +#' pathGeno <- file.path(dataDir, "example", "snpPileup") +#' +#' ################################################################# +#' ## The path where the Profile GDS Files (one per sample) +#' ## will be created need to be specified. +#' ################################################################# +#' pathProfileGDS <- file.path(tempdir(), "out.tmp") +#' +#' +#' ################################################################# +#' ## A data frame containing general information about the study +#' ## is also required. The data frame must have +#' ## those 3 columns: "studyID", "study.desc", "study.platform" +#' ################################################################# +#' studyDF <- data.frame(study.id="MYDATA", +#' study.desc="Description", +#' study.platform="PLATFORM", +#' stringsAsFactors=FALSE) +#' +#' #################################################################### +#' ## Fix seed to ensure reproducible results +#' #################################################################### +#' set.seed(3043) #' -#' ## Compile all the results for ancestry inference done on the -#' ## synthetic profiles for different D and K values -#' ## Select the optimal D and K values -#' results <- RAIDS:::selParaPCAUpQuartile(matKNN=matKNN, -#' pedCall=syntheticInfo, refCall="superPop", predCall="SuperPop", -#' listCall=c("EAS", "EUR", "AFR", "AMR", "SAS"), kList=seq(3,15,1), -#' pcaList=seq(2,15,1)) -#' results$D -#' results$K +#' gds1KG <- snpgdsOpen(fileReferenceGDS) +#' dataRef <- select1KGPop(gds1KG, nbProfiles=2L) +#' closefn.gds(gds1KG) +#' +#' ## Required library for this example to run correctly +#' if (requireNamespace("GenomeInfoDb", quietly=TRUE) && +#' requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { +#' +#' ## Chromosome length information +#' ## chr23 is chrX, chr24 is chrY and chrM is 25 +#' chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] +#' +#' \dontrun{ +#' +#' res <- RAIDS:::wrapperAncestry(pedStudy=demoPedigreeEx1, studyDF=studyDF, +#' pathProfileGDS=pathProfileGDS, +#' pathGeno=pathGeno, +#' fileReferenceGDS=fileReferenceGDS, +#' fileReferenceAnnotGDS=fileAnnotGDS, +#' chrInfo=chrInfo, syntheticRefDF=dataRef, +#' studyType="LD", genoSource="snp-pileup") +#' +#' unlink(pathProfileGDS, recursive=TRUE, force=TRUE) +#' +#' } +#' } #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @importFrom stats mad median quantile +#' @importFrom utils write.csv +#' @importFrom rlang arg_match #' @encoding UTF-8 #' @keywords internal -selParaPCAUpQuartile <- function(matKNN, pedCall, refCall, - predCall, listCall, kList=seq(3,15,1), - pcaList=seq(2,15,1)) { - - if (min(kList) < 3) { - warning("A K smaller than 3 could not give robust results.\n") +wrapperAncestry <- function(pedStudy, studyDF, pathProfileGDS, + profileFile, fileReferenceGDS, fileReferenceAnnotGDS, + chrInfo, syntheticRefDF, + genoSource=c("snp-pileup", "generic", "VCF", "bam"), + studyType=c("LD", "GeneAware"), np=1L, blockTypeID=NULL, + paramAncestry=list(ScanBamParam=NULL, + PileupParam=NULL, + yieldSize=10000000), + verbose=FALSE) { + + if(genoSource == "bam") { + message("Process from bam is a new feature;", + " if you have an issue, please let us know") + } + genoSource <- arg_match(genoSource) + + listProfiles <- pedStudy[, "Name.ID"] + # createProfile <- function(profileFile, profileName, + # filePedRDS=NULL, pedStudy=NULL, fileNameGDS, + # batch=1, studyDF, listProfiles=NULL, + # pathProfileGDS=NULL, + # genoSource=c("snp-pileup", "generic", "VCF", "bam"), + # paramProfile=list(ScanBamParam=NULL, + # PileupParam=NULL, + # yieldSize=5000000), + # verbose=FALSE) + if(is.character(listProfiles)){ + for(profileCur in listProfiles){ + if(file.exists(file.path(pathProfileGDS, paste0(profileCur, ".gds")))){ + stop(paste0("The gds file for ", profileCur, " already exist.")) + } + } + } + if(file.exists(file.path(pathProfileGDS, paste0(pedStudy$Name.ID[1], ".gds")))){ + stop(paste0("The gds file for ", pedStudy$Name.ID[1], " already exist.")) } + createProfile(profileFile=profileFile, profileName=pedStudy$Name.ID[1], + pedStudy=pedStudy, fileNameGDS=fileReferenceGDS, + studyDF=studyDF, pathProfileGDS=pathProfileGDS, + genoSource=genoSource, paramProfile=paramAncestry, + verbose=verbose) + # createStudy2GDS1KG(pathGeno=pathGeno, pedStudy=pedStudy, + # fileNameGDS=fileReferenceGDS, listProfiles=listProfiles, + # studyDF=studyDF, pathProfileGDS=pathProfileGDS, genoSource=genoSource, + # verbose=verbose) - tableSyn <- list() - tableCall <- list() - tableAUROC <- list() - i <- 1 - ## Loop on all PCA dimension values - for (D in pcaList) { - matKNNCurD <- matKNN[which(matKNN$D == D), ] - listTMP <- list() - listTMP.AUROC <- list() - j <- 1 - ## Loop on all k neighbor values - for (K in kList) { - matKNNCur <- matKNNCurD[which(matKNNCurD$K == K), ] - ## Calculate accuracy for fixed D and K values - res <- computeSyntheticConfMat(matKNN=matKNNCur, - matKNNAncestryColumn=predCall, pedCall=pedCall, - pedCallAncestryColumn=refCall, listCall=listCall) - resROC <- computeSyntheticROC(matKNN=matKNNCur, - matKNNAncestryColumn=predCall, pedCall=pedCall, - pedCallAncestryColumn=refCall, listCall=listCall) - df <- data.frame(D=D, K=K, AUROC.min=min(resROC$matAUROC.Call$AUC), - AUROC=resROC$matAUROC.All$ROC.AUC, - Accu.CM=res$matAccuracy$Accu.CM) + ## Open the 1KG GDS file (demo version) + gdsReference <- snpgdsOpen(fileReferenceGDS) + ## Open the 1KG GDS file and 1KG SNV Annotation file + gdsRefAnnot <- openfn.gds(fileReferenceAnnotGDS) - listTMP[[j]] <- df - listTMP.AUROC[[j]] <- resROC$matAUROC.Call - j <- j + 1 - } - df <- do.call(rbind, listTMP) + listProfileRef <- syntheticRefDF$sample.id + studyDFSyn <- data.frame(study.id=paste0(studyDF$study.id, ".Synthetic"), + study.desc=paste0(studyDF$study.id, " synthetic data"), + study.platform=studyDF$study.platform, stringsAsFactors=FALSE) - tableCall[[i]] <- df - tableAUROC[[i]] <- do.call(rbind, listTMP.AUROC) - maxAUROC <- max(df[df$K %in% kList, "AUROC.min"]) - kMax <- df[df$K %in% kList & abs(df$AUROC.min-maxAUROC) < 1e-3, "K"] - kV <- kMax[(length(kMax) + length(kMax)%%2)/2] - dfPCA <- data.frame(D=D, - median=median(df[df$K %in% kList, "AUROC.min"]), - mad=mad(df[df$K %in% kList, "AUROC.min"]), - upQuartile=quantile(df[df$K %in% kList, "AUROC.min"], 0.75), K=kV) - tableSyn[[i]] <- dfPCA - i <- i + 1 - } + res <- profileAncestry(gdsReference, gdsRefAnnot, studyDF, + currentProfile=pedStudy[1,"Name.ID"], pathProfileGDS, chrInfo, + syntheticRefDF, studyDFSyn, listProfileRef, + studyType, np=np, blockTypeID=blockTypeID, verbose=verbose) - dfPCA <- do.call(rbind, tableSyn) - dfCall <- do.call(rbind, tableCall) - dfAUROC <- do.call(rbind, tableAUROC) - selD <- dfPCA$D[which.max(dfPCA$upQuartile)] - selK <- dfPCA$K[which.max(dfPCA$upQuartile)] - tmp <- max(dfPCA$upQuartile) - listD <- dfPCA$D[which(abs(dfPCA$upQuartile - tmp) < 1e-3)] - res <- list(dfPCA=dfPCA, dfPop=dfCall, dfAUROC=dfAUROC, - D=selD, K=selK, listD=listD) + ## Close all GDS files + closefn.gds(gdsReference) + closefn.gds(gdsRefAnnot) + + ## Successful return(res) } - - diff --git a/R/synthetic.R b/R/synthetic.R index 7aa708127..aa9ca4004 100644 --- a/R/synthetic.R +++ b/R/synthetic.R @@ -1,3 +1,79 @@ +#' @title Random selection of a specific number of reference profiles in each +#' subcontinental population present in the 1KG GDS file ( same as select1KGPop +#' but the function doesn't need gds object as parameters but the file name +#' of the referenceGDS ) +#' +#' @description The function randomly selects a fixed number of reference +#' for each subcontinental population present in the 1KG GDS file. When a +#' subcontinental population has less samples than the fixed number, all +#' samples from the subcontinental population are selected. +#' +#' @param fileReferenceGDS a \code{character} string representing the file +#' name of the Reference GDS file. The file must exist. +#' +#' @param nbProfiles a single positive \code{integer} representing the number +#' of samples that will be selected for each subcontinental population present +#' in the 1KG GDS file. If the number of samples in a specific subcontinental +#' population is smaller than the \code{nbProfiles}, the number of samples +#' selected in this +#' subcontinental population will correspond to the size of this population. +#' +#' @return a \code{data.frame} containing those columns: +#' \describe{ +#' \item{sample.id}{ a \code{character} string representing the sample +#' identifier. } +#' \item{pop.group}{ a \code{character} string representing the +#' subcontinental population assigned to the sample. } +#' \item{superPop}{ a \code{character} string representing the +#' super-population assigned to the sample. } +#' } +#' +#' @examples +#' +#' ## Required library +#' library(gdsfmt) +#' +#' ## The number of samples needed by subcontinental population +#' ## The number is small for demonstration purpose +#' nbProfiles <- 5L +#' +#' ## 1KG GDS Demo file +#' ## This file only one superpopulation (for demonstration purpose) +#' dataDir <- system.file("extdata", package="RAIDS") +#' fileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") +#' +#' ## Extract a selected number of random samples +#' ## for each subcontinental population +#' ## In the 1KG GDS Demo file, there is one subcontinental population +#' dataR <- select1KGPopForSynthetic(fileReferenceGDS=fileGDS, nbProfiles=nbProfiles) +#' +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' @importFrom gdsfmt index.gdsn read.gdsn +#' @importFrom S4Vectors isSingleNumber +#' @importFrom SNPRelate snpgdsOpen +#' @encoding UTF-8 +#' @export +select1KGPopForSynthetic <- function(fileReferenceGDS, nbProfiles) { + + ## The fileReferenceGDS must be a character string and the file must exists + if (!(is.character(fileReferenceGDS) && (file.exists(fileReferenceGDS)))) { + stop("The \'fileReferenceGDS\' must be a character string ", + "representing the Reference GDS file. The file must exist.") + } + ## Validate that nbProfiles parameter is a single positive numeric + if(! (isSingleNumber(nbProfiles) && nbProfiles > 0)) { + stop("The \'nbProfiles\' parameter must be a single positive integer.") + } + + gdsReference <- snpgdsOpen(filename=fileReferenceGDS) + df <- select1KGPop(gdsReference, nbProfiles) + closefn.gds(gdsReference) + + return(df) +} + + #' @title Random selection of a specific number of reference profiles in each #' subcontinental population present in the 1KG GDS file #' @@ -17,12 +93,12 @@ #' subcontinental population will correspond to the size of this population. #' #' @return a \code{data.frame} containing those columns: -#' \itemize{ -#' \item{sample.id} { a \code{character} string representing the sample +#' \describe{ +#' \item{sample.id}{ a \code{character} string representing the sample #' identifier. } -#' \item{pop.group} { a \code{character} string representing the +#' \item{pop.group}{ a \code{character} string representing the #' subcontinental population assigned to the sample. } -#' \item{superPop} { a \code{character} string representing the +#' \item{superPop}{ a \code{character} string representing the #' super-population assigned to the sample. } #' } #' @@ -38,8 +114,8 @@ #' ## Open 1KG GDS Demo file #' ## This file only one superpopulation (for demonstration purpose) #' dataDir <- system.file("extdata", package="RAIDS") -#' fileGDS <- file.path(dataDir, "gds1KG.gds") -#' gdsFileOpen <- openfn.gds(fileGDS) +#' fileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") +#' gdsFileOpen <- openfn.gds(fileGDS, readonly=TRUE) #' #' ## Extract a selected number of random samples #' ## for each subcontinental population @@ -77,20 +153,19 @@ select1KGPop <- function(gdsReference, nbProfiles) { "sample.annot"))[listKeep,] sample.id <- read.gdsn(index.gdsn(gdsReference, "sample.id"))[listKeep] listPop <- unique(sample.annot$pop.group) - listSel <- list() + ## For each subcontinental population, randomly select a fixed number of ## samples - for(i in seq_len(length(listPop))) { - listGroup <- which(sample.annot$pop.group == listPop[i]) - tmp <- sample(listGroup, min(nbProfiles, length(listGroup))) - listSel[[i]] <- data.frame(sample.id=sample.id[tmp], - pop.group=sample.annot$pop.group[tmp], - superPop=sample.annot$superPop[tmp], - stringsAsFactors=FALSE) - } + dfAll <- lapply(seq_len(length(listPop)), function(i) { + listGroup <- which(sample.annot$pop.group == listPop[i]) + tmp <- sample(listGroup, min(nbProfiles, length(listGroup))) + return(data.frame(sample.id=sample.id[tmp], + pop.group=sample.annot$pop.group[tmp], + superPop=sample.annot$superPop[tmp], + stringsAsFactors=FALSE)) }) - df <- do.call(rbind, listSel) + df <- do.call(rbind, dfAll) return(df) } @@ -104,12 +179,12 @@ select1KGPop <- function(gdsReference, nbProfiles) { #' population. #' #' @param dataRef a \code{data.frame} containing those columns: -#' \itemize{ -#' \item{sample.id} { a \code{character} string representing the sample +#' \describe{ +#' \item{sample.id}{ a \code{character} string representing the sample #' identifier. } -#' \item{pop.group} { a \code{character} string representing the +#' \item{pop.group}{ a \code{character} string representing the #' subcontinental population assigned to the sample. } -#' \item{superPop} { a \code{character} string representing the +#' \item{superPop}{ a \code{character} string representing the #' super-population assigned to the sample. } #' } #' @@ -182,7 +257,8 @@ splitSelectByPop <- function(dataRef) { #' "Sample.Type" entries are always set to 'Synthetic'. #' #' The synthetic profiles are assigned unique names by combining: -#' [prefix].[data.id.profile].[listSampleRef].[simulation number(1 to nbSim)] +#' \code{prefix}.\code{data.id.profile}.\code{listSampleRef}.\code{simulation +#' number(1 to nbSim)} #' #' @param fileProfileGDS a \code{character} string representing the file name #' of the Profile GDS file containing the information about the reference @@ -224,28 +300,23 @@ splitSelectByPop <- function(dataRef) { #' ## Path to the demo 1KG GDS file is located in this package #' dataDir <- system.file("extdata/tests", package="RAIDS") #' -#' ## Copy the Profile GDS file demo that has been pruned and annotated -#' ## into a test directory (deleted after the example has been run) -#' dataDirAllelicFraction <- file.path(system.file("extdata", package="RAIDS"), -#' "demoAllelicFraction") -#' dir.create(dataDirAllelicFraction, showWarnings=FALSE, -#' recursive=FALSE, mode="0777") -#' -#' ## Profile GDS file -#' fileNameGDS <- file.path(dataDirAllelicFraction, "ex1.gds") +#' ## Temporary Profile GDS file +#' fileNameGDS <- file.path(tempdir(), "ex1.gds") #' +#' ## Copy the Profile GDS file demo that has been pruned and annotated #' file.copy(file.path(dataDir, "ex1_demo_with_pruning_and_1KG_annot.gds"), #' fileNameGDS) #' #' ## Information about the synthetic data set #' syntheticStudyDF <- data.frame(study.id="MYDATA.Synthetic", -#' study.desc="MYDATA synthetic data", study.platform="PLATFORM", -#' stringsAsFactors=FALSE) +#' study.desc="MYDATA synthetic data", study.platform="PLATFORM", +#' stringsAsFactors=FALSE) #' #' ## Add information related to the synthetic profiles into the Profile GDS #' prepSynthetic(fileProfileGDS=fileNameGDS, -#' listSampleRef=c("HG00243", "HG00150"), profileID="ex1", -#' studyDF=syntheticStudyDF, nbSim=1L, prefix="synthetic", verbose=FALSE) +#' listSampleRef=c("HG00243", "HG00150"), profileID="ex1", +#' studyDF=syntheticStudyDF, nbSim=1L, prefix="synthetic", +#' verbose=FALSE) #' #' ## Open Profile GDS file #' profileGDS <- openfn.gds(fileNameGDS) @@ -253,15 +324,16 @@ splitSelectByPop <- function(dataRef) { #' ## The synthetic profiles should be added in the 'study.annot' entry #' tail(read.gdsn(index.gdsn(profileGDS, "study.annot"))) #' -#' ## The synthetic study information should be added to the 'study.list' entry +#' ## The synthetic study information should be added to +#' ## the 'study.list' entry #' tail(read.gdsn(index.gdsn(profileGDS, "study.list"))) #' #' ## Close GDS file (important) #' closefn.gds(profileGDS) #' -#' ## Unlink Profile GDS file (created for demo purpose) -#' unlink(file.path(dataDirAllelicFraction, "ex1.gds")) -#' unlink(dataDirAllelicFraction) +#' ## Remove Profile GDS file (created for demo purpose) +#' unlink(fileNameGDS, force=TRUE) +#' #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt index.gdsn read.gdsn @@ -370,7 +442,7 @@ prepSynthetic <- function(fileProfileGDS, listSampleRef, #' Default: \code{0.01}. #' #' @param minProb a single positive \code{numeric} between 0 and 1 that -#' represents the probability that the genotype is correct. TODO. +#' represents the probability that the genotype is correct. #' Default: \code{0.999}. #' #' @param seqError a single positive \code{numeric} between 0 and 1 @@ -386,42 +458,36 @@ prepSynthetic <- function(fileProfileGDS, listSampleRef, #' ## Path to the demo 1KG GDS file is located in this package #' dataDir <- system.file("extdata/tests", package="RAIDS") #' -#' ## Copy the Profile GDS file demo that has been pruned and annotated -#' ## into a test directory (deleted after the example has been run) -#' dataDirAllelicFraction <- file.path(system.file("extdata", package="RAIDS"), -#' "demoAllelicFraction") -#' dir.create(dataDirAllelicFraction, showWarnings=FALSE, -#' recursive=FALSE, mode="0777") -#' -#' ## Profile GDS file -#' fileNameGDS <- file.path(dataDirAllelicFraction, "ex1.gds") +#' ## Profile GDS file (temporary) +#' fileNameGDS <- file.path(tempdir(), "ex1.gds") #' +#' ## Copy the Profile GDS file demo that has been pruned and annotated #' file.copy(file.path(dataDir, "ex1_demo_with_pruning_and_1KG_annot.gds"), #' fileNameGDS) #' #' ## Information about the synthetic data set #' syntheticStudyDF <- data.frame(study.id="MYDATA.Synthetic", -#' study.desc="MYDATA synthetic data", study.platform="PLATFORM", -#' stringsAsFactors=FALSE) +#' study.desc="MYDATA synthetic data", study.platform="PLATFORM", +#' stringsAsFactors=FALSE) #' #' ## Add information related to the synthetic profiles into the Profile GDS #' prepSynthetic(fileProfileGDS=fileNameGDS, -#' listSampleRef=c("HG00243", "HG00150"), profileID="ex1", -#' studyDF=syntheticStudyDF, nbSim=1L, prefix="synthTest", -#' verbose=FALSE) +#' listSampleRef=c("HG00243", "HG00150"), profileID="ex1", +#' studyDF=syntheticStudyDF, nbSim=1L, prefix="synthTest", +#' verbose=FALSE) #' #' ## The 1KG files #' gds1KG <- snpgdsOpen(file.path(dataDir, -#' "ex1_good_small_1KG_GDS.gds")) +#' "ex1_good_small_1KG.gds")) #' gds1KGAnnot <- openfn.gds(file.path(dataDir, -#' "ex1_good_small_1KG_Annot_GDS.gds")) +#' "ex1_good_small_1KG_Annot.gds")) #' #' ## Generate the synthetic profiles and add them into the Profile GDS #' syntheticGeno(gdsReference=gds1KG, gdsRefAnnot=gds1KGAnnot, -#' fileProfileGDS=fileNameGDS, profileID="ex1", -#' listSampleRef=c("HG00243", "HG00150"), nbSim=1, -#' prefix="synthTest", -#' pRecomb=0.01, minProb=0.999, seqError=0.001) +#' fileProfileGDS=fileNameGDS, profileID="ex1", +#' listSampleRef=c("HG00243", "HG00150"), nbSim=1, +#' prefix="synthTest", +#' pRecomb=0.01, minProb=0.999, seqError=0.001) #' #' ## Open Profile GDS file #' profileGDS <- openfn.gds(fileNameGDS) @@ -433,9 +499,9 @@ prepSynthetic <- function(fileProfileGDS, listSampleRef, #' closefn.gds(gds1KG) #' closefn.gds(gds1KGAnnot) #' -#' ## Unlink Profile GDS file (created for demo purpose) -#' unlink(file.path(dataDirAllelicFraction, "ex1.gds")) -#' unlink(dataDirAllelicFraction) +#' ## Remove Profile GDS file (created for demo purpose) +#' unlink(fileNameGDS, force=TRUE) +#' #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt index.gdsn read.gdsn @@ -602,24 +668,47 @@ syntheticGeno <- function(gdsReference, gdsRefAnnot, fileProfileGDS, profileID, rownames(blockZone) <- listB - # We have to manage multipple simulation which mean - # different number of zone for the different simulation + + ## FOR_LOOP modification to be validated by Pascal + ## Remove commented code and this text after validation + + # We have to manage multiple simulation which means + # different number of zone for the different simulations LAPparent <- matrix(nrow = nbSNV, ncol = nbSim) - for(i in seq_len(nbSim)){ + # for(i in seq_len(nbSim)){ + # # list of zone with the same phase relatively to 1KG + # listZone <- unique(blockZone[,i]) + # + # ## matrix if the lap is the first entry in the phase or + # ## the second for each zone + # lapPos <- matrix(sample(x=c(0,1), size=1 *(length(listZone)), + # replace=TRUE), ncol=1) + # + # rownames(lapPos) <- listZone + # + # LAPparent[, i] <- + # lapPos[as.character(blockZone[as.character(blockDF[, + # curSP]),i]),] + # } + + + # We have to manage multiple simulations which means + # different number of zones for the different simulations + lapValues <- vapply(seq_len(nbSim), function(i) { # list of zone with the same phase relatively to 1KG listZone <- unique(blockZone[,i]) ## matrix if the lap is the first entry in the phase or ## the second for each zone lapPos <- matrix(sample(x=c(0,1), size=1 *(length(listZone)), - replace=TRUE), ncol=1) + replace=TRUE), ncol=1) rownames(lapPos) <- listZone - LAPparent[, i] <- - lapPos[as.character(blockZone[as.character(blockDF[, - curSP]),i]),] - } + return(lapPos[as.character(blockZone[as.character(blockDF[, + curSP]),i]),]) + }, double(nbSNV)) + LAPparent[, seq_len(nbSim)] <- lapValues phaseVal <- read.gdsn(index.gdsn(gdsRefAnnot, "phase"), start=c(1,listPosRef.1kg[r]), count=c(-1,1))[list1KG] @@ -702,6 +791,7 @@ syntheticGeno <- function(gdsReference, gdsRefAnnot, fileProfileGDS, profileID, #' the super-population information from the 1KG GDS file #' for profiles used to generate the synthetic profiles. The \code{data.frame} #' must contained a column named as the \code{pedCallAncestryColumn} argument. +#' The row names must correspond to the sample identifiers (mandatory). #' #' @param pedCallAncestryColumn a \code{character} string representing the #' name of the column that contains the known ancestry for the reference @@ -713,7 +803,7 @@ syntheticGeno <- function(gdsReference, gdsRefAnnot, fileProfileGDS, profileID, #' Default: \code{c("EAS", "EUR", "AFR", "AMR", "SAS")}. #' #' @return \code{list} containing 3 entries: -#' \itemize{ +#' \describe{ #' \item{\code{matAUROC.All}}{ a \code{data.frame} containing the AUROC for all #' the ancestry results. } #' \item{\code{matAUROC.Call}}{ a \code{data.frame} containing the AUROC @@ -724,22 +814,24 @@ syntheticGeno <- function(gdsReference, gdsRefAnnot, fileProfileGDS, profileID, #' #' @examples #' -#' dataDirRes <- system.file("extdata/demoAncestryCall", package="RAIDS") +#' ## Loading demo dataset containing pedigree information for synthetic +#' ## profiles and known ancestry of the profiles used to generate the +#' ## synthetic profiles +#' data(pedSynthetic) +#' +#' ## Loading demo dataset containing the inferred ancestry results +#' ## for the synthetic data +#' data(matKNNSynthetic) #' #' ## The inferred ancestry results for the synthetic data using #' ## values of D=6 and K=5 -#' matKNN <- readRDS(file.path(dataDirRes, "matKNN.RDS")) -#' matKNN <- matKNN[matKNN$K == 6 & matKNN$D == 5, ] -#' -#' ## The known ancestry from the reference profiles used to generate the -#' ## synthetic profiles -#' syntheticInfo <- readRDS(file.path(dataDirRes, "pedSyn.RDS")) +#' matKNN <- matKNNSynthetic[matKNNSynthetic$K == 6 & matKNNSynthetic$D == 5, ] #' #' ## Compile statistics from the #' ## synthetic profiles for fixed values of D and K #' results <- RAIDS:::computeSyntheticROC(matKNN=matKNN, #' matKNNAncestryColumn="SuperPop", -#' pedCall=syntheticInfo, pedCallAncestryColumn="superPop", +#' pedCall=pedSynthetic, pedCallAncestryColumn="superPop", #' listCall=c("EAS", "EUR", "AFR", "AMR", "SAS")) #' #' results$matAUROC.All diff --git a/R/synthetic_internal.R b/R/synthetic_internal.R index bfa45543a..d3ec33232 100644 --- a/R/synthetic_internal.R +++ b/R/synthetic_internal.R @@ -1,7 +1,7 @@ #' @title Validate input parameters for syntheticGeno() function #' #' @description This function validates the input parameters for the -#' \code{\link{syntheticGeno}} function. +#' [syntheticGeno()] function. #' #' @param gdsReference an object of class \code{\link[gdsfmt]{gds.class}} #' (a GDS file), the 1KG GDS file. @@ -42,24 +42,25 @@ #' dataDir <- system.file("extdata", package="RAIDS") #' #' ## The 1KG GDS file (opened) -#' gds1KG <- openfn.gds(file.path(dataDir, "gds1KG.gds"), readonly=TRUE) +#' gdsRef <- openfn.gds(file.path(dataDir, +#' "PopulationReferenceDemo.gds"), readonly=TRUE) #' #' ## The 1KG GDS Annotation file (opened) -#' gds1KGAnnot <- openfn.gds(file.path(dataDir, "gdsAnnot1KG.gds"), -#' readonly=TRUE) +#' gdsRefAnnot <- openfn.gds(file.path(dataDir, +#' "PopulationReferenceSNVAnnotationDemo.gds"), readonly=TRUE) #' #' ## The GDS Sample file #' gdsSample <- file.path(dataDir, "GDS_Sample_with_study_demo.gds") #' #' ## The validation should be successful -#' RAIDS:::validateSyntheticGeno(gdsReference=gds1KG, gdsRefAnnot=gds1KGAnnot, +#' RAIDS:::validateSyntheticGeno(gdsReference=gdsRef, gdsRefAnnot=gdsRefAnnot, #' fileProfileGDS=gdsSample, profileID="A101TCGA", #' listSampleRef="A101TCGA", nbSim=1L, prefix="TCGA", pRecomb=0.02, #' minProb=0.999, seqError=0.002) #' #' ## All GDS file must be closed -#' closefn.gds(gdsfile=gds1KG) -#' closefn.gds(gdsfile=gds1KGAnnot) +#' closefn.gds(gdsfile=gdsRef) +#' closefn.gds(gdsfile=gdsRefAnnot) #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom S4Vectors isSingleNumber @@ -133,7 +134,7 @@ validateSyntheticGeno <- function(gdsReference, gdsRefAnnot, fileProfileGDS, #' @title Validate input parameters for prepSynthetic() function #' #' @description This function validates the input parameters for the -#' \code{\link{prepSynthetic}} function. +#' [prepSynthetic()] function. #' #' @param fileProfileGDS a \code{character} string representing the file name #' of the GDS Sample file containing the information about the sample @@ -235,8 +236,7 @@ validatePepSynthetic <- function(fileProfileGDS, #' @title Validate input parameters for computeSyntheticROC() function #' #' @description This function validates the input parameters for the -#' \code{\link{computeSyntheticROC}} function. -#' +#' [computeSyntheticROC()] function. #' #' @param matKNN a \code{data.frame} containing the inferred ancestry results #' for fixed values of _D_ and _K_. On of the column names of the @@ -266,22 +266,23 @@ validatePepSynthetic <- function(fileProfileGDS, #' #' @examples #' -#' ## Directory where demo GDS files are located -#' dataDir <- system.file("extdata/demoAncestryCall", package="RAIDS") +#' ## Loading demo dataset containing pedigree information for synthetic +#' ## profiles and known ancestry of the profiles used to generate the +#' ## synthetic profiles +#' data(pedSynthetic) +#' +#' ## Loading demo dataset containing the inferred ancestry results +#' ## for the synthetic data +#' data(matKNNSynthetic) #' #' ## The inferred ancestry results for the synthetic data using #' ## values of D=6 and K=5 -#' matKNN <- readRDS(file.path(dataDir, "matKNN.RDS")) -#' matKNN <- matKNN[matKNN$K == 6 & matKNN$D == 5, ] -#' -#' ## The known ancestry from the reference profiles used to generate the -#' ## synthetic profiles -#' syntheticData <- readRDS(file.path(dataDir, "pedSyn.RDS")) +#' matKNN <- matKNNSynthetic[matKNNSynthetic$K == 6 & matKNNSynthetic$D == 5, ] #' #' ## The validation should be successful #' RAIDS:::validateComputeSyntheticRoc(matKNN=matKNN, #' matKNNAncestryColumn="SuperPop", -#' pedCall=syntheticData, pedCallAncestryColumn="superPop", +#' pedCall=pedSynthetic, pedCallAncestryColumn="superPop", #' listCall=c("EAS", "EUR", "AFR", "AMR", "SAS")) #' #' @@ -322,7 +323,7 @@ validateComputeSyntheticRoc <- function(matKNN, matKNNAncestryColumn, pedCall, ## The pedCallAncestryColumn must be a column in the pedCall data frame if (!(pedCallAncestryColumn %in% colnames(pedCall))) { stop("The \'pedCallAncestryColumn\' must be a column in the ", - "\'pedCall\' data frame.") + "\'pedCall\' data frame.") } ## The listCall must be character string @@ -394,7 +395,7 @@ validateComputeSyntheticRoc <- function(matKNN, matKNNAncestryColumn, pedCall, #' #' ## The open 1KG GDS file is required (this is a demo file) #' dataDir <- system.file("extdata", package="RAIDS") -#' gds_1KG_file <- file.path(dataDir, "1KG_Demo.gds") +#' gds_1KG_file <- file.path(dataDir, "PopulationReferenceDemo.gds") #' gds1KG <- openfn.gds(gds_1KG_file) #' #' fileSampleGDS <- file.path(dataDir, "GDS_Sample_with_study_demo.gds") @@ -475,30 +476,32 @@ prepPedSynthetic1KG <- function(gdsReference, gdsSample, studyID, popName) { #' the list of possible ancestry assignations. #' #' @return \code{list} containing 2 entries: -#' \itemize{ -#' \item{confMat} { a \code{matrix} representing the confusion matrix } -#' \item{matAccuracy} { a \code{data.frame} containing the statistics +#' \describe{ +#' \item{confMat}{ a \code{matrix} representing the confusion matrix } +#' \item{matAccuracy}{ a \code{data.frame} containing the statistics #' associated to the confusion matrix} #' } #' #' @examples #' -#' dataDirRes <- system.file("extdata/demoAncestryCall", package="RAIDS") +#' ## Loading demo dataset containing pedigree information for synthetic +#' ## profiles and known ancestry of the profiles used to generate the +#' ## synthetic profiles +#' data(pedSynthetic) +#' +#' ## Loading demo dataset containing the inferred ancestry results +#' ## for the synthetic data +#' data(matKNNSynthetic) #' #' ## The inferred ancestry results for the synthetic data using #' ## values of D=6 and K=5 -#' matKNN <- readRDS(file.path(dataDirRes, "matKNN.RDS")) -#' matKNN <- matKNN[matKNN$K == 6 & matKNN$D == 5, ] -#' -#' ## The known ancestry from the reference profiles used to generate the -#' ## synthetic profiles -#' syntheticInfo <- readRDS(file.path(dataDirRes, "pedSyn.RDS")) +#' matKNN <- matKNNSynthetic[matKNNSynthetic$K == 6 & matKNNSynthetic$D == 5, ] #' #' ## Compile the confusion matrix using the #' ## synthetic profiles for fixed values of D and K values #' results <- RAIDS:::computeSyntheticConfMat(matKNN=matKNN, #' matKNNAncestryColumn="SuperPop", -#' pedCall=syntheticInfo, pedCallAncestryColumn="superPop", +#' pedCall=pedSynthetic, pedCallAncestryColumn="superPop", #' listCall=c("EAS", "EUR", "AFR", "AMR", "SAS")) #' #' results$confMat diff --git a/R/tools.R b/R/tools.R index 009b23641..2446b44dd 100644 --- a/R/tools.R +++ b/R/tools.R @@ -9,8 +9,9 @@ #' @param gdsReference an object of class \code{\link[gdsfmt]{gds.class}} #' (a GDS file), the 1KG GDS file. #' -#' @param fileOUT a \code{character} string representing the path and file +#' @param fileOut a \code{character} string representing the path and file #' name of the VCF file that will be created wit the retained SNP information. +#' The file should have the ".vcf" extension. #' #' @param offset a single \code{integer} that is added to the SNP position to #' switch from 0-based to 1-based coordinate when needed (or reverse). @@ -19,22 +20,27 @@ #' @param freqCutoff a single positive \code{numeric} specifying the cut-off to #' keep a SNP. If \code{NULL}, all SNPs are retained. Default: \code{NULL}. #' -#' @return The integer \code{0} when successful. +#' @return The integer \code{0L} when successful. #' #' @examples #' +#' ## Required library +#' library(gdsfmt) +#' #' ## Path to the demo pedigree file is located in this package #' dataDir <- system.file("extdata", package="RAIDS") #' -#' ## Demo 1KG GDS file -#' fileGDS <- openfn.gds(file.path(dataDir, "1KG_Demo.gds")) +#' ## Demo 1KG Reference GDS file +#' fileGDS <- openfn.gds(file.path(dataDir, +#' "PopulationReferenceDemo.gds")) #' -#' ## Output VCF file that will be created -#' vcfFile <- file.path(dataDir, "Demo_TMP_01.vcf") +#' ## Output VCF file that will be created (temporary) +#' vcfFile <- file.path(tempdir(), "Demo_TMP_01.vcf") #' #' ## Create a VCF file with the SNV dataset present in the GDS file #' ## No cutoff on frequency, so all SNVs are saved -#' snvListVCF(gdsReference=fileGDS, fileOUT=vcfFile, offset=0L, freqCutoff=NULL) +#' snvListVCF(gdsReference=fileGDS, fileOut=vcfFile, offset=0L, +#' freqCutoff=NULL) #' #' ## Close GDS file (IMPORTANT) #' closefn.gds(fileGDS) @@ -46,9 +52,10 @@ #' @importFrom gdsfmt read.gdsn #' @importFrom methods is #' @importFrom S4Vectors isSingleNumber +#' @importFrom utils write.table #' @encoding UTF-8 #' @export -snvListVCF <- function(gdsReference, fileOUT, offset=0L, freqCutoff=NULL) { +snvListVCF <- function(gdsReference, fileOut, offset=0L, freqCutoff=NULL) { ## Validate that gdsReference is an object of class gds.class if (!inherits(gdsReference, "gds.class")) { @@ -65,37 +72,38 @@ snvListVCF <- function(gdsReference, fileOUT, offset=0L, freqCutoff=NULL) { stop("The \'freqCutoff\' must be a single numeric or NULL.") } - snp.chromosome <- read.gdsn(index.gdsn(gdsReference, "snp.chromosome")) - snp.position <- read.gdsn(index.gdsn(gdsReference, "snp.position")) - snp.allele <- read.gdsn(index.gdsn(gdsReference, "snp.allele")) + snpChromosome <- read.gdsn(index.gdsn(gdsReference, "snp.chromosome")) + snpPosition <- read.gdsn(index.gdsn(gdsReference, "snp.position")) + snpAllele <- read.gdsn(index.gdsn(gdsReference, "snp.allele")) - allele <- matrix(unlist(strsplit(snp.allele, "\\/")), nrow=2) + allele <- matrix(unlist(strsplit(snpAllele, "\\/")), nrow=2) df <- NULL if(is.null(freqCutoff)){ snp.AF <- read.gdsn(index.gdsn(gdsReference, "snp.AF")) - df <- data.frame(CHROM=snp.chromosome, - POS=as.integer(snp.position + offset), - ID=rep(".", length(snp.chromosome)), + df <- data.frame(CHROM=snpChromosome, + POS=as.integer(snpPosition + offset), + ID=rep(".", length(snpChromosome)), REF=allele[1,], ALT=allele[2,], - QUAL=rep(".", length(snp.chromosome)), - FILTER=rep(".", length(snp.chromosome)), + QUAL=rep(".", length(snpChromosome)), + FILTER=rep(".", length(snpChromosome)), INFO=paste0("AF=", snp.AF), stringsAsFactors=FALSE) } else { - freqDF <- data.frame(snp.AF=read.gdsn(index.gdsn(gdsReference, "snp.AF")), - snp.EAS_AF=read.gdsn(index.gdsn(gdsReference, "snp.EAS_AF")), - snp.EUR_AF=read.gdsn(index.gdsn(gdsReference, "snp.EUR_AF")), - snp.AFR_AF=read.gdsn(index.gdsn(gdsReference, "snp.AFR_AF")), - snp.AMR_AF=read.gdsn(index.gdsn(gdsReference, "snp.AMR_AF")), - snp.SAS_AF=read.gdsn(index.gdsn(gdsReference, "snp.SAS_AF"))) + freqDF <- data.frame( + snp.AF=read.gdsn(index.gdsn(gdsReference, "snp.AF")), + snp.EAS_AF=read.gdsn(index.gdsn(gdsReference, "snp.EAS_AF")), + snp.EUR_AF=read.gdsn(index.gdsn(gdsReference, "snp.EUR_AF")), + snp.AFR_AF=read.gdsn(index.gdsn(gdsReference, "snp.AFR_AF")), + snp.AMR_AF=read.gdsn(index.gdsn(gdsReference, "snp.AMR_AF")), + snp.SAS_AF=read.gdsn(index.gdsn(gdsReference, "snp.SAS_AF"))) listKeep <- which(rowSums(freqDF[,2:6] >= freqCutoff & freqDF[,2:6] <= 1 - freqCutoff) > 0) - df <- data.frame(CHROM=snp.chromosome[listKeep], - POS=as.integer(snp.position[listKeep] + offset), + df <- data.frame(CHROM=snpChromosome[listKeep], + POS=as.integer(snpPosition[listKeep] + offset), ID=rep(".", length(listKeep)), REF=allele[1,listKeep], ALT=allele[2,listKeep], @@ -112,65 +120,21 @@ snvListVCF <- function(gdsReference, fileOUT, offset=0L, freqCutoff=NULL) { ## in the range (0,1)"> #CHROM POS ID REF ALT QUAL FILTER INFO - cat(paste0('##fileformat=VCFv4.3', "\n"), file = fileOUT) + cat(paste0('##fileformat=VCFv4.3', "\n"), file=fileOut) cat(paste0('##FILTER=', - "\n"), file = fileOUT, append=TRUE) + "\n"), file=fileOut, append=TRUE) cat(paste0('##INFO=', - "\n"), file = fileOUT, append=TRUE) - cat('#', file = fileOUT, append=TRUE) + "\n"), file=fileOut, append=TRUE) + cat('#', file=fileOut, append=TRUE) - write.table(df, file=fileOUT, sep="\t", append=TRUE, row.names=FALSE, + write.table(df, file=fileOut, sep="\t", append=TRUE, row.names=FALSE, col.names=TRUE, quote=FALSE) ## Successful return(0L) } -#' @title Merge the pruning files by chromosome in one file -#' -#' @description TODO -#' -#' @param pathPrunedGDS TODO -#' -#' @param filePref TODO -#' -#' @param fileOUT TODO -#' -#' @return TODO a \code{vector} of \code{numeric} -#' -#' @examples -#' -#' ## Path to the demo pedigree file is located in this package -#' dataDir <- system.file("extdata", package="RAIDS") -#' -#' ## TODO -#' -#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @encoding UTF-8 -#' @keywords internal -groupChrPruning <- function(pathPrunedGDS, filePref, fileOUT) { - - prunedList <- list() - - # Read the content of each file (one file per chromosome) - for(i in seq_len(22)) { - fileChr <- file.path(pathPrunedGDS, paste0(filePref, i, ".rds")) - - if(file.exists(fileChr)) { - prunedList[[i]] <- readRDS(fileChr) - } else { - stop("Problem with the file: ", fileChr) - } - } - - ## Merge the content of all files - pruned <- do.call(c, prunedList) - - ## Save all the information into one file - saveRDS(pruned, fileChr <- file.path(pathPrunedGDS, fileOUT)) -} - #' @title Merge the genotyping files per chromosome into one file #' @@ -190,14 +154,44 @@ groupChrPruning <- function(pathPrunedGDS, filePref, fileOUT) { #' the merged genotyping files for each sample will be created. #' The path must exists. #' -#' @return The integer \code{0L} when successful. +#' @return The integer \code{0L} when successful or \code{FALSE} if not. #' #' @examples #' -#' ## Path to the demo pedigree file is located in this package +#' ## Path to the demo vcf files in this package #' dataDir <- system.file("extdata", package="RAIDS") +#' pathGenoTar <- file.path(dataDir, "demoGenoChr", "demoGenoChr.tar") +#' +#' ## Path where the chromosomes files will be located +#' pathGeno <- file.path(tempdir(), "tempGeno") +#' dir.create(pathGeno, showWarnings=FALSE) +#' +#' ## Untar the file that contains the VCF files for 3 samples split by +#' ## chromosome (one directory per chromosome) +#' untar(tarfile=pathGenoTar, exdir=pathGeno) +#' +#' ## Path where the output VCF file will be created is +#' ## the same where the split VCF are (pathGeno) #' -#' ## TODO +#' ## The files must not exist +#' if (!file.exists(file.path(pathGeno, "NA12003.csv.bz2")) && +#' !file.exists(file.path(pathGeno, "NA12004.csv.bz2")) && +#' !file.exists(file.path(pathGeno, "NA12005.csv.bz2"))) { +#' +#' ## Return 0 when successful +#' ## The files "NA12003.csv.bz2", "NA12004.csv.bz2" and +#' ## "NA12005.csv.bz2" should not be present in the current directory +#' groupChr1KGSNV(pathGenoChr=pathGeno, pathOut=pathGeno) +#' +#' ## Validate that files have been created +#' file.exists(file.path(pathGeno, "NA12003.csv.bz2")) +#' file.exists(file.path(pathGeno, "NA12004.csv.bz2")) +#' file.exists(file.path(pathGeno, "NA12005.csv.bz2")) +#' +#' } +#' +#' ## Remove temporary directory +#' unlink(pathGeno, recursive=TRUE, force=TRUE) #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom utils write.csv2 read.csv2 @@ -219,25 +213,25 @@ groupChr1KGSNV <- function(pathGenoChr, pathOut) { listFiles <- dir(file.path(pathGenoChr, "chr1"), ".+\\.chr1\\.vcf\\.bz2") listSamples <- gsub("\\.chr1\\.vcf\\.bz2", "", listFiles) + ## Merge files associated to each samples into one csv file - for(sampleId in listSamples) { - listGeno <- list() + results <- lapply(X=listSamples, FUN=function(sampleId, pathOut) { - ## Read each genotyping file and append the information - for(chr in seq_len(22)) { + ## For each chromosome, read genotyping file and append the information + listGeno <- lapply(seq_len(22), function(chr, sampleId) { geno <- read.csv2(file.path(pathGenoChr, paste0("chr", chr), - paste0(sampleId, ".chr", chr,".vcf.bz2")), - sep="\t", row.names=NULL) - - listGeno[[paste0("chr", chr)]] <- geno - } + paste0(sampleId, ".chr", chr,".vcf.bz2")), + sep="\t", row.names=NULL) + return(geno)}, sampleId=sampleId) genoAll <- do.call(rbind, listGeno) ## Save the genotyping information into one file write.csv2(genoAll, file=bzfile(file.path(pathOut, - paste0(sampleId, ".csv.bz2"))), row.names=FALSE) - } + paste0(sampleId, ".csv.bz2"))), row.names=FALSE) - return(0L) + return(TRUE)}, pathOut=pathOut) + + ## Successful or not + return(ifelse(all(unlist(results)), 0L, FALSE)) } diff --git a/R/tools_internal.R b/R/tools_internal.R index c6f0716c5..09e0b94ff 100644 --- a/R/tools_internal.R +++ b/R/tools_internal.R @@ -1,6 +1,3 @@ - - - #' @title Validate that the input parameter is a GDS object #' #' @description This function validates that the input parameter inherits @@ -20,7 +17,8 @@ #' dataDir <- system.file("extdata", package="RAIDS") #' #' ## The 1KG GDS file (opened) -#' gds1KG <- openfn.gds(file.path(dataDir, "gds1KG.gds"), readonly=TRUE) +#' gds1KG <- openfn.gds(file.path(dataDir, +#' "PopulationReferenceDemo.gds"), readonly=TRUE) #' #' ## The validation should be successful #' RAIDS:::validateGDSClass(gds=gds1KG, name="gds") @@ -177,158 +175,746 @@ validatePositiveIntegerVector <- function(value, name) { return(0L) } - - -#' @title TODO -#' -#' @description TODO +#' @title Read a SNP-pileup file #' -#' @param snp.keep TODO +#' @description The function reads a generic SNP pileup file and +#' returns a data frame +#' containing the information about the read counts for the SNVs present in +#' the file. #' -#' @param PATHBLOCK TODO +#' @param fileName a \code{character} string representing the name, including +#' the path, of a text file containing the SNV read counts as generated by +#' snp-pileup software. The text file must be comma separated. +#' The text file must contain those columns: Chromosome, Position, Ref, Alt, +#' File1R, File1A, File1E and File1D. #' -#' @param superPop TODO +#' @param offset a \code{integer} representing the offset to be added to the +#' position of the SNVs. The value of offset +#' is added to the position present in the file. Default: \code{0L}. #' -#' @param chr TODO +#' @return the a \code{data.frame} containing at least: +#' \describe{ +#' \item{Chromosome}{ a \code{numeric} representing the name of +#' the chromosome} +#' \item{Position}{ a \code{numeric} representing the position on the +#' chromosome} +#' \item{Ref}{ a \code{character} string representing the reference nucleotide} +#' \item{Alt}{ a \code{character} string representing the alternative +#' nucleotide} +#' \item{File1R}{ a \code{numeric} representing the count for +#' the reference nucleotide} +#' \item{File1A}{ a \code{numeric} representing the count for the +#' alternative nucleotide} +#' \item{File1E}{a \code{numeric} representing the count for the +#' errors} +#' \item{File1D}{a \code{numeric} representing the count for the +#' deletions} +#' \item{count}{ a \code{numeric} representing the total count} +#' } #' +#' @examples #' -#' @return the a \code{array} with the sample from pedDF keept +#' ## Directory where demo SNP-pileup file +#' dataDir <- system.file("extdata/example/snpPileup", package="RAIDS") #' -#' @examples +#' ## The SNP-pileup file +#' snpPileupFile <- file.path(dataDir, "ex1.txt.gz") #' -#' # TODO +#' info <- RAIDS:::readSNVPileupFile(fileName=snpPileupFile) +#' head(info) #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt add.gdsn #' @encoding UTF-8 #' @keywords internal -processBlockChr <- function(snp.keep, PATHBLOCK, superPop, chr) { - - blockChr <- read.delim(file.path(PATHBLOCK, - paste0("block.sp.", superPop, ".f0.05.chr", chr, - ".blocks.det")), sep="") - - z <- cbind(c(blockChr$BP1, snp.keep, blockChr$BP2+1), - c(seq_len(nrow(blockChr)), - rep(0, length(snp.keep)), -1*seq_len(nrow(blockChr)))) - - z <- z[order(z[,1]),] - block.snp <- cumsum(z[,2])[z[,2] == 0] +readSNVPileupFile <- function(fileName, offset = 0L) { - curStart <- 0 - activeBlock <- 0 - blockState <- 0 - block.inter <- rep(0, length(which(block.snp == 0))) - k <- 1 - for(i in seq_len(length(block.snp))){ - if(block.snp[i] == 0){ - if(activeBlock == 1){ - if(snp.keep[i] - curStart >= 10000) { - blockState <- blockState - 1 + matSample <- read.csv(fileName) - curStart <- snp.keep[i] - } - } else{ - blockState <- blockState - 1 - curStart <- snp.keep[i] - curStart <- snp.keep[i] - activeBlock <- 1 - } - block.inter[k] <- blockState - k <- k + 1 - }else{ - activeBlock <- 0 - } + # Check if the mandatory column are there + if (!(all(c("Chromosome", "Position", "Ref", "Alt", "File1R", + "File1A", "File1E", "File1D") %in% colnames(matSample)))) { + stop("The SNP-pileup file must contain all those columns: ", + "\'Chromosome\', \'Position\', \'Ref\', \'Alt\', \'File1R\', ", + "\'File1A\', \'File1E\', \'File1D\'.") } - block.snp[block.snp == 0] <- block.inter - return(block.snp) + matSample[, "Chromosome"] <- as.integer(gsub("chr", "", + matSample[, "Chromosome"])) + matSample[, "Position"] <- matSample[, "Position"] + offset + matSample[, "count"] <- rowSums(matSample[, c("File1R", "File1A", + "File1E", "File1D")]) + + return(matSample) } -#' @title TODO +#' @title Read a generic SNP pileup file #' -#' @description TODO +#' @description The function reads a generic SNP pileup file and +#' returns a data frame +#' containing the information about the read counts for the SNVs present in +#' the file. #' -#' @param fileName Output from snp-pileup -#' must csv with the columns: -#' Chromosome,Position,Ref,Alt,File1R,File1A,File1E,File1D +#' @param fileName a \code{character} string representing the name, including +#' the path, of a text file containing the SNV read counts. The text file must +#' be comma separated. The text file must +#' contain those columns: Chromosome, Position, Ref, Alt, Count, +#' File1R and File1A. #' -#' @param offset TODO +#' @param offset a \code{integer} representing the offset to be added to the +#' position of the SNVs. The value of offset +#' is added to the position present in the file. Default: \code{0L}. #' -#' @return the a \code{data.frame} containing at least: -#' \itemize{ -#' \item{Chromosome} {TODO} -#' \item{Position} {TODO} -#' \item{File1R} {TODO} -#' \item{File1A} {TODO} -#' \item{count} {TODO} +#' @return a \code{data.frame} containing at least: +#' \describe{ +#' \item{Chromosome}{ a \code{numeric} representing the name of +#' the chromosome} +#' \item{Position}{ a \code{numeric} representing the position on the +#' chromosome} +#' \item{Ref}{ a \code{character} string representing the reference nucleotide} +#' \item{Alt}{ a \code{character} string representing the alternative +#' nucleotide} +#' \item{File1R}{ a \code{numeric} representing the count for +#' the reference nucleotide} +#' \item{File1A}{ a \code{numeric} representing the count for the +#' alternative nucleotide} +#' \item{count}{ a \code{numeric} representing the total count} #' } #' #' @examples #' -#' # TODO +#' +#' ## Directory where demo SNP-pileup file +#' dataDir <- system.file("extdata/example/snpPileup", package="RAIDS") +#' +#' ## The SNP-pileup file +#' snpPileupFile <- file.path(dataDir, "ex1.generic.txt.gz") +#' +#' info <- RAIDS:::readSNVFileGeneric(fileName=snpPileupFile) +#' head(info) #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt add.gdsn #' @encoding UTF-8 #' @keywords internal -readSNVPileupFile <- function(fileName, offset = 0L) { +readSNVFileGeneric <- function(fileName, offset = 0L) { matSample <- read.csv(fileName) + # Check if the mandatory column are there + if (!(all(c("Chromosome", "Position", "Ref", "Alt", "File1R", + "File1A", "Count") %in% colnames(matSample)))) { + stop("The generic SNP pileup file must contain all those columns: ", + "\'Chromosome\', \'Position\', \'Ref\', \'Alt\', \'File1R\', ", + "\'File1A\', \'Count\'.") + } matSample[, "Chromosome"] <- as.integer(gsub("chr", "", - matSample[, "Chromosome"])) + matSample[, "Chromosome"])) matSample[, "Position"] <- matSample[, "Position"] + offset - matSample[, "count"] <- rowSums(matSample[, c("File1R", "File1A", - "File1E", "File1D")]) + colnames(matSample)[colnames(matSample) == "Count"] <- "count" return(matSample) } - -#' @title TODO +#' @title Read a VCF file with the genotypes use for the ancestry call #' -#' @description TODO +#' @description The function reads VCF file and returns a data frame +#' containing the information about the read counts for the SNVs present in +#' the file. #' -#' @param fileName File name with the path to a -#' csv with at least the columns: -#' Chromosome,Position,Ref,Alt,Count,File1R,File1A -#' where Count is the deep at the position, -#' FileR is the deep of the reference allele, and -#' File1A is the deep of the specific alternative allele +#' @param fileName a \code{character} string representing the name, including +#' the path, of a VCF file containing the SNV read counts. +#' The VCF must contain those genotype fields: GT, AD, DP. #' -#' @param offset TODO +#' @param profileName a \code{character} with Name.ID for the genotype name. +#' Default: \code{NULL}. +#' +#' @param offset a \code{integer} representing the offset to be added to the +#' position of the SNVs. The value of offset +#' is added to the position present in the file. Default: \code{0L}. #' #' @return a \code{data.frame} containing at least: -#' \itemize{ -#' \item{Chromosome} {TODO} -#' \item{Position} {TODO} -#' \item{Ref} -#' \item{Alt} -#' \item{File1R} {deep of the reference allele} -#' \item{File1A} {deep of the alternative allele} -#' \item{count} {Total deep at the position} +#' \describe{ +#' \item{Chromosome}{ a \code{numeric} representing the name of +#' the chromosome} +#' \item{Position}{ a \code{numeric} representing the position on the +#' chromosome} +#' \item{Ref}{ a \code{character} string representing the reference nucleotide} +#' \item{Alt}{ a \code{character} string representing the alternative +#' nucleotide} +#' \item{File1R}{ a \code{numeric} representing the count for +#' the reference nucleotide} +#' \item{File1A}{ a \code{numeric} representing the count for the +#' alternative nucleotide} +#' \item{count}{ a \code{numeric} representing the total count} #' } #' #' @examples #' -#' # TODO +#' +#' ## Directory where demo SNP-pileup file +#' dataDir <- system.file("extdata/example/snpPileup", package="RAIDS") +#' +#' ## The SNP-pileup file +#' snpPileupFile <- file.path(dataDir, "ex1.vcf.gz") +#' +#' info <- RAIDS:::readSNVVCF(fileName=snpPileupFile) +#' head(info) #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -#' @importFrom gdsfmt add.gdsn +#' @importFrom VariantAnnotation readVcf geno +#' @importFrom MatrixGenerics rowRanges +#' @importFrom GenomicRanges seqnames start width #' @encoding UTF-8 #' @keywords internal -readSNVFileGeneric <- function(fileName, offset = 0L) { +readSNVVCF <- function(fileName, profileName=NULL, offset=0L) { - matSample <- read.csv(fileName) - # Check if the mendatory column are there + vcf <- readVcf(fileName) - matSample[, "Chromosome"] <- as.integer(gsub("chr", "", - matSample[, "Chromosome"])) - matSample[, "Position"] <- matSample[, "Position"] + offset - colnames(matSample)[colnames(matSample) == "Count"] <- "count" + gtCur <- geno(vcf) + genoPos <- 1 + if(! is.null(profileName)){ + listVcfSample <- colnames(gtCur$GT) + genoPos <- which(listVcfSample == profileName) + } + + colChr <- as.character(seqnames(vcf)) + + infoPos <- rowRanges(vcf) + start <- start(infoPos) + widthRef <- width(infoPos$REF) + refCur <- as.character(infoPos$REF) + listKeep <- seq_len(length(refCur)) + listKeep <- listKeep[which(widthRef == 1)] + + countV <- as.integer(gtCur$DP) + countA <- gtCur$AD + + idVCF <- row.names(gtCur$GT) + tmp <- matrix(unlist(strsplit(idVCF, ":")), nrow=2)[2,] + alleleChar <- matrix(unlist(strsplit(tmp, "_")), nrow=2)[2,] + rm(tmp) + + matCur <- lapply(listKeep, FUN=function(x, countA, alleleChar){ + + listAlt <- strsplit(alleleChar[x], "\\/")[[1]] + + keep <- ifelse(listAlt[2] %in% c("A", "C", "G", "T"), TRUE, FALSE) + + res <- data.frame(Alt=as.character(listAlt[2]), + File1R=countA[[x]][1], File1A=countA[[x]][2], + keep=keep) + + return(res) + }, + countA=countA, + alleleChar=alleleChar) + + matCur <- do.call(rbind, matCur) + listTmp <- which(matCur$keep) + listKeep <- listKeep[listTmp] + matSample <- data.frame(Chromosome=as.integer(gsub("chr", "", + colChr))[listKeep], + Position=start[listKeep] + offset, + Ref=refCur[listKeep], Alt=matCur$Alt[listTmp], + File1R=matCur$File1R[listTmp], + File1A=matCur$File1A[listTmp], + count=gtCur$DP[,1], stringsAsFactors=FALSE) return(matSample) } + +#' @title Filtering the read counts for a specific nucleotide +#' +#' @description The function returns the read counts for the specific +#' nucleotide or zero when read counts are not available. +#' +#' @param nucleotide a \code{vector} of a \code{character} strings +#' representing the nucleotides (ex: A, C, G or T). +#' +#' +#' @param count a \code{vector} of \code{numeric} representing the counts for +#' each nucleotide listed in \code{nucleotide} parameter. +#' +#' @param curNucleo a \code{character} strings representing the nucleotide +#' that will be retained (ex: A, C, G or T). +#' +#' @return a \code{numeric} representing the counts for the selected +#' nucleotide. The default value is \code{0}. +#' +#' @examples +#' +#' ## Nucleotides vector +#' nuc <- c("A", "G", "C", "T") +#' +#' ## Count vector +#' cnt <- c(100, 200, 4, 32) +#' +#' ## Return the count for the nucleotide "G" +#' RAIDS:::extractNucleotide(nucleotide=nuc, count=cnt, curNucleo="G") +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' @encoding UTF-8 +#' @keywords internal +extractNucleotide <- function(nucleotide, count, curNucleo) { + + tmp <- which(nucleotide == curNucleo) + res <- 0 + if(length(tmp) == 1) res <- count[tmp] + return(res) + +} + + +#' @title Extract SNV information from pileup file for a selected chromosome +#' +#' @description The function reads pileup file and +#' returns a \code{data.frame} +#' containing the information about the read counts for the SNVs present in +#' the selected chromosome. +#' +#' @param chr a \code{character} string representing the name of the +#' chromosome to keep +#' +#' @param resPileup a \code{data.frame} as generated by the \code{pileup} +#' function from \code{Rsamtools} package +#' +#' @param varDf a \code{list} containing a \code{data.frame} representing +#' the positions to keep for each chromosome. +#' +#' @param verbose a \code{logical} indicating if messages should be printed +#' +#' @return a \code{data.frame} containing at least: +#' \describe{ +#' \item{seqnames}{ a \code{character} representing the name of the chromosome} +#' \item{pos}{ a \code{numeric} representing the position on the chromosome} +#' \item{REF}{ a \code{character} string representing the reference nucleotide} +#' \item{ALT}{ a \code{character} string representing the alternative +#' nucleotide} +#' \item{A}{ a \code{numeric} representing the count for the A nucleotide} +#' \item{C}{ a \code{numeric} representing the count for the C nucleotide} +#' \item{G}{ a \code{numeric} representing the count for the G nucleotide} +#' \item{T}{ a \code{numeric} representing the count for the T nucleotide} +#' \item{count}{ a \code{numeric} representing the total count} +#' } +#' +#' @examples +#' +#' ## Demo pileup result data.frame +#' resDemo <- data.frame(seqnames=rep("chr14", 10), +#' pos=c(19069583, 19069584, 19069586, 19069588, 19069589, 19069590, +#' 19069591, 19069592, 19069609, 19069760), +#' strand=c(rep("+", 5), rep("-", 5)), +#' nucleotide=c("T", "G", "G", "C", "A", "A", "C", "T", "T", "G"), +#' count=c(5, 3, 2, 4, 1, 2, 1, 8, 7, 4)) +#' resDemo$seqnames <- factor(resDemo$seqnames) +#' resDemo$strand <- factor(resDemo$strand) +#' resDemo$nucleotide <- factor(resDemo$nucleotide) +#' +#' ## Position to keep in a data.frame format +#' varInfo <- list("chr14"=data.frame(chr=c("chr14", "chr14"), +#' start=c(19069584, 19069609), REF=c("A", "G"), ALT=c("T", "A"))) +#' +#' ## Extract information from pileup for selected positions +#' RAIDS:::processPileupChrBin(chr="chr14", resPileup=resDemo, varDf=varInfo, +#' verbose=FALSE) +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' @importFrom dplyr group_by summarize %>% +#' @importFrom rlang .data +#' @encoding UTF-8 +#' @keywords internal +processPileupChrBin <- function(chr, resPileup, varDf, verbose) { + + resCur <- NULL + + ## Assign FALSE to verbose by default + if (is.null(verbose)) { + verbose <- FALSE + } + + if (chr %in% names(varDf)) { + + keep <- which(resPileup$seqnames == chr) + + if (length(keep) > 0) { + # restrict the resPileup to the chromosome chr + snpO <- resPileup[keep,] + rm(keep) + + # restrict. varDf to the region overlapping snpO + vcfCur <- varDf[[chr]][varDf[[chr]]$start >= min(snpO$pos) & + varDf[[chr]]$start <= max(snpO$pos),, drop=FALSE] + + if (nrow(vcfCur) > 0) { + + # Get the positions to keep in resPileup (snpO) + tmpTime <- system.time( {z <- cbind( c(vcfCur$start, + snpO$pos, vcfCur$start), + c(rep(-1, nrow(vcfCur)), rep(0, nrow(snpO)), + rep(1, nrow(vcfCur))), + c(seq_len(nrow(vcfCur)), seq_len(nrow(snpO)), + seq_len(nrow(vcfCur)))) + z <- z[order(z[,1]),] + listKeep <- which(cumsum(z[,2]) < 0 & z[,2]==0)} ) + + if (verbose) { + message("processPileupChrBin selected pos user ", + chr, " ", + round(tmpTime[1],3), " system ", round(tmpTime[2],3), + " elapsed ", round(tmpTime[3],3)) } + + # summarize by position with the for base + tmpTime <- system.time(resCur <- as.data.frame(snpO[z[listKeep, 3],] %>% + group_by(.data$pos) %>% + summarize(seqnames=.data$seqnames[1], + A=extractNucleotide(.data$nucleotide,.data$count, "A"), + C=extractNucleotide(.data$nucleotide,.data$count, "C"), + G=extractNucleotide(.data$nucleotide,.data$count, "G"), + T=extractNucleotide(.data$nucleotide,.data$count, "T"), + count=sum(.data$count)))) + + if (verbose) { + message("processPileupChrBin extracted nucleotides user ", + round(tmpTime[1],3), " system ", round(tmpTime[2],3), + " elapsed ", round(tmpTime[3],3)) } + if(length(listKeep) > 0){ + # Add the reference allele and the alternative allele + tmpTime <- system.time({z <- cbind(c(resCur$pos, vcfCur$start, resCur$pos), + c(rep(-1, nrow(resCur)), rep(0, nrow(vcfCur)), rep(1, nrow(resCur))), + c(seq_len(nrow(resCur)), seq_len(nrow(vcfCur)), seq_len(nrow(resCur)))) + z <- z[order(z[,1]),] + listKeep <- which(cumsum(z[,2]) < 0 & z[,2]==0) + resCur$REF <- rep("N", nrow(resCur)) + resCur$ALT <- rep("N", nrow(resCur)) + resCur$REF[z[listKeep-1,3]] <- vcfCur[z[listKeep,3], "REF"] + resCur$ALT[z[listKeep-1,3]] <- vcfCur[z[listKeep,3], "ALT"] + resCur <- resCur[,c("seqnames", "pos", "REF", "ALT", "A", + "C", "G", "T", "count")]} ) + + if(verbose) { + message("processPileupChrBin add ref and alt allele user ", + round(tmpTime[1],3), " system ", round(tmpTime[2],3), + " elapsed ", round(tmpTime[3],3)) } + } else{ + resCur <- NULL + } + } + } + } + if(verbose) {message("readSNVBAM processPileupChrBin before return ", + Sys.time())} + return(resCur) +} + +#' @title Read a VCF file with the genotypes use for the ancestry call +#' +#' @description The function reads VCF file and +#' returns a data frame +#' containing the information about the read counts for the SNVs present in +#' the file. +#' +#' @param fileName a \code{character} string representing the name, including +#' the path, of a BAM file with the index file in the same directory +#' +#' +#' @param paramSNVBAM a \code{list} containing the parameters passed to the +#' BamFile() function. Default: \code{list(ScanBamParam=NULL, PileupParam=NULL, +#' yieldSize=10000000)}. +#' +#' @param varSelected a \code{data.frame} representing the position to keep +#' +#' @param offset a \code{integer} representing the offset to be added to the +#' position of the SNVs. The value of offset +#' is added to the position present in the file. Default: \code{0L}. +#' +#' @param verbose a \code{logical} indicating if messages should be printed +#' to show how the different steps in the function. Default: \code{FALSE}. +#' +#' @return a \code{data.frame} containing at least: +#' \describe{ +#' \item{Chromosome}{ a \code{numeric} representing the name of +#' the chromosome} +#' \item{Position}{ a \code{numeric} representing the position on the +#' chromosome} +#' \item{Ref}{ a \code{character} string representing the reference nucleotide} +#' \item{Alt}{ a \code{character} string representing the alternative +#' nucleotide} +#' \item{File1R}{ a \code{numeric} representing the count for +#' the reference nucleotide} +#' \item{File1A}{ a \code{numeric} representing the count for the +#' alternative nucleotide} +#' \item{count}{ a \code{numeric} representing the total count} +#' } +#' +#' @examples +#' +#' +#' ## Required library for this example to run correctly +#' if (requireNamespace("Rsamtools", quietly=TRUE)) { +#' ## Demo bam +#' fl <- system.file("extdata", "no_which_buffered_pileup.bam", +#' package="Rsamtools", mustWork=TRUE) +#' +#' RAIDS:::readSNVBAM(fl, varSelected=data.frame(chr=c(1,1), +#' start=c(3,5), REF=c("A", "A"), ALT=c("C", "C"))) +#' } +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' @importFrom Rsamtools BamFile ScanBamParam PileupParam pileup +#' @importFrom MatrixGenerics rowRanges +#' @importFrom GenomicRanges seqnames start width +#' @encoding UTF-8 +#' @keywords internal +readSNVBAM <- function(fileName, varSelected, offset=0L, + paramSNVBAM=list(ScanBamParam=NULL, PileupParam=NULL, + yieldSize=10000000), verbose=FALSE) { + # Note the offset is apply to the ref not the sequemce (snp-pileup and vcf) + varSelected$chr <- paste0("chr", varSelected$chr) + varSelected$start <- varSelected$start - offset + listChr <- unique(varSelected$chr) + # listChr <- names(varSelected) + varSelected <- lapply(listChr, + FUN=function(x, varDf){ + return(varDf[which(varDf$chr == x),]) + }, + varDf=varSelected) + + # names(listPos) <- paste0("chr", listChr) + # varSelected <- lapply(seq_len(length(varSelected)), + # FUN=function(x, varSelected){ + # varSelected[[x]]$chr <- paste0("chr", varSelected[[x]]$chr) + # varSelected[[x]]$start <- varSelected[[x]]$start + offset + # return(varSelected[[x]]) + # }, + # varSelected=varSelected) + names(varSelected) <- listChr + #varSelected$chr <- paste0("chr", varSelected$chr) + #varSelected$start <- varSelected$start - offset + + myBf <- BamFile(fileName, yieldSize=paramSNVBAM$yieldSize) + bf <- open(myBf) + # temporary before create a parameters class + if(is.null(paramSNVBAM$ScanBamParam)){ + sbp <- ScanBamParam() # which=vcf_granges + } + if(is.null(paramSNVBAM$ScanBamParam)){ + pup <- PileupParam(max_depth=5000, + min_base_quality=20, + min_mapq=15, + min_nucleotide_depth=1, + min_minor_allele_depth=0, + distinguish_strands=FALSE, + distinguish_nucleotides=TRUE, + ignore_query_Ns=TRUE, + include_deletions=FALSE, + include_insertions=FALSE) + } + + i<-1 + res <- list() + vcfChr <- list() + k<-0 + + repeat { + if(verbose) {message("readSNVBAM pileup user ", + k, " i ", i)} + tmpTime <- system.time(resPileup <- pileup(bf, + pileupParam = pup, + ScanBamParam=sbp)) + if(verbose) {message("readSNVBAM pileup user ", + round(tmpTime[1],3), + " system ", round(tmpTime[2],3), + " elapsed ", round(tmpTime[3],3))} + + listChr <- unique(as.character(resPileup$seqnames)) + + + #print(paste0("nb Chr ", length(listChr))) + res[[i]] <- NULL + if(length(listChr) > 0 ) { + tmpChr <- grep("chr", listChr) + if(length(tmpChr) != length(listChr)){ + listChg <- listChr[-1* tmpChr] + for(i in seq_len(length(listChg))){ + resPileup$seqnames[resPileup$seqnames == listChg[i]] <- + paste0("chr", listChg[i]) + } + listChr <- unique(as.character(resPileup$seqnames)) + } + if(sum(!(listChr %in% names(varSelected))) == length(listChr)) { + # message("End chromosome") + break + } + # print(paste0("Current chr ", listChr)) + if(verbose) { message("readSNVBAM processPileupChrBin start ", + Sys.time()) } + tmp <- lapply(listChr, + FUN=function(x, res, varSelected){ + return(processPileupChrBin(chr=x, res, + varDf=varSelected, verbose=verbose)) + }, res=resPileup, + varSelected=varSelected) + if(verbose) { message("readSNVBAM processPileupChrBin end ", + Sys.time()) } + + if(length(tmp) > 0) { + res[[i]] <- do.call(rbind, tmp) + i <- i + 1 + #message(nrow(res[[i]]), " rows in result data.frame") + } + } + + if(nrow(resPileup) == 0L){ + k <- k + 1 + if(k > 20) { + break + } + }else{ + k <- 0 + # i <- i + 1 + } + if(verbose) { message("readSNVBAM pileup end repeat ", Sys.time()) } + } + if(verbose) {message("readSNVBAM pileup Done ", Sys.time())} + resSNP <- do.call(rbind,res) + close(bf) + + if(verbose) {message("readSNVBAM pileup user ", k, " i ", i)} + + resSNP$File1R <- rep(0, nrow(resSNP)) + resSNP$File1A <- rep(0, nrow(resSNP)) + for(nuc in c("A", "C", "G", "T")){ + tmp <- which(resSNP$REF == nuc) + if(length(tmp) > 0){ + resSNP$File1R[tmp] <- resSNP[tmp, nuc] + } + tmp <- which(resSNP$ALT == nuc) + if(length(tmp) > 0){ + resSNP$File1A[tmp] <- resSNP[tmp, nuc] + } + } + resSNP <- resSNP[, c("seqnames", "pos", "REF", "ALT", "File1R", "File1A", + "count", "A", "C", "G", "T")] + colnames(resSNP) <- c("Chromosome", "Position", "Ref", "Alt", "File1R", + "File1A", "count", "A", "C", "G", "T") + resSNP$Position <- resSNP$Position + offset + + if(verbose) {message("readSNVBAM pileup format Done ", Sys.time())} + return(resSNP) +} + + + +#' @title The function create a vector of integer representing the linkage +#' disequilibrium block for each SNV in the in the same order +#' than the variant in Population reference dataset. +#' +#' @description The function create a vector of integer representing the linkage +#' disequilibrium block for each SNV in the in the same order +#' than the variant in Population reference dataset. +#' +#' @param fileReferenceGDS a \code{character} string representing the file +#' name of the Reference GDS file. The file must exist. +#' +#' @param fileBlock a \code{character} string representing the file +#' name of output file det from the plink block command for a chromosome. +#' +#' @return a \code{list} containing 2 entries: +#' \describe{ +#' \item{\code{chr}}{ a \code{integer} representing a the chromosome from +#' fileBlock. +#' } +#' \item{\code{block.snp}}{ a \code{array} of integer +#' representing the linkage disequilibrium block for +#' each SNV in the in the same order than the variant +#' in Population reference dataset. +#' } +#' } +#' +#' +#' @examples +#' +#' ## Path to the demo pedigree file is located in this package +#' dataDir <- system.file("extdata", package="RAIDS") +#' +#' ## Demo of Reference GDS file containing reference information +#' fileReferenceGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") +#' +#' ## Demo of of output file det from the plink block +#' ## command for chromosome 1 +#' fileLdBlock <- file.path(dataDir, "block.sp.EUR.Ex.chr1.blocks.det") +#' +#' listLdBlock <- RAIDS:::processBlockChr(fileReferenceGDS, fileLdBlock) +#' +#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +#' @importFrom gdsfmt read.gdsn index.gdsn closefn.gds +#' @importFrom SNPRelate snpgdsOpen +#' @encoding UTF-8 +#' @keywords internal +processBlockChr <- function(fileReferenceGDS, fileBlock) { + + if (!(is.character(fileReferenceGDS) && (file.exists(fileReferenceGDS)))) { + stop("The \'fileReferenceGDS\' must be a character string ", + "representing the Reference GDS file. The file must exist.") + } + if (!(is.character(fileBlock) && (file.exists(fileBlock)))) { + stop("The \'fileBlock\' must be a character string representing the", + " file .det from plink block result. The file must exist.") + } + + gdsReference <- snpgdsOpen(filename=fileReferenceGDS) + blockChr <- read.delim(fileBlock, sep="") + + listChr <- unique(blockChr$CHR) + if(length(listChr) != 1){ + stop(paste0("There is not just one CHR in ", fileBlock, "\n")) + } + listChr <- as.integer(gsub("chr", "", listChr)) + listSNVChr <- read.gdsn(index.gdsn(gdsReference, "snp.chromosome")) + listSNVChr <- which(listSNVChr == listChr) + snpKeep <- read.gdsn(index.gdsn(gdsReference, "snp.position"))[listSNVChr] + closefn.gds(gdsReference) + z <- cbind(c(blockChr$BP1, snpKeep, blockChr$BP2+1), + c(seq_len(nrow(blockChr)), + rep(0, length(snpKeep)), -1*seq_len(nrow(blockChr)))) + + z <- z[order(z[,1]),] + blockSnp <- cumsum(z[,2])[z[,2] == 0] + + curStart <- 0 + activeBlock <- 0 + blockState <- 0 + blockInter <- rep(0, length(which(blockSnp == 0))) + k <- 1 + for(i in seq_len(length(blockSnp))){ + if(blockSnp[i] == 0){ + if(activeBlock == 1){ + if(snpKeep[i] - curStart >= 10000) { + blockState <- blockState - 1 + + curStart <- snpKeep[i] + } + } else{ + blockState <- blockState - 1 + curStart <- snpKeep[i] + activeBlock <- 1 + } + if(blockState == 0){ + blockState <- -1 + } + blockInter[k] <- blockState + k <- k + 1 + }else{ + activeBlock <- 0 + } + } + + blockSnp[blockSnp == 0] <- blockInter + res <- list(chr=listChr, + block.snp=blockSnp) + return(res) +} + diff --git a/R/visualization.R b/R/visualization.R new file mode 100644 index 000000000..2c7fea29d --- /dev/null +++ b/R/visualization.R @@ -0,0 +1,206 @@ +#' @title Generate accuracy graph +#' +#' @description This function extracts the required information from an +#' output generated by RAIDS to create a graphic representation of the +#' accuracy for different values of PCA dimensions and K-neighbors through +#' all tested ancestries. +#' +#' @param fileRDS a \code{character} string representing the path and file +#' name of the RDS file containing the ancestry information as generated by +#' RAIDS. +#' +#' @param title a \code{character} string representing the title of the graph. +#' Default: \code{""}. +#' +#' @param selectD a \code{array} of \code{integer} representing the selected +#' PCA dimensions to plot. The length of the \code{array} cannot be more than +#' 5 entries. The dimensions must tested by RAIDS (i.e. be present in the +#' RDS file). Default: \code{c(3,7,11)}. +#' +#' @param selectColor a \code{array} of \code{character} strings representing +#' the selected colors for the associated PCA dimensions to plot. The length +#' of the \code{array} must correspond to the length of the \code{selectD} +#' parameter. In addition, the length of the \code{array} cannot be more than +#' 5 entries. +#' Default: \code{c("#5e688a", "#cd5700", "#CC79A7")}. +#' +#' @return a \code{ggplot} object containing the graphic representation of the +#' accuracy for different values of PCA dimensions and K-neighbors through +#' all tested ancestries. +#' +#' @examples +#' +#' ## Required library +#' library(ggplot2) +#' +#' ## Path to RDS file with ancestry information generated by RAIDS (demo file) +#' dataDir <- system.file("extdata", package="RAIDS") +#' fileRDS <- file.path(dataDir, "TEST_01.infoCall.RDS") +#' +#' ## Create accuracy graph +#' accuracyGraph <- createAccuracyGraph(fileRDS=fileRDS, title="Test 01", +#' selectD=c(3,6,9,12,15), +#' selectColor=c("steelblue", "darkorange", "violet", "pink", "gray80")) +#' +#' accuracyGraph +#' +#' @author Astrid Deschênes and Pascal Belleau +#' @importFrom ggplot2 ggplot geom_ribbon geom_line theme_classic ylim +#' @importFrom ggplot2 ylab facet_grid theme scale_colour_manual ggtitle +#' @importFrom ggplot2 element_text element_rect aes +#' @importFrom rlang .data +#' @encoding UTF-8 +#' @export +createAccuracyGraph <- function(fileRDS, title="", + selectD=c(3,7,11), selectColor=c("#5e688a", "#cd5700", "#CC79A7")) { + + ## Validate parameters + validateCreateAccuracyGraph(fileRDS=fileRDS, title=title, selectD=selectD, + selectColor=selectColor) + + ## Extract required information from RDS file + info <- readRDS(fileRDS) + dfAUROC <- info$paraSample$dfAUROC + + if (!all(selectD %in% unique(dfAUROC$pcaD))) { + stop("Not all values in \'selectD\' are present in the RDS file.") + } + + ## Retained selected dimensions + dfAUROC <- dfAUROC[which(dfAUROC$pcaD %in% selectD), ] + dfAUROC$pcaD <- as.factor(dfAUROC$pcaD) + + colnames(dfAUROC)[colnames(dfAUROC) == "pcaD"] <- "D" + + ## Set y axis minimum value + ymin <- min(c(dfAUROC$L)) - 0.008 + + ## Generate graph + accuracy <- ggplot(dfAUROC, aes(x=.data$K, y=.data$AUC, group=.data$D, + color=.data$D, linetype=.data$D)) + + geom_ribbon(aes(ymin=.data$L, ymax=.data$H, group=.data$D), + linetype="dotted", linewidth=2, alpha=0.1) + + geom_line(linewidth=2) + facet_grid(. ~ Call) + + ylim(c(ymin, 1)) + ggtitle(title) + ylab(label = "AUROC") + + scale_colour_manual(aesthetics = c("colour", "fill"), + breaks=selectD, values=selectColor) + + theme_classic() + + theme(axis.text=element_text(size=20, colour = "black"), + panel.background = element_rect(color="black"), + axis.text.x=element_text(size=20, angle=90, + vjust = 0.5, hjust=1, colour="black"), + plot.title = element_text(size=22, face="bold", + colour="gray20", hjust=0.5), + axis.title=element_text(size=30,face="bold.italic"), + strip.text = element_text(size=20, face="bold"), + strip.background = element_rect(fill="gray90"), + legend.text=element_text(size=19), + legend.title=element_text(size=22, face="bold.italic")) + + ## Successful + return(accuracy) +} + +#' @title Generate accuracy graph +#' +#' @description This function extracts the required information from an +#' output generated by RAIDS to create a graphic representation of the +#' accuracy for different values of PCA dimensions and K-neighbors through +#' all tested ancestries. +#' +#' @param dfAUROC a \code{data.frame} corresponding to res$paraSample$dfAUROC +#' where res is the result of inferAncestry() or inferAncestryGeneAware() +#' functions. +#' +#' @param title a \code{character} string representing the title of the graph. +#' Default: \code{""}. +#' +#' @param selectD a \code{array} of \code{integer} representing the selected +#' PCA dimensions to plot. The length of the \code{array} cannot be more than +#' 5 entries. The dimensions must tested by RAIDS (i.e. be present in the +#' RDS file). Default: \code{c(3,7,11)}. +#' +#' @param selectColor a \code{array} of \code{character} strings representing +#' the selected colors for the associated PCA dimensions to plot. The length +#' of the \code{array} must correspond to the length of the \code{selectD} +#' parameter. In addition, the length of the \code{array} cannot be more than +#' 5 entries. +#' Default: \code{c("#5e688a", "#cd5700", "#CC79A7")}. +#' +#' @return a \code{ggplot} object containing the graphic representation of the +#' accuracy for different values of PCA dimensions and K-neighbors through +#' all tested ancestries. +#' +#' @examples +#' +#' ## Required library +#' library(ggplot2) +#' +#' ## Path to RDS file with ancestry information generated by RAIDS (demo file) +#' dataDir <- system.file("extdata", package="RAIDS") +#' fileRDS <- file.path(dataDir, "TEST_01.infoCall.RDS") +#' info <- readRDS(fileRDS) +#' dfAUROC <- info$paraSample$dfAUROC +#' +#' ## Some of the column names must be updated to fit new standards +#' colnames(dfAUROC) <- c("D", "K", "Call", "L", "AUROC", "H") +#' +#' ## Create accuracy graph +#' accuracyGraph <- createAUROCGraph(dfAUROC=dfAUROC, title="Test 01", +#' selectD=c(3, 6, 9, 12, 15), +#' selectColor=c("steelblue", "darkorange", "violet", "pink", "gray40")) +#' +#' accuracyGraph +#' +#' @author Astrid Deschênes and Pascal Belleau +#' @importFrom ggplot2 ggplot geom_ribbon geom_line theme_classic ylim +#' @importFrom ggplot2 ylab facet_grid theme scale_colour_manual ggtitle +#' @importFrom ggplot2 element_text element_rect aes +#' @importFrom rlang .data +#' @encoding UTF-8 +#' @export +createAUROCGraph <- function(dfAUROC, title="", + selectD=c(3,7,11), selectColor=c("#5e688a", "#cd5700", "#CC79A7")) { + + ## Validate parameters + validatecreateAUROCGraph(dfAUROC=dfAUROC, title=title, selectD=selectD, + selectColor=selectColor) + + if (!all(selectD %in% unique(dfAUROC$D))) { + stop("Not all values in \'selectD\' are present in the \'dfAUROC\' ", + "data frame.") + } + + ## Retained selected dimensions + dfAUROC <- dfAUROC[which(dfAUROC$D %in% selectD), ] + dfAUROC$D <- as.factor(dfAUROC$D) + + ## Set y axis minimum value + ymin <- min(c(dfAUROC$L)) - 0.008 + + ## Generate graph + accuracy <- ggplot(dfAUROC, aes(x=.data$K, y=.data$AUROC, group=.data$D, + color=.data$D, linetype=.data$D)) + + geom_ribbon(aes(ymin=.data$L, ymax=.data$H, group=.data$D), + linetype="dotted", linewidth=2, alpha=0.1) + + geom_line(linewidth=2) + facet_grid(. ~ Call) + + scale_colour_manual(aesthetics = c("colour", "fill"), + breaks=selectD, values=selectColor) + + ylim(c(ymin, 1)) + ggtitle(title) + ylab(label = "AUROC") + + theme_classic() + + theme(axis.text=element_text(size=20, colour="black"), + panel.background = element_rect(color="black"), + axis.text.x=element_text(size=20, angle=90, + vjust = 0.5, hjust=1, colour="black"), + plot.title=element_text(size=22, face="bold", + colour="gray20", hjust=0.5), + axis.title=element_text(size=30, face="bold.italic"), + strip.text=element_text(size=20, face="bold"), + strip.background = element_rect(fill="gray90"), + legend.text=element_text(size=19), + legend.title=element_text(size=22, face="bold.italic")) + + ## Successful + return(accuracy) +} + diff --git a/R/visualization_internal.R b/R/visualization_internal.R new file mode 100644 index 000000000..894960a9a --- /dev/null +++ b/R/visualization_internal.R @@ -0,0 +1,190 @@ +#' @title Validate input parameters for createAccuracyGraph +#' function +#' +#' @description This function validates the parameters for the +#' \code{\link{createAccuracyGraph}} function. +#' +#' @param dfAUROC a \code{data.frame} corresponding to res$paraSample$dfAUROC +#' where res is the result of inferAncestry() or inferAncestryGeneAware() +#' functions. +#' +#' @param title a \code{character} string representing the title of the graph. +#' +#' @param selectD a \code{array} of \code{integer} representing the selected +#' PCA dimensions to plot. The length of the \code{array} cannot be more than +#' 5 entries. The dimensions must tested by RAIDS (i.e. be present in the +#' RDS file). +#' +#' @param selectColor a \code{array} of \code{character} strings representing +#' the selected colors for the associated PCA dimensions to plot. The length +#' of the \code{array} must correspond to the length of the \code{selectD} +#' parameter. In addition, the length of the \code{array} cannot be more than +#' 5 entries. +#' +#' @return The function returns \code{0L} when successful. +#' +#' @examples +#' +#' ## Path to RDS file with ancestry information generated by RAIDS (demo file) +#' dataDir <- system.file("extdata", package="RAIDS") +#' fileRDS <- file.path(dataDir, "TEST_01.infoCall.RDS") +#' info <- readRDS(fileRDS) +#' dfAUROC <- info$paraSample$dfAUROC +#' +#' ## Some of the column names must be updated to fit new standards +#' colnames(dfAUROC) <- c("D", "K", "Call", "L", "AUROC", "H") +#' +#' ## Validate parameters +#' RAIDS:::validatecreateAUROCGraph(dfAUROC=dfAUROC, title="Accuracy Graph", +#' selectD=c(6, 12), selectColor=c("blue","darkblue")) +#' +#' @author Astrid Deschênes and Pascal Belleau +#' @importFrom stringr str_detect +#' @encoding UTF-8 +#' @keywords internal +validatecreateAUROCGraph <- function(dfAUROC, title, selectD, selectColor) { + + ## Validate dfAUROC is a data frame + if(!is.data.frame(dfAUROC)) { + stop("The \'dfAUROC\' parameter must be a data frame.") + } + + ## Validate the file extension + colL <- c("D", "K", "Call", "L", "AUROC", "H") + if (!all(colL %in% colnames(dfAUROC))) { + stop("The \'dfAUROC\' must have all those columns: D, K, Call, ", + "L, AUROC, H.") + } + + ## Validate other parameters + res <- validateAccuracyGraphInternal(title=title, selectD=selectD, + selectColor=selectColor) + + ## Success + return(res) +} + + +#' @title Validate input parameters for createAccuracyGraph +#' function +#' +#' @description This function validates the parameters for the +#' \code{\link{createAccuracyGraph}} function. +#' +#' @param fileRDS a \code{character} string representing the path and file +#' name of the RDS file containing the ancestry information as generated by +#' RAIDS. +#' +#' @param title a \code{character} string representing the title of the graph. +#' +#' @param selectD a \code{array} of \code{integer} representing the selected +#' PCA dimensions to plot. The length of the \code{array} cannot be more than +#' 5 entries. The dimensions must tested by RAIDS (i.e. be present in the +#' RDS file). +#' +#' @param selectColor a \code{array} of \code{character} strings representing +#' the selected colors for the associated PCA dimensions to plot. The length +#' of the \code{array} must correspond to the length of the \code{selectD} +#' parameter. In addition, the length of the \code{array} cannot be more than +#' 5 entries. +#' +#' @return The function returns \code{0L} when successful. +#' +#' @examples +#' +#' ## Path to RDS file with ancestry information generated by RAIDS (demo file) +#' dataDir <- system.file("extdata", package="RAIDS") +#' fileRDS <- file.path(dataDir, "TEST_01.infoCall.RDS") +#' +#' ## Validate parameters +#' RAIDS:::validateCreateAccuracyGraph(fileRDS=fileRDS, title="Accuracy Graph", +#' selectD=c(5, 10), selectColor=c("blue","darkblue")) +#' +#' @author Astrid Deschênes and Pascal Belleau +#' @importFrom stringr str_detect +#' @encoding UTF-8 +#' @keywords internal +validateCreateAccuracyGraph <- function(fileRDS, title, selectD, selectColor) { + + ## Validate fileRDS is character string + if(!is.character(fileRDS)) { + stop("The \'fileRDS\' parameter must be a character string.") + } + + ## Validate fileRDS exist + if(!file.exists(fileRDS)) { + stop("The \'fileRDS\' file does not exist.") + } + + ## Validate the file extension + if (!str_detect(fileRDS, "\\.(RDS|rds)$")) { + stop("The \'fileRDS\' must have a RDS (or rds) extension.") + } + + ## Validate other parameters + res <- validateAccuracyGraphInternal(title=title, selectD=selectD, + selectColor=selectColor) + + ## Success + return(res) +} + + +#' @title Validate input parameters for createAccuracyGraph and +#' createAUROCGraph functions +#' +#' @description This function validates the parameters for the +#' \code{\link{createAccuracyGraph}} and \code{\link{createAUROCGraph}} +#' functions. +#' +#' @param title a \code{character} string representing the title of the graph. +#' +#' @param selectD a \code{array} of \code{integer} representing the selected +#' PCA dimensions to plot. The length of the \code{array} cannot be more than +#' 5 entries. The dimensions must tested by RAIDS (i.e. be present in the +#' RDS file). +#' +#' @param selectColor a \code{array} of \code{character} strings representing +#' the selected colors for the associated PCA dimensions to plot. The length +#' of the \code{array} must correspond to the length of the \code{selectD} +#' parameter. In addition, the length of the \code{array} cannot be more than +#' 5 entries. +#' +#' @return The function returns \code{0L} when successful. +#' +#' @examples +#' +#' ## Validate parameters +#' RAIDS:::validateAccuracyGraphInternal(title="Accuracy Graph", +#' selectD=c(5, 10), selectColor=c("blue","darkblue")) +#' +#' @author Astrid Deschênes and Pascal Belleau +#' @importFrom stringr str_detect +#' @encoding UTF-8 +#' @keywords internal +validateAccuracyGraphInternal <- function(title, selectD, selectColor) { + + ## Validate the title is a string + if (!is.character(title)) { + stop("The \'title\' must be a character string.") + } + + ## Validate the length of the selectD parameter + if (length(selectD) == 0) { + stop("The \'selectD\' parameter cannot be empty.") + } + + ## Validate the length of the selectD parameter + if (length(selectD) > 5) { + stop("The \'selectD\' parameter can contain a maximum of 5 elements.") + } + + ## Validate lengths of selectD and selectcolor identical + if (length(selectColor) != length(selectD)) { + stop("The \'selectColor\' parameter must be the same length than ", + "the \'selectD\' parameter.") + } + + ## Success + return(0L) +} \ No newline at end of file diff --git a/README.md b/README.md index b9aa5f80b..9a3029e4f 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) -[![codecov](https://codecov.io/gh/krasnitzlab/RAIDS/branch/main/graph/badge.svg?token=LPFLOMUDVT)](https://codecov.io/gh/krasnitzlab/RAIDS) +[![codecov](https://codecov.io/gh/adeschen/RAIDS/graph/badge.svg?token=JTTO1CH8AZ)](https://codecov.io/gh/adeschen/RAIDS) [![R-CMD-check-bioc](https://github.com/krasnitzlab/RAIDS/actions/workflows/check-bioc.yaml/badge.svg)](https://github.com/krasnitzlab/RAIDS/actions/workflows/check-bioc.yaml) -# Accurate genetic ancestry inference from cancer-derived molecular data with **RAIDS** # +# Robust genetic ancestry inference from challenging molecular data with **RAIDS** # -The Robust Ancestry Inference using Data Synthesis (**_RAIDS_**) package +The Robust Ancestry Inference using Data Synthesis (**RAIDS**) package enables accurate and robust inference of genetic ancestry from various types of molecular data, including whole-genome, whole-exome, targeted gene panels and RNA sequences, as described in our @@ -22,22 +22,44 @@ alterations, such as those caused by cancer. ## Authors ## [Pascal Belleau](http://ca.linkedin.com/in/pascalbelleau "Pascal Belleau"), -[Astrid Deschênes](http://ca.linkedin.com/in/astriddeschenes "Astrid Deschênes"), -David A. Tuveson and +[Astrid Deschênes](https://www.linkedin.com/in/astriddeschenes "Astrid Deschênes"), +[David A. Tuveson](https://tuvesonlab.labsites.cshl.edu/) and [Alexander Krasnitz](https://www.cshl.edu/research/faculty-staff/alexander-krasnitz/ "Alexander Krasnitz") ## Citing ## -If you use the **_RAIDS_** package for a publication, we would ask you to cite +If you use the **RAIDS** package for a publication, we would ask you to cite the following: > Pascal Belleau, Astrid Deschênes, Nyasha Chambwe, David A. Tuveson, Alexander Krasnitz; Genetic Ancestry Inference from Cancer-Derived Molecular Data across Genomic and Transcriptomic Platforms. Cancer Res 1 January 2023; 83 (1): 49–58. https://doi.org/10.1158/0008-5472.CAN-22-0682 +## Bioconductor Package ## + +[![Bioconductor Time](https://bioconductor.org/shields/years-in-bioc/RAIDS.svg)](https://bioconductor.org/packages/RAIDS) + +The **RAIDS** package is now an official package +of [Bioconductor](http://bioconductor.org/). + +The current Bioconductor release can be directly downloaded from their website: +[Current release](https://bioconductor.org/packages/RAIDS) + + ## Installation ## -To install the latest version accessible, the [devtools](https://cran.r-project.org/web/packages/devtools/index.html) +To install this package +from [Bioconductor](https://bioconductor.org/packages/RAIDS), start R +(version "4.3" or later) and enter: + + if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + + BiocManager::install("RAIDS") + + +To install the latest version accessible from Github, +the [devtools](https://cran.r-project.org/web/packages/devtools/index.html) package is required. ## Load required package @@ -56,7 +78,7 @@ package is required. ## License ## -This package and the underlying **_RAIDS_** code are distributed under +This package and the underlying **RAIDS** code are distributed under the Apache-2.0 license. You are free to use and redistribute this software. For more information on Apache-2.0 License see diff --git a/data/demoKnownSuperPop1KG.RData b/data/demoKnownSuperPop1KG.RData new file mode 100644 index 000000000..4facb3ee5 Binary files /dev/null and b/data/demoKnownSuperPop1KG.RData differ diff --git a/data/demoPCA1KG.RData b/data/demoPCA1KG.RData new file mode 100644 index 000000000..e5c37ba3a Binary files /dev/null and b/data/demoPCA1KG.RData differ diff --git a/data/demoPCASyntheticProfiles.RData b/data/demoPCASyntheticProfiles.RData new file mode 100644 index 000000000..62846c3cb Binary files /dev/null and b/data/demoPCASyntheticProfiles.RData differ diff --git a/data/demoPedigreeEx1.RData b/data/demoPedigreeEx1.RData new file mode 100644 index 000000000..3aaf3166f Binary files /dev/null and b/data/demoPedigreeEx1.RData differ diff --git a/data/matKNNSynthetic.RData b/data/matKNNSynthetic.RData new file mode 100644 index 000000000..b4b7771a7 Binary files /dev/null and b/data/matKNNSynthetic.RData differ diff --git a/data/pedSynthetic.RData b/data/pedSynthetic.RData new file mode 100644 index 000000000..e9ef8066e Binary files /dev/null and b/data/pedSynthetic.RData differ diff --git a/data/snpPositionDemo.RData b/data/snpPositionDemo.RData new file mode 100644 index 000000000..76a7e092b Binary files /dev/null and b/data/snpPositionDemo.RData differ diff --git a/doc/README_1KG_GDS.md b/doc/README_1KG_GDS.md deleted file mode 100644 index 26e7deb85..000000000 --- a/doc/README_1KG_GDS.md +++ /dev/null @@ -1,152 +0,0 @@ -# Generate the 1000 genomes GDS file # - -Preparation of the 1000 genomes (1KG) GDS file for RAIDS - -## Authors ## - -[Pascal Belleau](http://ca.linkedin.com/in/pascalbelleau "Pascal Belleau"), -[Astrid Deschênes](http://ca.linkedin.com/in/astriddeschenes "Astrid Deschênes") and -[Alexander Krasnitz](https://www.cshl.edu/research/faculty-staff/alexander-krasnitz/ "Alexander Krasnitz") - -## Download needs ## - -you need to download the vcf files from GRCh38 -The files used are (add ref): - - ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV/ALL.chr*.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz* - - -You need a pedigree file for the description of the samples - - ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130606_sample_info/20130606_g1k.ped - -## Prepare a rds file in R from the pedigree file ## - -In R: - - library(RAIDS) - ## pedigree file from 1KG with the path - ## If the file is in the current directory - filePED1KGOri <- "20130606_g1k.ped" - ped <- prepPed1KG(filePED1KGOri) - ## save the RDS file here we - ##save the file in data/metadata/ped1KG.rds - filePED1KG <- file.path("data", "metadata","ped1KG.rds") - saveRDS(ped, filePED1KG) - - - -## Prepare intermediate files based on 1KG vcf for GDS - -We generate some intermediate files to prepare the data to be import in R GDS file (Bioconductor gdsfmt) - -Generate a file with the SNP position and the frequence in each super population. - -PATHVCF is the path to the vcf files from 1KG -The script generates the matFreq.${chr}.txt -We will change extract1000gFreq.py soon. - - - for i in `ls PATHVCF/*shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz` - do - chr=$(echo $FILECUR|perl -n -e '/ALL\.(chr[^\.]+)/;print $1') - python PATH2SCRIPT/extract1000gFreq.py ${FILECUR} matFreq.${chr} - done - for i in `seq 1 22` - do - cat matFreq.chr${i}.txt >matFreqSNV.txt - bzip2 matFreqSNV.txt - done - - -Generate a list of SNP with frequency higher then a cutoff (here 0.01) for at least 1 super population. - -The file is mapSNVSel.rds - -In R: - - library(RAIDS) - - fileSNV.v <- file.path(PATHGENO, "matFreqSNV.txt.bz2") - fileLSNP.v <- file.path(PATHSEL, "listSNP.rds") - fileFREQ.v <- file.path(PATHSEL, "mapSNVSel.rds") - generateMapSnvSel(cutOff = 0.01, - fileSNV = fileSNV.v, - fileLSNP = fileLSNP.v, - fileFREQ = fileFREQ.v) - - - -## Split the VCF by sample - -TODO - -## Generate the base GDS file with 1KG - - library(RAIDS) - - filePED1KG <- file.path(PATHMETA,"ped1KG.rds") - fileSNV.v <- file.path(PATHGENO, "matFreqSNV.txt.bz2") - fileLSNP.v <- file.path(PATHSEL, "listSNP.rds") - fileFREQ.v <- file.path(PATHSEL, "mapSNVSel.rds") - - fileNameGDS <- "matGeno1000g.gds" - fileGDS <- file.path(PATHGDS, fileNameGDS) - fileIBD <- file.path(PATHMETA,"ibd.All.0.05.rds") - filePart <- file.path(PATHMETA,"part.All.0.05.rds") - - # Generate the base structure of the gds file - generateGDS1KG(PATHGENO = file.path("data", "sampleGeno"), - fileNamePED = filePED1KG, - fileSNPSel = fileFREQ.v, - fileListSNP = fileLSNP.v, - fileNameGDS = fileGDS) - - # Identify the individual related - gds <- snpgdsOpen(fileGDS) - identifyRelative(gds, - maf = 0.05, - thresh = 2^(-11/2), - fileIBD = fileIBD, - filePart = filePart) - closefn.gds(gds) - - # create the ref sample list which correspond - # to the list of sample unrelated - - addRef2GDS1KG(fileGDS, - filePart) - - -## The base GDS for 1KG - - File: fileGDS (15.4G) - + [ ] * - # The id for the genotype - |--+ sample.id { Str8 2548, 19.9K } - # data.frame for the annotation of sample - # for all genotype good for the reference - # samples but sn other data.frame will be - # define to describe the samples - # in the studies - |--+ sample.annot [ data.frame ] * - | |--+ sex { Str8 2548, 5.0K } - # population - | |--+ pop.group { Str8 2548, 10.0K } - # The 5 super populations in - | |--+ superPop { Str8 2548, 10.0K } - | \--+ batch { Float64 2548, 19.9K } - |--+ snp.id { Str8 24542710, 223.5M } - |--+ snp.chromosome { UInt16 24542710, 46.8M } - |--+ snp.position { Int32 24542710, 93.6M } - |--+ snp.allele { Str8 24542710, 93.6M } - |--+ snp.AF { PackedReal24 24542710, 70.2M } - |--+ snp.EAS_AF { PackedReal24 24542710, 70.2M } - |--+ snp.EUR_AF { PackedReal24 24542710, 70.2M } - |--+ snp.AFR_AF { PackedReal24 24542710, 70.2M } - |--+ snp.AMR_AF { PackedReal24 24542710, 70.2M } - |--+ snp.SAS_AF { PackedReal24 24542710, 70.2M } - |--+ genotype { Bit2 24542710x2548, 14.6G } - \--+ sample.ref { Bit1 2548, 319B } - - diff --git a/doc/README_GDS.struct.md b/doc/README_GDS.struct.md deleted file mode 100644 index 63d8ef406..000000000 --- a/doc/README_GDS.struct.md +++ /dev/null @@ -1,106 +0,0 @@ -# GDS file structure - - -## The base for 1KG - - File: /mnt/wigclust5/data/unsafe/belleau/process1000G/samples1000gUnrelated/data/genoGDS1KG.2022.03.22/matGeno1000g.gds (15.4G) - + [ ] * - |--+ sample.id { Str8 2548, 19.9K } - |--+ sample.annot [ data.frame ] * - | |--+ sex { Str8 2548, 5.0K } - | |--+ pop.group { Str8 2548, 10.0K } - | |--+ superPop { Str8 2548, 10.0K } - | \--+ batch { Float64 2548, 19.9K } - |--+ snp.id { Str8 24542710, 223.5M } - |--+ snp.chromosome { UInt16 24542710, 46.8M } - |--+ snp.position { Int32 24542710, 93.6M } - |--+ snp.allele { Str8 24542710, 93.6M } - |--+ snp.AF { PackedReal24 24542710, 70.2M } - |--+ snp.EAS_AF { PackedReal24 24542710, 70.2M } - |--+ snp.EUR_AF { PackedReal24 24542710, 70.2M } - |--+ snp.AFR_AF { PackedReal24 24542710, 70.2M } - |--+ snp.AMR_AF { PackedReal24 24542710, 70.2M } - |--+ snp.SAS_AF { PackedReal24 24542710, 70.2M } - |--+ genotype { Bit2 24542710x2548, 14.6G } - \--+ sample.ref { Bit1 2548, 319B } - -If we add the information relative to specific studies we want to -infer the ancestry. - - File: /mnt/wigclust15/data/unsafe/belleau/TCGA-OV-WXS/data/genoGDS1KG.TCGA-OV.2022.03.30/matGeno1000g.gds (15.5G) - + [ ] * - |--+ sample.id { Str8 2558, 20.2K } - |--+ sample.annot [ data.frame ] * - | |--+ sex { Str8 2558, 5.0K } - | |--+ pop.group { Str8 2558, 10.2K } - | |--+ superPop { Str8 2558, 10.0K } - | \--+ batch { Float64 2558, 20.0K } - |--+ snp.id { Str8 24516859, 223.2M } - |--+ snp.chromosome { UInt16 24516859, 46.8M } - |--+ snp.position { Int32 24516859, 93.5M } - |--+ snp.allele { Str8 24516859, 93.5M } - |--+ snp.AF { PackedReal24 24516859, 70.1M } - |--+ snp.EAS_AF { PackedReal24 24516859, 70.1M } - |--+ snp.EUR_AF { PackedReal24 24516859, 70.1M } - |--+ snp.AFR_AF { PackedReal24 24516859, 70.1M } - |--+ snp.AMR_AF { PackedReal24 24516859, 70.1M } - |--+ snp.SAS_AF { PackedReal24 24516859, 70.1M } - |--+ genotype { Bit2 24516859x2558, 14.6G } - |--+ sample.ref { Bit1 2548, 319B } - |--+ study.offset { Int32 1, 4B } - |--+ study.list [ data.frame ] * - | |--+ study.id { Str8 1, 14B } - | |--+ study.desc { Str8 1, 16B } - | \--+ study.platform { Str8 1, 4B } - \--+ study.annot [ data.frame ] * - |--+ data.id { Str8 10, 170B } - |--+ case.id { Str8 10, 130B } - |--+ sample.type { Str8 10, 140B } - |--+ diagnosis { Str8 10, 20B } - |--+ source { Str8 10, 60B } - \--+ study.id { Str8 10, 140B } - - -## The phase GDS for 1KG - -It includes the linkage disequilibrium and the gene info -for the allelic of the RNA seq (Not show yet) - - File: /mnt/wigclust6/data/unsafe/belleau/process1000G/samples1000gUnrelated/data/genoGDS1KG.2022.04.03/matPhase1000g.gds (5.1G) - + [ ] - \--+ phase { Bit2 24516859x2548 LZ4_ra(35.0%), 5.1G } - - -## A group of GDS specific to each sample we want to infor - -Principaly use for the simulation - - File: /mnt/wigclust15/data/unsafe/belleau/TCGA-OV-WXS/data/genoGDS1KG.TCGA-OV.samples/TCGA-25-2404-01A-01W-0799-08.gds (164.1M) - + [ ] - |--+ Ref.count { SparseInt16 24516859x1, 7.2M } - |--+ Alt.count { SparseInt16 24516859x1, 1.0M } - |--+ Total.count { SparseInt16 24516859x1, 7.6M } - |--+ pruned.study { Str8 237908, 2.2M } - |--+ sampleStudy { Str8 1, 29B } - |--+ sample.id { Str8 2470, 19.3K } - |--+ snp.id { Str8 237908, 2.2M } - |--+ snp.chromosome { Int32 237908, 929.3K } - |--+ snp.position { Int32 237908, 929.3K } - |--+ snp.index { Int32 237908, 929.3K } - |--+ genotype { Bit2 237908x2470, 140.1M } - |--+ SamplePos { Float64 1, 8B } - |--+ lap { PackedReal8 237908, 232.3K } - \--+ segment { UInt32 237908, 929.3K } - - -## A GDS phase specific for a study with with a smaller number of SNV - -The snp.id are the union the snp.id of GDSSample - -File: /mnt/wigclust15/data/unsafe/belleau/TCGA-OV-WXS/data/genoGDS1KG.TCGA-OV.samples/phase1KG.gds (434.6M) -+ [ ] -|--+ snp.id { Str8 722473, 6.6M } -|--+ snp.index { Int32 722473, 2.8M } -\--+ phase { Bit2 722473x2469, 425.3M } - - diff --git a/doc/README_StudyInit.md b/doc/README_StudyInit.md deleted file mode 100644 index a9d3eb4ce..000000000 --- a/doc/README_StudyInit.md +++ /dev/null @@ -1,257 +0,0 @@ -# Initialize a study - -You need a mapped BAM with the same genome than the -reference here (hg38 for 1KG) -for each sample. - -## Extract the read at each in the GDS - -We used here snp-pileup from Facet - -First you a vcf file with the SNV you want to keep - -## Create a VCF file with the SNV you want tokeep -In R - library(RAIDS) - - gds <- snpgdsOpen(fileGDS1kg) - snvListVCF(gds, fileOUT, offset = 1) - closefn.gds(gds) - -You should compress and indexing the vcf file -You need to install [HTSlib](http://www.htslib.org/download/) - -in a terminal - - bgzip fileOUT - tabix -p vcf fileOUT.gz - - - -You need a ped file with the column: - -"sample.id"* Id of the sample but the sample can be genotyped more than one -"sex"* -"pop.group"* can be self declare or something else but must be there -"superPop"* can be self declare or something else but must be there -"Sample.Type"* Ex Primary Tumor or Blood Derived Normal -"Diagnosis"* C or N (cancer normal) this is for the sample not the patient -"Source"* The tissue ex Ovary -"Recurrent" -"Case.ID" Patient identifiant -"Name.ID"* This the unique id of the table -"batch" Number - -You need the extra code in the package Facet to extract the SNV -[snp-pileup](https://github.com/mskcc/facets/tree/master/inst/extcode) - -The output from snp-pileup should be Name.ID.txt -(snp-pileup add .gz to the filename) - - - snp-pileup -g -d5000 -q15 -Q20 -r0 VCFGenerateAbove snvSel0.01.vcf.gz OUTPUT.txt FILEBAM.bam - - - -TODO change the path to something generic - -You add the genotype call from the SNP-pileup to -the gds 1KG and create a GDS file for each sample. -with the covereage information - -in R: - - library(RAIDS) - - PATHGENO <- file.path("data", "snpCancer") - PATHMETA <- file.path("data", "metadata") - PATHGDS <- file.path("data", "genoGDS1000gAF.All.TCGA_OV.WXS") - - fileNamePED <- "pedTCGA_OV_WXS_C.rds" - fileNameGDS <- "matGeno1000g.gds" - - filePED <- file.path(PATHMETA, fileNamePED) - fileNameGDS <- file.path(PATHGDS, fileNameGDS) - - ped <- readRDS(filePED) - - studyDF <- data.frame(study.id = "TCGA-OV.WXS.C", - study.desc = "Ovarian example", - study.platform = "WXS", - stringsAsFactors = FALSE) - - listSamples <- ped[, "Name.ID"] - appendStudy2GDS1KG(PATHGENO = PATHGENO, - fileNamePED = filePED, - fileNameGDS = fileNameGDS, - listSamples = listSamples, - studyDesc = studyDF, - batch = 1) - - - - - -The gds per sample at this step - + [ ] - |--+ Ref.count { SparseInt16 24516859x1, 7.2M } - |--+ Alt.count { SparseInt16 24516859x1, 1.0M } - |--+ Total.count { SparseInt16 24516859x1, 7.6M } - |--+ sampleStudy { Str8 1, 29B } - -It the coverage it define at each position in the GDS 1KG - -We select a subset of SNV pruned SNV - -I include a R script with the possibility to run it in parallele -on mutliple instance. - - Rscript ${PATHRSCRIPT}/runPruningStudy1KG.R ${PATHPKG} ${PATHGDS} matGeno1000g.gds $SGE_TASK_ID $PATHOUT $FILEPREF $STUDYNAME - -or in R - - library(RAIDS) - - # fileNameGDS with the sample and 1KG genotype - # listSamples list of sample.id from the study - # PATHSAMPLEGDS is the path where the gds specific - # to the sample is created - - gds <- snpgdsOpen(fileNameGDS) - - for(i in seq_len(length(listSamples))){ - print(system.time(pruned <- pruningSample(gds=gds, - method="corr", - sampleCurrent = listSamples[i], - listSNP = NULL, - slide.max.bp.v = 5e5, - ld.threshold.v=sqrt(0.1), - np = 1, - verbose.v=FALSE, - chr = NULL, - minAF.SuperPop = NULL, - keepGDSpruned = TRUE, - PATHSAMPLEGDS = PATHSAMPLEGDS, - keepFile = FALSE))) - } - - Sys.time() - closefn.gds(gds) - -The function pruningSample add the snp.id of the snp selected -in the pruning process to the gds Sample - - + [ ] - |--+ Ref.count { SparseInt16 24516859x1, 7.2M } - |--+ Alt.count { SparseInt16 24516859x1, 1.0M } - |--+ Total.count { SparseInt16 24516859x1, 7.6M } - |--+ sampleStudy { Str8 1, 29B } - |--+ pruned.study { Str8 237908, 2.2M } - - -TODO -Add genotype only for the snp selected in pruning to gdsSample - - library(RAIDS) - - # fileNameGDS1KG the file to the gds1KG - # PATHSAMPLEGDS the path where the files gdsSample - - gds <- snpgdsOpen(fileNameGDS1KG) - listGDSSample <- dir(PATHSAMPLEGDS, pattern = ".+.gds") - - for(gdsSampleFile in listGDSSample){ - print(system.time(add1KG2SampleGDS(gds, file.path(PATHSAMPLEGDS, gdsSampleFile) ))) - } - - -see GDSSample - - + [ ] - # Ref.count, Alt.count, Total.count are - # all the snv from 1KG - |--+ Ref.count { SparseInt16 24516859x1, 7.2M } - |--+ Alt.count { SparseInt16 24516859x1, 1.0M } - |--+ Total.count { SparseInt16 24516859x1, 7.6M } - # Genotype from pruning - |--+ sampleStudy { Str8 1, 29B } - |--+ pruned.study { Str8 237908, 2.2M } - |--+ sample.id { Str8 2470, 19.3K } - |--+ snp.id { Str8 237908, 2.2M } - |--+ snp.chromosome { Int32 237908, 929.3K } - |--+ snp.position { Int32 237908, 929.3K } - |--+ snp.index { Int32 237908, 929.3K } - |--+ genotype { Bit2 237908x2470, 140.1M } - |--+ SamplePos { Float64 1, 8B } - |--+ lap { PackedReal8 237908, 232.3K } - \--+ segment { UInt32 237908, 929.3K } # not sure I keep it - - -Section annotation study - |--+ study.list [ data.frame ] * - | |--+ study.id { Str8 1, 9B } - | |--+ study.desc { Str8 1, 13B } - | \--+ study.platform { Str8 1, 4B } - \--+ study.annot [ data.frame ] * - |--+ data.id { Str8 188, 1.1K } - |--+ case.id { Str8 188, 1.1K } - |--+ sample.type { Str8 188, 376B } - |--+ diagnosis { Str8 188, 376B } - |--+ source { Str8 188, 1.7K } - \--+ study.id { Str8 188, 1.7K } -Not yet the good struct for option 1 and 2 - -option 1 all the hetero from ex varscan - - |--+ Ref.count.o { SparseInt16 24516859x1, 7.2M } - |--+ Alt.count.o { SparseInt16 24516859x1, 1.0M } - |--+ Total.count.o { SparseInt16 24516859x1, 7.6M } - |--+ snp.id.o { Str8 237908, 2.2M } - |--+ snp.allele.o { Str8 237908, 2.2M } - |--+ snp.chromosome.o { Int32 237908, 929.3K } - |--+ snp.position.o { Int32 237908, 929.3K } - |--+ normal.geno { Bit2 237908x2470, 140.1M } - |--+ cancer.geno { Bit2 237908x2470, 140.1M } - -option 2 all the hetero in normal from ex varscan we extract -the coverage in cancer at the position where the hetero -are (no overlap with anp.id) - - |--+ Ref.count.N { SparseInt16 24516859x1, 7.2M } - |--+ Alt.count.N { SparseInt16 24516859x1, 1.0M } - |--+ Total.count.N { SparseInt16 24516859x1, 7.6M } - |--+ snp.id.N { Str8 237908, 2.2M } - |--+ snp.chromosome.N { Int32 237908, 929.3K } - |--+ snp.position.N { Int32 237908, 929.3K } - -Block - - File: /mnt/wigclust5/data/unsafe/belleau/process1000G/samples1000gUnrelated/data/genoGDS1KG.2022.04.18/testBlock.gds (69.2M) - + [ ] - |--+ block.annot [ data.frame ] * - | |--+ block.id { Str8 2, 28B } - | \--+ block.desc { Str8 2, 108B } - \--+ block { Int32 9076010x2 LZ4_ra(3.75%), 2.6M } - - -We create another GDS with the snp.id, -snp.index the index in GDS of the snp in at least one sample -phase information for the snp.id of the 1KG -in R - - - fileGDS1KG <- file.path(PATH1KG, PATHGDS, "matGeno1000g.gds") - gds <- openfn.gds(fileGDS1KG) - fileGDSPhase <- file.path(PATH1KG, PATHGDS, "matPhase1000g.gds") - gdsPhase <- openfn.gds(fileGDSPhase) - - PATHSAMPLEGDS <- file.path("data", "genoGDS1KG.TCGA-OV.samples") - - - addPhase1KG2SampleGDSFromGDS(gds, - gdsPhase, - PATHSAMPLEGDS) - closefn.gds(gdsPhase) - closefn.gds(gds) - - diff --git a/inst/NEWS.md b/inst/NEWS.md index 57a1471f2..dfc113acf 100644 --- a/inst/NEWS.md +++ b/inst/NEWS.md @@ -1,3 +1,129 @@ +CHANGES IN VERSION 1.5.1 +------------------------ + +SIGNIFICANT USER-VISIBLE CHANGES + + o Integration of Rsamtools + +CHANGES IN VERSION 1.3.3 +------------------------ + +SIGNIFICANT USER-VISIBLE CHANGES + + o More comprehensive vignette + +CHANGES IN VERSION 1.3.2 +------------------------ + +SIGNIFICANT USER-VISIBLE CHANGES + + o New functions inferAncestry(), inferAncestryGeneAware() and getRefSuperPop() to simplify ancestry inference + +CHANGES IN VERSION 1.3.1 +------------------------ + +SIGNIFICANT USER-VISIBLE CHANGES + + o New function createAccuracyGraph() that creates a graphic representation of the accuracy for different values of PCA dimensions and K-neighbors through all tested ancestries. + +CHANGES IN VERSION 0.99.15 +------------------------ + +SIGNIFICANT USER-VISIBLE CHANGES + + o Updating installation section in vignette. + +CHANGES IN VERSION 0.99.14 +------------------------ + +SIGNIFICANT USER-VISIBLE CHANGES + + o Adding missing author David Tuveson. + o Updating BiocViews terms. + +CHANGES IN VERSION 0.99.13 +------------------------ + +SIGNIFICANT USER-VISIBLE CHANGES + + o Update in Reference GDS vignette. + +CHANGES IN VERSION 0.99.12 +------------------------ + +SIGNIFICANT USER-VISIBLE CHANGES + + o Seven new loadable objects are available in the package. + o The new readSNVVCF() function enable the use of VCF SNP files as input for the runExomeAncestry() and runRNAAncestry() functions. + + +CHANGES IN VERSION 0.99.11 +------------------------ + +SIGNIFICANT USER-VISIBLE CHANGES + + o Update main vignette. + +CHANGES IN VERSION 0.99.10 +------------------------ + +SIGNIFICANT USER-VISIBLE CHANGES + + o Update main vignette. + +CHANGES IN VERSION 0.99.9 +------------------------ + +SIGNIFICANT USER-VISIBLE CHANGES + + o Better documentation for the runRNAAncestry() function. + + +CHANGES IN VERSION 0.99.8 +------------------------ + +SIGNIFICANT USER-VISIBLE CHANGES + + o Some examples have been updated in the documentation. + + +CHANGES IN VERSION 0.99.7 +------------------------ + +SIGNIFICANT USER-VISIBLE CHANGES + + o The new runRNAAncestry() function executes most steps leading to the ancestry inference call on a specific RNA profile. + o A vignette describing the content of the Reference GDS files has been created. + o More parameter names have been changed to follow the camelCase style. + + +CHANGES IN VERSION 0.99.6 +------------------------ + +SIGNIFICANT USER-VISIBLE CHANGES + + o The vignette now referes to the generic formatted reference GDS rather than 1KG GDS file to showcase that the software is not dependant of the 1KG GDS file. Any refence dataset can be used as long as the dataset is formatted into a GDS file. + + +CHANGES IN VERSION 0.99.5 +------------------------ + +SIGNIFICANT USER-VISIBLE CHANGES + + o The function documentation has been improved. + o New vignette has been created. The vignette covers the steps done by the runExomeAncestry() function. + + +CHANGES IN VERSION 0.99.4 +------------------------ + +SIGNIFICANT USER-VISIBLE CHANGES + + o More parameter names have been changed to follow the camelCase style. + o The function documentation has been improved. + o A wrapper function runExomeAncestry() is now available. + + CHANGES IN VERSION 0.99.3 ------------------------ diff --git a/inst/extdata/1KG_Demo.gds b/inst/extdata/1KG_Demo.gds deleted file mode 100644 index 5338fcaee..000000000 Binary files a/inst/extdata/1KG_Demo.gds and /dev/null differ diff --git a/inst/extdata/1KG_Demo_with_sampleREF.gds b/inst/extdata/1KG_Demo_with_sampleREF.gds deleted file mode 100644 index dbb7375e7..000000000 Binary files a/inst/extdata/1KG_Demo_with_sampleREF.gds and /dev/null differ diff --git a/inst/extdata/PedigreeDemo.rds b/inst/extdata/PedigreeReferenceDemo.rds similarity index 100% rename from inst/extdata/PedigreeDemo.rds rename to inst/extdata/PedigreeReferenceDemo.rds diff --git a/inst/extdata/tests/1KG_Test.gds b/inst/extdata/PopulationReferenceDemo.gds similarity index 51% rename from inst/extdata/tests/1KG_Test.gds rename to inst/extdata/PopulationReferenceDemo.gds index 5338fcaee..737eb19d1 100644 Binary files a/inst/extdata/tests/1KG_Test.gds and b/inst/extdata/PopulationReferenceDemo.gds differ diff --git a/inst/extdata/PopulationReferenceSNVAnnotationDemo.gds b/inst/extdata/PopulationReferenceSNVAnnotationDemo.gds new file mode 100644 index 000000000..159d01b80 Binary files /dev/null and b/inst/extdata/PopulationReferenceSNVAnnotationDemo.gds differ diff --git a/inst/extdata/README.txt b/inst/extdata/README.txt new file mode 100644 index 000000000..92f35a024 --- /dev/null +++ b/inst/extdata/README.txt @@ -0,0 +1,95 @@ +############################################################################### +## README file +## +## This file describes the objects present in the inst/extdata directory. +############################################################################### + + +############################################################################### +## extdata/demoGenoChr.tar +############################################################################### + +The files present in the extdata/demoGenoChr.tar files are the first +100 genotypes of 3 reference profiles (NA12003, NA12004 and NA12005) for +each chromosome. The genotypes are directly extracted from the 1KG VCF files +that is available at: + http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV + + +############################################################################### +## tests/ex1_good_small_1KG.gds +## tests/ex1_good_small_1KG_Annot.gds +############################################################################### + +Both files are based on subset of the genotypes from 1000 Genomes described in: + https://wellcomeopenresearch.org/articles/4-50/ + +The 500 SNVs from each chromosome habe been retained (11000 for the +22 autosomal chromosomes). For each of the 26 sub-continental populations +in this version of 1000 Genomes, 6 profiles have been randomly selected +(total of 156 profiles) to generate those reference GDS files. + + +############################################################################### +## example/snpPileup/ex1.vcf.gz +## example/snpPileup/ex1.generic.txt.gz +## example/snpPileup/ex1.txt.gz +## example/snpPileupRNA/ex1.vcf.gz +## example/snpPileupRNA/ex1.generic.txt.gz +## example/snpPileupRNA/ex1.txt.gz +## demoAncestryCall/ex1.gds +## demoKNNSynthetic/ex1.gds +############################################################################### + +This demonstration profile contains genotype from a synthetic profile +generated using a reference profile from 1000 Genome randomly selected +with an AFR ancestry (this profile is not present in the reference GDS file +tests/ex1_good_small_1KG.gds) and one PDAC cancer exome profile +from Tiriac et al 2018 (this exome is restricted access in dbGAP; it was +only use to extract the total read deep at the SNV positions and to estimate +allelic fraction of the regions). Only 500 SNVs per chromosome have been kept. +The pipeline to generate the synthetic data are describe in Belleau et al 2023. + + +############################################################################### +## demoPorfileGenotypes/HG00*.csv.bz2 +############################################################################### + +Each file contains the first 10 SNVs from chromosome 1 of profiles +present in 1000 Genomes as described in: + https://wellcomeopenresearch.org/articles/4-50/ + + +############################################################################### +## matFreqSNV_Demo.txt.bz2 +############################################################################### + +This is a text file with the frequency for the 5 super populations of the +10 first SNVs from chromosome 1 in 1000 Genomes as described in: + https://wellcomeopenresearch.org/articles/4-50/ + + +############################################################################### +## listSNPIndexes_Demo.rds +## mapSNVSelected_Demo.rds +############################################################################### + +Those 2 files are created using the information from the +matFreqSNV_Demo.txt.bz2 file. The listSNPIndexes_Demo.rds is an index of the +SNVs that have a frequency for at least one of the 1000 Genomes super-population +bigger than 1%. The mapSNVSelected_Demo.rds is a subset of +matFreqSNV_Demo.txt.bz2 containing the selected SNVs present in +listSNPIndexes_Demo.rds. + + +############################################################################### +## unrelatedPatientsInfo_Demo.rds +############################################################################### + +The file contains the list of unrelated profiles from 1000 Genomes. +The values does not represent the real relation between profiles; the +values have been modified for demonstration purpose. +The profiles from 1000 Genomes are described in: + https://wellcomeopenresearch.org/articles/4-50/ + + diff --git a/inst/extdata/TEST_01.infoCall.RDS b/inst/extdata/TEST_01.infoCall.RDS new file mode 100644 index 000000000..10e2d7170 Binary files /dev/null and b/inst/extdata/TEST_01.infoCall.RDS differ diff --git a/inst/extdata/block.sp.EUR.Ex.chr1.blocks.det b/inst/extdata/block.sp.EUR.Ex.chr1.blocks.det new file mode 100644 index 000000000..3578e69ca --- /dev/null +++ b/inst/extdata/block.sp.EUR.Ex.chr1.blocks.det @@ -0,0 +1,10 @@ + CHR BP1 BP2 KB NSNPS SNPS + 1 51897 51927 0.031 2 s3|s4 + 1 54707 54715 0.009 2 s6|s7 + 1 55544 59039 3.496 2 s14|s15 + 1 61986 66506 4.521 3 s17|s18|s31 + 1 76837 77873 1.037 3 s39|s42|s43 + 1 79771 80140 0.37 2 s48|s49 + 1 82675 86330 3.656 12 s55|s58|s61|s62|s66|s67|s68|s71|s72|s75|s78|s79 + 1 87189 88337 1.149 3 s81|s89|s96 + 1 631489 633328 1.84 3 s150|s159|s160 diff --git a/inst/extdata/create_PedigreeDemoPED.R b/inst/extdata/create_PedigreeDemoPED.R new file mode 100644 index 000000000..048574bdf --- /dev/null +++ b/inst/extdata/create_PedigreeDemoPED.R @@ -0,0 +1,29 @@ + +############################################################ +## How to create the PedigreeDemo.ped file +## This file contains a small pedigree reference data.frame +############################################################ + +pedigreeDemo <- data.frame(Family.ID=c("BB01", "BB01", "BB01", "BB02", "BB02", + "BB02", "BB03", "BB03", "BB03", "BB04"), + Individual.ID=c("HG00100", "HG00101", "HG00102", "HG00103", + "HG00104", "HG00105", "HG00106", "HG00107", "HG00108", + "HG00109"), + Paternal.ID=c("0", "0", "HG00100", "0", "0", "HG00103", + "HG00107", "0", "0", "0"), + Maternal.ID=c("0", "0", "HG00101", "0", "0", "HG00104", + "HG00108", "0", "0", "0"), + Gender=c(1, 2, 2, 1, 2, 1, 2, 1, 2, 2), + Phenotype=rep(0, 10), + Population=rep("ACB", 10), + Relationship=c("father", "mother", "child", "father", + "mother", "child", "child", "father", "mother", + "mother"), + Siblings=rep(0, 10), + Second.Order=rep(0, 10), + Third.Order=rep(0, 10), + Other.Comments=rep(0, 10), + stringsAsFactors=FALSE) + +write.table(pedigreeDemo, file="PedigreeDemo.ped", quote=FALSE, + row.names=FALSE, col.names=TRUE, sep="\t") diff --git a/inst/extdata/create_PedigreeReferenceDemoRDS.R b/inst/extdata/create_PedigreeReferenceDemoRDS.R new file mode 100644 index 000000000..141e11629 --- /dev/null +++ b/inst/extdata/create_PedigreeReferenceDemoRDS.R @@ -0,0 +1,17 @@ + +############################################################ +## How to create the PedigreeReferenceDemo.rds file +## This file contains a small pedigree reference data.frame +############################################################ + +pedigree <- data.frame(sample.id=c("HG00100", "HG00101", "HG00102", + "HG00103", "HG00104", "HG00105", "HG00106", "HG00107", + "HG00108", "HG00109"), + Name.ID=c("HG00100", "HG00101", "HG00102", "HG00103", + "HG00104", "HG00105", "HG00106", "HG00107", + "HG00108", "HG00109"), + sex=c("1", "2", "2", "1", "2", "1", "2", "1", "2", "2"), + pop.group=rep("ABC", 10), superPop=rep("AFR", 10), + batch=rep(0, 10), stringsAsFactors=FALSE) + +saveRDS(pedigree, file="PedigreeReferenceDemo.rds") diff --git a/inst/extdata/create_PopulationReferenceDemoGDS.R b/inst/extdata/create_PopulationReferenceDemoGDS.R new file mode 100644 index 000000000..b79d250d8 --- /dev/null +++ b/inst/extdata/create_PopulationReferenceDemoGDS.R @@ -0,0 +1,126 @@ + +############################################################ +## How to create the PopulationReferenceDemo.gds file +## This file is a small population reference GDS file +## with limited number of samples and SNVs +############################################################ + +## Required librairies +library(SNPRelate) +library(gdsfmt) + +## Create a GDS file +gdsRefNew <- createfn.gds("PopulationReferenceDemo.gds") + +## The entry 'sample.id' contain the unique identifiers of 10 samples +## that constitute the reference dataset +sample.id <- c("HG00100", "HG00101", "HG00102", "HG00103", "HG00104", + "HG00105", "HG00106", "HG00107", "HG00108", "HG00109") +add.gdsn(node=gdsRefNew, name="sample.id", val=sample.id, + storage="string", check=TRUE) + +## A data frame containing the information about the 10 samples +## (in the same order than in the 'sample.id') is created and added to +## the 'sample.annot' entry +## The data frame must contain those columns: +## 'sex': '1'=male, '2'=female +## 'pop.group': acronym for the population (ex: GBR, CDX, MSL, ASW, etc..) +## 'superPop': acronym for the super-population (ex: AFR, EUR, etc...) +## 'batch': number identifying the batch of provenance +sampleInformation <- data.frame(sex=c("1", "1", "1", "1", "1", + "2", "2", "2", "2", "2"), + pop.group=c("GBR", "GIH", "CDX", "GBR", "LWK", + "LWK", "LWK", "GBR", "GIH", "PEL"), + superPop=c("EUR", "SAS", "EAS", "EUR", "AFR", "AFR", "AFR", + "EUR", "SAS", "AMR"), batch=rep(0, 10), stringsAsFactors=FALSE) +add.gdsn(node=gdsRefNew, name="sample.annot", val=sampleInformation, + check=TRUE) + +## The identifier of each SNV is added in the 'snp.id' entry +snvID <- c("s1", "s2", "s3", "s4", "s5", "s6", "s7") +add.gdsn(node=gdsRefNew, name="snp.id", val=snvID, + check=TRUE) + +## The chromosome of each SNV is added to the 'snp.chromosome' entry +## The order of the SNVs is the same than in the 'snp.id' entry +snvChrom <- c(rep(1, 7)) +add.gdsn(node=gdsRefNew, name="snp.chromosome", val=snvChrom, storage="uint16", + check=TRUE) + +## The position on the chromosome of each SNV is added to +## the 'snp.position' entry +## The order of the SNVs is the same than in the 'snp.id' entry +snvPos <- c(16102, 51478, 51897, 51927, 54489, 54707, 54715) +add.gdsn(node=gdsRefNew, name="snp.position", val=snvPos, storage="int32", + check=TRUE) + +## The allele information of each SNV is added to the 'snp.allele' entry +## The order of the SNVs is the same than in the 'snp.allele' entry +snvAllele <- c("T/G", "T/A", "C/A", "G/A", "G/A", "G/C", "C/T") +add.gdsn(node=gdsRefNew, name="snp.allele", val=snvAllele, storage="string", + check=TRUE) + +## The allele frequency in the general population (between 0 and 1) of each +## SNV is added to the 'snp.AF' entry +## The order of the SNVs is the same than in the 'snp.id' entry +snvAF <- c(0.02, 0.11, 0.08, 0.07, 0.10, 0.23, 0.21) +add.gdsn(node=gdsRefNew, name="snp.AF", val=snvAF, storage="packedreal24", + check=TRUE) + +## The allele frequency in the East Asian population (between 0 and 1) of each +## SNV is added to the 'snp.EAS_AF' entry +## The order of the SNVs is the same than in the 'snp.id' entry +snvAF <- c(0.00, 0.00, 0.05, 0.01, 0.00, 0.08, 0.07) +add.gdsn(node=gdsRefNew, name="snp.EAS_AF", val=snvAF, storage="packedreal24", + check=TRUE) + +## The allele frequency in the European population (between 0 and 1) of each +## SNV is added to the 'snp.EUR_AF' entry +## The order of the SNVs is the same than in the 'snp.id' entry +snvAF <- c(0.04, 0.20, 0.12, 0.14, 0.18, 0.38, 0.34) +add.gdsn(node=gdsRefNew, name="snp.EUR_AF", val=snvAF, storage="packedreal24", + check=TRUE) + +## The allele frequency in the African population (between 0 and 1) of each +## SNV is added to the 'snp.AFR_AF' entry +## The order of the SNVs is the same than in the 'snp.id' entry +snvAF <- c(0.03, 0.02, 0.05, 0.06, 0.02, 0.18, 0.16) +add.gdsn(node=gdsRefNew, name="snp.AFR_AF", val=snvAF, storage="packedreal24", + check=TRUE) + +## The allele frequency in the American population (between 0 and 1) of each +## SNV is added to the 'snp.AMR_AF' entry +## The order of the SNVs is the same than in the 'snp.id' entry +snvAF <- c(0.03, 0.12, 0.06, 0.07, 0.10, 0.25, 0.24) +add.gdsn(node=gdsRefNew, name="snp.AMR_AF", val=snvAF, storage="packedreal24", + check=TRUE) + +## The allele frequency in the South Asian population (between 0 and 1) of each +## SNV is added to the 'snp.SAS_AF' entry +## The order of the SNVs is the same than in the 'snp.id' entry +snvAF <- c(0.02, 0.22, 0.10, 0.09, 0.21, 0.28, 0.27) +add.gdsn(node=gdsRefNew, name="snp.SAS_AF", val=snvAF, storage="packedreal24", + check=TRUE) + +## The genotype of each SNV for each sample is added to the 'genotype' entry +## The genotype correspond to the number of A alleles +## The rows represent the SNVs is the same order than in 'snp.id' entry +## The columns represent the samples is the same order than in 'sample.id' entry +genotypeInfo <- matrix(data=c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, + 1, 1, 2, 0, 0, 0, 1, 1, 0, 0, + 1, 1, 2, 0, 0, 0, 1, 1, 0, 0, + 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, + 1, 1, 2, 0, 1, 1, 1, 1, 0, 0, + 1, 1, 1, 0, 1, 1, 1, 1, 0, 0), ncol=10, + byrow=TRUE) +add.gdsn(node=gdsRefNew, name="genotype", val=genotypeInfo, + storage="bit2", check=TRUE) + +## The entry 'sample.ref' is filled with 0 or 1 indicating that 8 samples out +## of 10 samples are retained to be used as reference +## The order of the samples is the same than in the 'sample.id' entry +add.gdsn(node=gdsRefNew, name="sample.ref", val=c(rep(1L, 7), 0, 0, 1), + storage="bit1", check=TRUE) + +closefn.gds(gdsRefNew) diff --git a/inst/extdata/create_PopulationReferenceSNVAnnotationDemoGDS.R b/inst/extdata/create_PopulationReferenceSNVAnnotationDemoGDS.R new file mode 100644 index 000000000..0e7712a4c --- /dev/null +++ b/inst/extdata/create_PopulationReferenceSNVAnnotationDemoGDS.R @@ -0,0 +1,64 @@ + +###################################################################### +## How to create the PopulationReferenceSNVAnnotationDemo.gds file +## This file is a small population reference SNV Annotation GDS file +## with limited number of samples and SNVs +###################################################################### + +## Required librairies +library(SNPRelate) +library(gdsfmt) + +## Create a GDS file +gdsRefNew <- createfn.gds("PopulationReferenceSNVAnnotationDemo.gds") + +## The entry 'phase' contain the phase of the SNVs in the +## Population Annotation GDS file +## 0 means the first allele is a reference; 1 means the first allele is +## the alternative and 3 means unknown +## The SNVs (rows) and samples (columns) in phase are in the same order as +## in the Population Annotation GDS file. +phase <- matrix(data=c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, + 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, + 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, + 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, + 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, + 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 0, 1, 1, 0, 1, 1, 1, 1), ncol=10, byrow=TRUE) +add.gdsn(node=gdsRefNew, name="phase", val=phase, storage="bit2", check=TRUE) + +## The entry 'blockAnnot' contain the information for each group of blocks +## that are present in the 'block' entry. +blockAnnot <- data.frame(block.id=c("EAS.0.05.500k", "EUR.0.05.500k", + "AFR.0.05.500k", "AMR.0.05.500k", "SAS.0.05.500k"), + block.desc=c( + "EAS populationblock base on SNP 0.05 and windows 500k", + "EUR populationblock base on SNP 0.05 and windows 500k", + "AFR populationblock base on SNP 0.05 and windows 500k", + "AMR populationblock base on SNP 0.05 and windows 500k", + "SAS populationblock base on SNP 0.05 and windows 500k"), + stringsAsFactors=FALSE) +add.gdsn(node=gdsRefNew, name="block.annot", val=blockAnnot, check=TRUE) + +## The entry 'block' contain the block information for the SNVs in the +## Population Annotation GDS file. +## The SNVs (rows) are in the same order as in +## the Population Annotation GDS file. +## The block groups (columns) are in the same order as in +## the 'block.annot' entry. +block <- matrix(data=c(-1, -1, -1, -1, -1, + -2, -2, 1, -2, -2, + -2, 1, 1, 1, -2, + -2, 1, 1, 1, -2, + -2, -3, -2, -3, -2, + 1, 2, 2, 2, 1, + 1, 2, 2, 2, 1, + -3, -4, -3, -4, -3, + 2, -4, 3, -4, -3, + 2, -4, 3, -4, -3), ncol=5, byrow=TRUE) +add.gdsn(node=gdsRefNew, name="block", val=block, storage="int32", check=TRUE) + +closefn.gds(gdsRefNew) diff --git a/inst/extdata/create_listSNPIndexes_DemoRDS.R b/inst/extdata/create_listSNPIndexes_DemoRDS.R new file mode 100644 index 000000000..d1885fe30 --- /dev/null +++ b/inst/extdata/create_listSNPIndexes_DemoRDS.R @@ -0,0 +1,10 @@ + +############################################################ +## How to create the listSNPIndexes_Demo.rds file +## This file contains a vector containing indexes +## of the retained SNVs +############################################################ + +retainedSNVs <- c(1, 2, 3, 4, 6, 8, 9) + +saveRDS(retainedSNVs, file="listSNPIndexes_Demo.rds") diff --git a/inst/extdata/create_mapSNVSelected_DemoRDS.R b/inst/extdata/create_mapSNVSelected_DemoRDS.R new file mode 100644 index 000000000..dc6e5e896 --- /dev/null +++ b/inst/extdata/create_mapSNVSelected_DemoRDS.R @@ -0,0 +1,20 @@ + +############################################################ +## How to create the mapSNVSelected_Demo.rds file +## This file contains a data.frame with the SNV allelic +## frequency in the continental populations +############################################################ + +selectedSNVs <- data.frame(CHROM=rep("chr1", 7), + POS=c(16102, 51478, 51897, 51927, 54489, 54707, 54715), + REF=c("T", "T", "C", "G", "G", "G", "C"), + ALT=c("G", "A", "A", "A", "A", "C", "T"), + AF=c(0.02, 0.11, 0.08, 0.07, 0.1, 0.23, 0.21), + EAS_AF=c(0.0, 0.0, 0.05, 0.01, 0.0, 0.08, 0.07), + EUR_AF=c(0.04, 0.2, 0.12, 0.14, 0.18, 0.38, 0.34), + AFR_AF=c(0.03, 0.02, 0.05, 0.06, 0.02, 0.18, 0.16), + AMR_AF=c(0.03, 0.12, 0.06, 0.07, 0.1, 0.25, 0.24), + SAS_AF=c(0.02, 0.22, 0.1, 0.09, 0.21, 0.28, 0.27), + stringsAsFactors=FALSE) + +saveRDS(selectedSNVs, file="mapSNVSelected_Demo.rds") diff --git a/inst/extdata/create_unrelatedPatientsInfo_DemoRDS.R b/inst/extdata/create_unrelatedPatientsInfo_DemoRDS.R new file mode 100644 index 000000000..189defb7c --- /dev/null +++ b/inst/extdata/create_unrelatedPatientsInfo_DemoRDS.R @@ -0,0 +1,13 @@ + +############################################################ +## How to create the unrelatedPatientsInfo_Demo.rds file +## This file contains a list with the unrelated/related +## status for Reference samples +############################################################ + +unrelatedPatients <- list() +unrelatedPatients$rels <- c("HG00109", NA) + +unrelatedPatients$unrels <- rep(NA, 9) + +saveRDS(unrelatedPatients, file="unrelatedPatientsInfo_Demo.rds") diff --git a/inst/extdata/demoAncestryCall/listPCASample.RDS b/inst/extdata/demoAncestryCall/listPCASample.RDS deleted file mode 100644 index 34f2fa9c4..000000000 Binary files a/inst/extdata/demoAncestryCall/listPCASample.RDS and /dev/null differ diff --git a/inst/extdata/demoAncestryCall/matKNN.RDS b/inst/extdata/demoAncestryCall/matKNN.RDS deleted file mode 100644 index 83e79b3f6..000000000 Binary files a/inst/extdata/demoAncestryCall/matKNN.RDS and /dev/null differ diff --git a/inst/extdata/demoAncestryCall/pedSyn.RDS b/inst/extdata/demoAncestryCall/pedSyn.RDS deleted file mode 100644 index b26477920..000000000 Binary files a/inst/extdata/demoAncestryCall/pedSyn.RDS and /dev/null differ diff --git a/inst/extdata/demoGenoChr/demoGenoChr.tar b/inst/extdata/demoGenoChr/demoGenoChr.tar new file mode 100644 index 000000000..e17204e30 Binary files /dev/null and b/inst/extdata/demoGenoChr/demoGenoChr.tar differ diff --git a/inst/extdata/demoKNNSynthetic/knownSuperPop1KG.RDS b/inst/extdata/demoKNNSynthetic/knownSuperPop1KG.RDS deleted file mode 100644 index a58a35638..000000000 Binary files a/inst/extdata/demoKNNSynthetic/knownSuperPop1KG.RDS and /dev/null differ diff --git a/inst/extdata/demoKNNSynthetic/pca1KG.RDS b/inst/extdata/demoKNNSynthetic/pca1KG.RDS deleted file mode 100644 index f19a17d9b..000000000 Binary files a/inst/extdata/demoKNNSynthetic/pca1KG.RDS and /dev/null differ diff --git a/inst/extdata/demoKNNSynthetic/pcaSynthetic.RDS b/inst/extdata/demoKNNSynthetic/pcaSynthetic.RDS deleted file mode 100644 index 8ae11a9c7..000000000 Binary files a/inst/extdata/demoKNNSynthetic/pcaSynthetic.RDS and /dev/null differ diff --git a/inst/extdata/HG00100.csv.bz2 b/inst/extdata/demoProfileGenotypes/HG00100.csv.bz2 similarity index 100% rename from inst/extdata/HG00100.csv.bz2 rename to inst/extdata/demoProfileGenotypes/HG00100.csv.bz2 diff --git a/inst/extdata/HG00101.csv.bz2 b/inst/extdata/demoProfileGenotypes/HG00101.csv.bz2 similarity index 100% rename from inst/extdata/HG00101.csv.bz2 rename to inst/extdata/demoProfileGenotypes/HG00101.csv.bz2 diff --git a/inst/extdata/HG00102.csv.bz2 b/inst/extdata/demoProfileGenotypes/HG00102.csv.bz2 similarity index 100% rename from inst/extdata/HG00102.csv.bz2 rename to inst/extdata/demoProfileGenotypes/HG00102.csv.bz2 diff --git a/inst/extdata/HG00103.csv.bz2 b/inst/extdata/demoProfileGenotypes/HG00103.csv.bz2 similarity index 100% rename from inst/extdata/HG00103.csv.bz2 rename to inst/extdata/demoProfileGenotypes/HG00103.csv.bz2 diff --git a/inst/extdata/HG00104.csv.bz2 b/inst/extdata/demoProfileGenotypes/HG00104.csv.bz2 similarity index 100% rename from inst/extdata/HG00104.csv.bz2 rename to inst/extdata/demoProfileGenotypes/HG00104.csv.bz2 diff --git a/inst/extdata/HG00105.csv.bz2 b/inst/extdata/demoProfileGenotypes/HG00105.csv.bz2 similarity index 100% rename from inst/extdata/HG00105.csv.bz2 rename to inst/extdata/demoProfileGenotypes/HG00105.csv.bz2 diff --git a/inst/extdata/HG00106.csv.bz2 b/inst/extdata/demoProfileGenotypes/HG00106.csv.bz2 similarity index 100% rename from inst/extdata/HG00106.csv.bz2 rename to inst/extdata/demoProfileGenotypes/HG00106.csv.bz2 diff --git a/inst/extdata/HG00107.csv.bz2 b/inst/extdata/demoProfileGenotypes/HG00107.csv.bz2 similarity index 100% rename from inst/extdata/HG00107.csv.bz2 rename to inst/extdata/demoProfileGenotypes/HG00107.csv.bz2 diff --git a/inst/extdata/HG00108.csv.bz2 b/inst/extdata/demoProfileGenotypes/HG00108.csv.bz2 similarity index 100% rename from inst/extdata/HG00108.csv.bz2 rename to inst/extdata/demoProfileGenotypes/HG00108.csv.bz2 diff --git a/inst/extdata/HG00109.csv.bz2 b/inst/extdata/demoProfileGenotypes/HG00109.csv.bz2 similarity index 100% rename from inst/extdata/HG00109.csv.bz2 rename to inst/extdata/demoProfileGenotypes/HG00109.csv.bz2 diff --git a/inst/extdata/example/pedEx.rds b/inst/extdata/example/pedEx.rds deleted file mode 100644 index 59be2b527..000000000 Binary files a/inst/extdata/example/pedEx.rds and /dev/null differ diff --git a/inst/extdata/example/snpPileup/ex1.generic.txt.gz b/inst/extdata/example/snpPileup/ex1.generic.txt.gz new file mode 100644 index 000000000..48192f094 Binary files /dev/null and b/inst/extdata/example/snpPileup/ex1.generic.txt.gz differ diff --git a/inst/extdata/example/snpPileup/ex1.vcf.gz b/inst/extdata/example/snpPileup/ex1.vcf.gz new file mode 100644 index 000000000..55101e091 Binary files /dev/null and b/inst/extdata/example/snpPileup/ex1.vcf.gz differ diff --git a/inst/extdata/example/snpPileupRNA/ex1.generic.txt.gz b/inst/extdata/example/snpPileupRNA/ex1.generic.txt.gz new file mode 100644 index 000000000..48192f094 Binary files /dev/null and b/inst/extdata/example/snpPileupRNA/ex1.generic.txt.gz differ diff --git a/inst/extdata/example/snpPileupRNA/ex1.txt.gz b/inst/extdata/example/snpPileupRNA/ex1.txt.gz new file mode 100644 index 000000000..4e729bb8a Binary files /dev/null and b/inst/extdata/example/snpPileupRNA/ex1.txt.gz differ diff --git a/inst/extdata/example/snpPileupRNA/ex1.vcf.gz b/inst/extdata/example/snpPileupRNA/ex1.vcf.gz new file mode 100644 index 000000000..55101e091 Binary files /dev/null and b/inst/extdata/example/snpPileupRNA/ex1.vcf.gz differ diff --git a/inst/extdata/gds1KG.gds b/inst/extdata/gds1KG.gds deleted file mode 100644 index 338fb0240..000000000 Binary files a/inst/extdata/gds1KG.gds and /dev/null differ diff --git a/inst/extdata/gdsAnnot1KG.gds b/inst/extdata/gdsAnnot1KG.gds deleted file mode 100644 index 920a27d19..000000000 Binary files a/inst/extdata/gdsAnnot1KG.gds and /dev/null differ diff --git a/inst/extdata/listSNPIndexes_Demo.rds b/inst/extdata/listSNPIndexes_Demo.rds index 4cf5165ae..67cdf6700 100644 Binary files a/inst/extdata/listSNPIndexes_Demo.rds and b/inst/extdata/listSNPIndexes_Demo.rds differ diff --git a/inst/extdata/mapSNVSelected_Demo.rds b/inst/extdata/mapSNVSelected_Demo.rds index 243c71b63..6a5c3b538 100644 Binary files a/inst/extdata/mapSNVSelected_Demo.rds and b/inst/extdata/mapSNVSelected_Demo.rds differ diff --git a/inst/extdata/matFreqSNV_Demo.txt.bz2 b/inst/extdata/matFreqSNV_Demo.txt.bz2 index c30dd4f8c..11ed7d002 100644 Binary files a/inst/extdata/matFreqSNV_Demo.txt.bz2 and b/inst/extdata/matFreqSNV_Demo.txt.bz2 differ diff --git a/inst/extdata/tests/1KG_Test_02.gds b/inst/extdata/tests/1KG_Test_02.gds deleted file mode 100644 index 338fb0240..000000000 Binary files a/inst/extdata/tests/1KG_Test_02.gds and /dev/null differ diff --git a/inst/extdata/tests/Sample_Info_Test.RDS b/inst/extdata/tests/Sample_Info_Test.RDS deleted file mode 100644 index 7ba00d9de..000000000 Binary files a/inst/extdata/tests/Sample_Info_Test.RDS and /dev/null differ diff --git a/inst/extdata/tests/ex1_good_small_1KG_Annot_GDS.gds b/inst/extdata/tests/ex1_NoBlockGene.1KG_Annot_GDS.gds similarity index 93% rename from inst/extdata/tests/ex1_good_small_1KG_Annot_GDS.gds rename to inst/extdata/tests/ex1_NoBlockGene.1KG_Annot_GDS.gds index e977787ff..1f8a29af6 100644 Binary files a/inst/extdata/tests/ex1_good_small_1KG_Annot_GDS.gds and b/inst/extdata/tests/ex1_NoBlockGene.1KG_Annot_GDS.gds differ diff --git a/inst/extdata/example/gdsRef/ex1kg.gds b/inst/extdata/tests/ex1_good_small_1KG.gds similarity index 100% rename from inst/extdata/example/gdsRef/ex1kg.gds rename to inst/extdata/tests/ex1_good_small_1KG.gds diff --git a/inst/extdata/example/gdsRef/exAnnot1kg.gds b/inst/extdata/tests/ex1_good_small_1KG_Annot.gds similarity index 100% rename from inst/extdata/example/gdsRef/exAnnot1kg.gds rename to inst/extdata/tests/ex1_good_small_1KG_Annot.gds diff --git a/inst/extdata/tests/ex1_good_small_1KG_GDS.gds b/inst/extdata/tests/ex1_good_small_1KG_GDS.gds deleted file mode 100644 index b3d6717b1..000000000 Binary files a/inst/extdata/tests/ex1_good_small_1KG_GDS.gds and /dev/null differ diff --git a/inst/extdata/unrelatedPatientsInfo_Demo.rds b/inst/extdata/unrelatedPatientsInfo_Demo.rds index be2027454..3543aba0e 100644 Binary files a/inst/extdata/unrelatedPatientsInfo_Demo.rds and b/inst/extdata/unrelatedPatientsInfo_Demo.rds differ diff --git a/man/RAIDS-package.Rd b/man/RAIDS-package.Rd index 78fdd67a8..d670a1307 100644 --- a/man/RAIDS-package.Rd +++ b/man/RAIDS-package.Rd @@ -35,34 +35,15 @@ Cancer Res 1 January 2023; 83 (1): 49–58. https://doi.org/10.1158/0008-5472.CAN-22-0682 } \seealso{ -\itemize{ - \item \code{\link{prepPed1KG}} {This function extracts the - needed information from the 1000 Genomes pedigree file and formats it - into a \code{data.frame} so in can be used in following steps - of the ancestry inference process.} - \item \code{\link{generateMapSnvSel}} {The function applies a cut-off - filter to the SNP information file to retain only the SNP that have a - frequency superior or equal to the specified cut-off in at least one - super population.} - \item \code{\link{generateGDS1KG}} {This function generates the GDS - file that will contain the information from 1KG. } - \item \code{\link{identifyRelative}} {The function identify patients - that are genetically related in the 1KG GDS file. } - \item \code{\link{addRef2GDS1KG}} { This function adds the information - about the unrelated patients to the 1KG GDS file. } - \item \code{\link{add1KG2SampleGDS}} { This function adds the genotype - information for the list of pruned SNVs into the Profile GDS file } - \item \code{\link{appendStudy2GDS1KG}} { This function creates the - Sample GDS file(s) for one or multiple specific samples - using the information from a Sample RDS description file and the 1KG - GDS file. } - \item \code{\link{estimateAllelicFraction}} { This function estimates - the allelic fraction of the pruned SNVs for a specific sample and add - the information to the associated GDS Sample file. The allelic fraction - estimation method is adapted to the type of study (DNA or RNA). } - \item \code{\link{computeSyntheticROC}} { This function calculate the - AUROC of the inferences for specific values of D and K using the - inferred ancestry results from the synthetic profiles.} +\describe{ +\item{\code{\link{runExomeAncestry}}}{This function runs most steps +leading to the ancestry inference call on a specific exome profile.} +\item{\code{\link{runExomeAncestry}}}{This function runs most steps +leading to the ancestry inference call on a specific RNA profile.} +\item{\code{\link{createAccuracyGraph}}}{The function extracts the +required information from an output generated by RAIDS to create a +graphic representation of the accuracy for different values of +PCA dimensions and K-neighbors through all tested ancestries.} } } \author{ @@ -70,6 +51,7 @@ Pascal Belleau, Astrid Deschênes and Alexander Krasnitz Maintainer: -Pascal Belleau +Pascal Belleau \href{mailto:pascal_belleau@hotmail.com}{pascal_belleau@hotmail.com} } +\keyword{internal} \keyword{package} diff --git a/man/add1KG2SampleGDS.Rd b/man/add1KG2SampleGDS.Rd index a1b526309..bc1b0c1dc 100644 --- a/man/add1KG2SampleGDS.Rd +++ b/man/add1KG2SampleGDS.Rd @@ -27,16 +27,18 @@ The function returns \code{0L} when successful. \description{ The function extracts the information about the pruned SNVs from the 1KG GDS file and adds entries related to the pruned SNVs in -the Profile GDS file. +the Profile GDS file. The nodes are added to the Profile GDS file: +'sample.id', 'snp.id', 'snp.chromosome', 'snp.position', 'snp.index', +'genotype' and 'lap'. } \examples{ ## Required library for GDS -library(gdsfmt) +library(SNPRelate) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata/tests", package="RAIDS") -fileGDS <- file.path(dataDir, "ex1_good_small_1KG_GDS.gds") +fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") ## The data.frame containing the information about the study ## The 3 mandatory columns: "studyID", "study.desc", "study.platform" @@ -46,39 +48,37 @@ studyDF <- data.frame(study.id="MYDATA", study.platform="PLATFORM", stringsAsFactors=FALSE) -## Copy the Profile GDS file demo that has been pruned -## into a test directory (deleted after the example has been run) -dataDirGenotyping <- file.path(system.file("extdata", package="RAIDS"), - "demoAddGenotype") -dir.create(dataDirGenotyping, showWarnings=FALSE, - recursive=FALSE, mode="0777") +## Temporary Profile file +fileProfile <- file.path(tempdir(), "ex2.gds") + +## Copy required file file.copy(file.path(dataDir, "ex1_demo_with_pruning.gds"), - file.path(dataDirGenotyping, "ex1.gds")) + fileProfile) ## Open 1KG file gds1KG <- snpgdsOpen(fileGDS) ## Compute the list of pruned SNVs for a specific profile 'ex1' -## and save it in the Profile GDS file 'ex1.gds' +## and save it in the Profile GDS file 'ex2.gds' add1KG2SampleGDS(gdsReference=gds1KG, - fileProfileGDS=file.path(dataDirGenotyping, "ex1.gds"), - currentProfile=c("ex1"), - studyID=studyDF$study.id) + fileProfileGDS=fileProfile, + currentProfile=c("ex1"), + studyID=studyDF$study.id) -## Close the 1KG GDS file (it is important to always close the GDS files) +## Close the 1KG GDS file (important) closefn.gds(gds1KG) ## Check content of Profile GDS file ## The 'pruned.study' entry should be present -content <- openfn.gds(file.path(dataDirGenotyping, "ex1.gds")) +content <- openfn.gds(fileProfile) content -## Close the Profile GDS file (it is important to always close the GDS files) +## Close the Profile GDS file (important) closefn.gds(content) -## Unlink Profile GDS file (created for demo purpose) -unlink(file.path(dataDirGenotyping, "ex1.gds")) -unlink(dataDirGenotyping) +## Remove Profile GDS file (created for demo purpose) +unlink(fileProfile, force=TRUE) + } \author{ diff --git a/man/addBlockFromDetFile.Rd b/man/addBlockFromDetFile.Rd new file mode 100644 index 000000000..ce1bb05f9 --- /dev/null +++ b/man/addBlockFromDetFile.Rd @@ -0,0 +1,105 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/process1KG.R +\encoding{UTF-8} +\name{addBlockFromDetFile} +\alias{addBlockFromDetFile} +\title{Append information associated to ld blocks, as indexes, into the +Population Reference SNV Annotation GDS file} +\usage{ +addBlockFromDetFile( + fileReferenceGDS, + gdsRefAnnotFile, + pathBlock, + superPop, + blockName = "ldBlock", + blockDesc = "Not Define", + verbose = FALSE +) +} +\arguments{ +\item{fileReferenceGDS}{a \code{character} string representing the file +name of the Reference GDS file. The file must exist.} + +\item{gdsRefAnnotFile}{a \code{character} string representing the +file name corresponding the Reference SNV +Annotation GDS file. The function will +open it in write mode and close it after. The file must exist.} + +\item{pathBlock}{a \code{character} string representing the directory +where all the output file det from the plink block command are located. +The directory must not include other file with the extension \'.det\'. +The name of the \'.det\' must include the super-population between \'.\' +and the chromosome in the form \'chrNumber.\' \( \'chr1.\'\).} + +\item{superPop}{a \code{character} string representing the super population.} + +\item{blockName}{a \code{character} string representing the id of the block. +The blockName should not exist in \'gdsRefAnnotFile\'. +Default: \code{"ldBlock"}.} + +\item{blockDesc}{a \code{character} string representing the description of +the block. +Default: \code{"Not Define"}} + +\item{verbose}{a \code{logical} indicating if message information should be +printed. Default: \code{FALSE}.} +} +\value{ +\code{OL} when the function is successful. +} +\description{ +The function appends the information about the ld blocks into +the Population Reference SNV Annotation GDS file. The information is +extracted from the Population Reference GDS file and files \'.det\'. +} +\details{ +More information about GDS file format can be found at the Bioconductor +gdsfmt website: +https://bioconductor.org/packages/gdsfmt/ +} +\examples{ + +## Path to the demo pedigree file is located in this package +dataDir <- system.file("extdata", package="RAIDS") + +fileAnnotGDS <- file.path(tempdir(), "ex1_good_small_1KG_Ann_GDS.gds") + +## Demo of of output file det from the plink block +## command for chromosome 1 +fileLdBlock <- file.path(dirname(fileAnnotGDS), "block.sp.EUR.Ex.chr1.blocks.det") + + +file.copy(file.path(dataDir, "tests", + "ex1_NoBlockGene.1KG_Annot_GDS.gds"), fileAnnotGDS) +file.copy(file.path(dataDir, "block.sp.EUR.Ex.chr1.blocks.det"), + fileLdBlock) + + + +## GDS Reference file +fileReferenceGDS <- file.path(dataDir, "tests", + "ex1_good_small_1KG.gds") + + \donttest{ + + + ## Append information associated to blocks + addBlockFromDetFile(fileReferenceGDS=fileReferenceGDS, + gdsRefAnnotFile=fileAnnotGDS, + pathBlock=dirname(fileAnnotGDS), + superPop="EUR") + + gdsAnnot1KG <- openfn.gds(fileAnnotGDS) + print(gdsAnnot1KG) + + closefn.gds(gdsAnnot1KG) +} + +## Remove temporary file +unlink(fileAnnotGDS, force=TRUE) +unlink(fileLdBlock, force=TRUE) + +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} diff --git a/man/addBlockFromPlink2GDS.Rd b/man/addBlockFromPlink2GDS.Rd deleted file mode 100644 index e922be722..000000000 --- a/man/addBlockFromPlink2GDS.Rd +++ /dev/null @@ -1,53 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/process1KG.R -\encoding{UTF-8} -\name{addBlockFromPlink2GDS} -\alias{addBlockFromPlink2GDS} -\title{TODO contain the information from 1KG} -\usage{ -addBlockFromPlink2GDS( - gds, - gdsOut, - PATHBLOCK, - superPop, - blockName, - blockDesc, - verbose = FALSE -) -} -\arguments{ -\item{gds}{an object of class -\link[gdsfmt]{gds.class} (a GDS file), TODO} - -\item{gdsOut}{an object of class \code{gds} in writing} - -\item{PATHBLOCK}{TODO} - -\item{superPop}{TODO} - -\item{blockName}{TODO} - -\item{blockDesc}{TODO} - -\item{verbose}{a \code{logical} indicating if message information should be -printed. Default: \code{FALSE}.} -} -\value{ -\code{OL} when the function is successful. -} -\description{ -TODO -} -\details{ -More information about GDS file format can be found at the Bioconductor -gdsfmt website: -https://bioconductor.org/packages/gdsfmt/ -} -\examples{ - -# TODO - -} -\author{ -Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -} diff --git a/man/addBlockInGDSAnnot.Rd b/man/addBlockInGDSAnnot.Rd new file mode 100644 index 000000000..36479545a --- /dev/null +++ b/man/addBlockInGDSAnnot.Rd @@ -0,0 +1,76 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gdsWrapper.R +\encoding{UTF-8} +\name{addBlockInGDSAnnot} +\alias{addBlockInGDSAnnot} +\title{Add block information in a Population Reference GDS Annotation file} +\usage{ +addBlockInGDSAnnot(gds, listBlock, blockName, blockDesc) +} +\arguments{ +\item{gds}{an object of class \code{gds} opened in writing mode.} + +\item{listBlock}{a \code{array} of \code{integer} representing all the +entries for the current block.} + +\item{blockName}{a \code{character} string representing the unique +block name.} + +\item{blockDesc}{a \code{character} string representing the description of +the current block.} +} +\value{ +The integer \code{0L} when successful. +} +\description{ +This function appends the information for one specific type +of blocks into a Population Reference GDS Annotation file. More +specifically, the node 'block.annot' is created if it does not exists. This +node contains a \code{data.frame} which will be append the description of +the current block. The node 'block' is also created if it does not exists. +This node is a \code{matrix} that will contain all the entries for the +current block. All the values for a specific block type are contained in a +single column that corresponds to the row number in the 'block.annot' node. +} +\examples{ + + +## Required library +library(gdsfmt) + +## Temporary GDS Annotation file in current directory +gdsFilePath <- file.path(tempdir(), "GDS_TEMP_Annot_14.gds") + +## Create and open the GDS file +GDS_file_tmp <- createfn.gds(filename=gdsFilePath) + +## One block +blockType <- "EAS.0.05.500k" + +## The description of the block +blockDescription <- "EAS population blocks based on 500k windows" + +## The values for each entry related to the block (integers) +blockEntries <- c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3) + +RAIDS:::addBlockInGDSAnnot(gds=GDS_file_tmp, listBlock=blockEntries, + blockName=blockType, blockDesc=blockDescription) + +## Read 'block.annot' node +read.gdsn(index.gdsn(GDS_file_tmp, "block.annot")) + +## Read 'block' node +read.gdsn(index.gdsn(GDS_file_tmp, "block")) + +## Close GDS file +closefn.gds(gdsfile=GDS_file_tmp) + +## Delete the temporary GDS file +unlink(x=gdsFilePath, force=TRUE) + + +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} +\keyword{internal} diff --git a/man/addGDS1KGLDBlock.Rd b/man/addGDS1KGLDBlock.Rd index f9eff3958..d76efe91b 100644 --- a/man/addGDS1KGLDBlock.Rd +++ b/man/addGDS1KGLDBlock.Rd @@ -1,31 +1,74 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gdsWrapper.R +% Please edit documentation in R/gdsWrapper_internal.R \encoding{UTF-8} \name{addGDS1KGLDBlock} \alias{addGDS1KGLDBlock} -\title{TODO} +\title{Append information associated to ld blocks, as indexes, into the +Population Reference SNV Annotation GDS file} \usage{ addGDS1KGLDBlock(gds, listBlock, blockName, blockDesc) } \arguments{ -\item{gds}{an object of class \code{gds} opened for the sample} +\item{gds}{an object of class \link[gdsfmt]{gds.class} +(GDS file), an opened Reference Annotation GDS file.} -\item{listBlock}{TODO} +\item{listBlock}{a \code{array} of integer +representing the linkage disequilibrium block for +each SNV in the in the same order than the variant +in Population reference dataset.} -\item{blockName}{TODO} +\item{blockName}{a \code{character} string representing the id of the block. +The blockName should not exist in \'gdsRefAnnotFile\'.} -\item{blockDesc}{TODO} +\item{blockDesc}{a \code{character} string representing the description of +the block.} } \value{ The integer \code{0L} when successful. } \description{ -TODO +The function appends the information about the ld blocks into +the Population Reference SNV Annotation GDS file. The information is +extracted from the parameter listBlock. } \examples{ -# TODO -gds <- "Demo GDS TODO" +## Required library for GDS +library(gdsfmt) +## Path to the demo pedigree file is located in this package +dataDir <- system.file("extdata", package="RAIDS") + +fileAnnotGDS <- file.path(tempdir(), "ex1_good_small_1KG_Ann_GDS.gds") + + +file.copy(file.path(dataDir, "tests", + "ex1_NoBlockGene.1KG_Annot_GDS.gds"), fileAnnotGDS) + + +fileReferenceGDS <- file.path(dataDir, "tests", + "ex1_good_small_1KG.gds") + \donttest{ + gdsRef <- openfn.gds(fileReferenceGDS) + listBlock <- read.gdsn(index.gdsn(gdsRef, "snp.position")) + listBlock <- rep(-1, length(listBlock)) + closefn.gds(gdsRef) + gdsAnnot1KG <- openfn.gds(fileAnnotGDS, readonly=FALSE) + ## Append information associated to blocks + RAIDS:::addGDS1KGLDBlock(gds=gdsAnnot1KG, + listBlock=listBlock, + blockName="blockEmpty", + blockDesc="Example") + + closefn.gds(gdsAnnot1KG) + + gdsAnnot1KG <- openfn.gds(fileAnnotGDS) + print(gdsAnnot1KG) + + closefn.gds(gdsAnnot1KG) +} + +## Remove temporary file +unlink(fileAnnotGDS, force=TRUE) } \author{ diff --git a/man/addGDSRef.Rd b/man/addGDSRef.Rd index c976e966e..71a905324 100644 --- a/man/addGDSRef.Rd +++ b/man/addGDSRef.Rd @@ -33,13 +33,12 @@ that contains the information about the unrelated reference samples. ## Required library library(gdsfmt) -## Locate RDS with unrelated/related status for 1KG samples +## Locate RDS with unrelated/related status for Reference samples dataDir <- system.file("extdata", package="RAIDS") rdsFilePath <- file.path(dataDir, "unrelatedPatientsInfo_Demo.rds") -## Create a temporary GDS file in an test directory -dataDir <- system.file("extdata/tests", package="RAIDS") -gdsFilePath <- file.path(dataDir, "GDS_TEMP_11.gds") +## Temporary GDS file +gdsFilePath <- file.path(tempdir(), "GDS_TEMP_11.gds") ## Create and open the GDS file tmpGDS <- createfn.gds(filename=gdsFilePath) @@ -47,7 +46,7 @@ tmpGDS <- createfn.gds(filename=gdsFilePath) sampleIDs <- c("HG00104", "HG00109", "HG00110") add.gdsn(node=tmpGDS, name="sample.id", val=sampleIDs) -## Create "sample.ref" node in GDS file using RDS information +## Create "sample.ref" node in GDS file using RDS information RAIDS:::addGDSRef(gdsReference=tmpGDS, filePart=rdsFilePath) ## Read sample reference data.frame @@ -59,6 +58,7 @@ closefn.gds(gdsfile=tmpGDS) ## Delete the temporary GDS file unlink(x=gdsFilePath, force=TRUE) + } \author{ Pascal Belleau, Astrid Deschênes and Alexander Krasnitz diff --git a/man/addGDSStudyPruning.Rd b/man/addGDSStudyPruning.Rd index 4f33484aa..e00bd666c 100644 --- a/man/addGDSStudyPruning.Rd +++ b/man/addGDSStudyPruning.Rd @@ -30,8 +30,7 @@ deleted and a new entry is created. library(gdsfmt) ## Create a temporary GDS file in an test directory -dataDir <- system.file("extdata/tests", package="RAIDS") -gdsFilePath <- file.path(dataDir, "GDS_TEMP_1.gds") +gdsFilePath <- file.path(tempdir(), "GDS_TEMP_1.gds") ## Create and open the GDS file tmpGDS <- createfn.gds(filename=gdsFilePath) @@ -51,6 +50,7 @@ closefn.gds(gdsfile=tmpGDS) ## Delete the temporary GDS file unlink(x=gdsFilePath, force=TRUE) + } \author{ Pascal Belleau, Astrid Deschênes and Alexander Krasnitz diff --git a/man/addGeneBlockGDSRefAnnot.Rd b/man/addGeneBlockGDSRefAnnot.Rd index b5113fc4e..c8cd79029 100644 --- a/man/addGeneBlockGDSRefAnnot.Rd +++ b/man/addGeneBlockGDSRefAnnot.Rd @@ -3,23 +3,23 @@ \encoding{UTF-8} \name{addGeneBlockGDSRefAnnot} \alias{addGeneBlockGDSRefAnnot} -\title{Generate two indexes based on gene annotation for gdsAnnot1KG -block and add the indexes into the -gdsAnnot1KG} +\title{Append information associated to blocks, as indexes, into the +Population Reference SNV Annotation GDS file} \usage{ addGeneBlockGDSRefAnnot( gdsReference, - file.gdsRefAnnot, + gdsRefAnnotFile, winSize = 10000, - EnsDb, - suffixe.blockName + ensDb, + suffixBlockName ) } \arguments{ \item{gdsReference}{an object of class -\link[gdsfmt]{gds.class} (a GDS file), the opened 1KG GDS file.} +\link[gdsfmt]{gds.class} (a GDS file), the opened Reference GDS file.} -\item{file.gdsRefAnnot}{the filename corresponding the 1KG SNV +\item{gdsRefAnnotFile}{a \code{character} string representing the +file name corresponding the Reference SNV Annotation GDS file. The function will open it in write mode and close it after. The file must exist.} @@ -27,23 +27,67 @@ open it in write mode and close it after. The file must exist.} size of the window to use to group the SNVs when the SNVs are in a non-coding region. Default: \code{10000L}.} -\item{EnsDb}{An object with the ensembl genome annotation +\item{ensDb}{An object with the ensembl genome annotation Default: \code{EnsDb.Hsapiens.v86}.} -\item{suffixe.blockName}{TODO ex Ensembl.Hsapiens.v86} +\item{suffixBlockName}{a \code{character} string that identify the source +of the block and that will be added to the block description into +the Reference SNV Annotation GDS file, as example: Ensembl.Hsapiens.v86.} } \value{ The integer \code{OL} when the function is successful. } \description{ -TODO +The function appends the information about the blocks into +the Population Reference SNV Annotation GDS file. The information is +extracted from the Population Reference GDS file. } \examples{ +## Required library +library(SNPRelate) + ## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") -## TODO +fileAnnotGDS <- file.path(tempdir(), "ex1_good_small_1KG_Ann_GDS.gds") + +## Required library +if (requireNamespace("EnsDb.Hsapiens.v86", quietly=TRUE)) { + + file.copy(file.path(dataDir, "tests", + "ex1_NoBlockGene.1KG_Annot_GDS.gds"), fileAnnotGDS) + + ## Making a "short cut" on the ensDb object + edb <- EnsDb.Hsapiens.v86::EnsDb.Hsapiens.v86 + + ## GDS Reference file + fileReferenceGDS <- file.path(dataDir, "tests", + "ex1_good_small_1KG.gds") + + \donttest{ + ## Open the reference GDS file (demo version) + gds1KG <- snpgdsOpen(fileReferenceGDS) + + ## Append information associated to blocks + addGeneBlockGDSRefAnnot(gdsReference=gds1KG, + gdsRefAnnotFile=fileAnnotGDS, + ensDb=edb, + suffixBlockName="EnsDb.Hsapiens.v86") + + gdsAnnot1KG <- openfn.gds(fileAnnotGDS) + print(gdsAnnot1KG) + print(read.gdsn(index.gdsn(gdsAnnot1KG, "block.annot"))) + + ## Close GDS files + closefn.gds(gds1KG) + closefn.gds(gdsAnnot1KG) + } + + ## Remove temporary file + unlink(fileAnnotGDS, force=TRUE) + +} } \author{ diff --git a/man/addGeneBlockRefAnnot.Rd b/man/addGeneBlockRefAnnot.Rd new file mode 100644 index 000000000..b12d09f49 --- /dev/null +++ b/man/addGeneBlockRefAnnot.Rd @@ -0,0 +1,89 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/process1KG.R +\encoding{UTF-8} +\name{addGeneBlockRefAnnot} +\alias{addGeneBlockRefAnnot} +\title{Append information associated to blocks, as indexes, into the +Population Reference SNV Annotation GDS file} +\usage{ +addGeneBlockRefAnnot( + fileReferenceGDS, + gdsRefAnnotFile, + winSize = 10000, + ensDb, + suffixBlockName +) +} +\arguments{ +\item{fileReferenceGDS}{a \code{character} string representing the file +name of the Reference GDS file. The file must exist.} + +\item{gdsRefAnnotFile}{a \code{character} string representing the +file name corresponding the Reference SNV +Annotation GDS file. The function will +open it in write mode and close it after. The file must exist.} + +\item{winSize}{a single positive \code{integer} representing the +size of the window to use to group the SNVs when the SNVs are in a +non-coding region. Default: \code{10000L}.} + +\item{ensDb}{An object with the ensembl genome annotation +Default: \code{EnsDb.Hsapiens.v86}.} + +\item{suffixBlockName}{a \code{character} string that identify the source +of the block and that will be added to the block description into +the Reference SNV Annotation GDS file, as example: Ensembl.Hsapiens.v86.} +} +\value{ +The integer \code{OL} when the function is successful. +} +\description{ +The function appends the information about the blocks into +the Population Reference SNV Annotation GDS file. The information is +extracted from the Population Reference GDS file. +} +\examples{ + +## Path to the demo pedigree file is located in this package +dataDir <- system.file("extdata", package="RAIDS") + +fileAnnotGDS <- file.path(tempdir(), "ex1_good_small_1KG_Ann_GDS.gds") + +## Required library +if (requireNamespace("EnsDb.Hsapiens.v86", quietly=TRUE)) { + + file.copy(file.path(dataDir, "tests", + "ex1_NoBlockGene.1KG_Annot_GDS.gds"), fileAnnotGDS) + + ## Making a "short cut" on the ensDb object + edb <- EnsDb.Hsapiens.v86::EnsDb.Hsapiens.v86 + + ## GDS Reference file + fileReferenceGDS <- file.path(dataDir, "tests", + "ex1_good_small_1KG.gds") + + \donttest{ + + + ## Append information associated to blocks + addGeneBlockRefAnnot(fileReferenceGDS=fileReferenceGDS, + gdsRefAnnotFile=fileAnnotGDS, + ensDb=edb, + suffixBlockName="EnsDb.Hsapiens.v86") + + gdsAnnot1KG <- openfn.gds(fileAnnotGDS) + print(gdsAnnot1KG) + print(read.gdsn(index.gdsn(gdsAnnot1KG, "block.annot"))) + + closefn.gds(gdsAnnot1KG) + } + + ## Remove temporary file + unlink(fileAnnotGDS, force=TRUE) + +} + +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} diff --git a/man/addPhase1KG2SampleGDSFromFile.Rd b/man/addPhase1KG2SampleGDSFromFile.Rd deleted file mode 100644 index ee339f644..000000000 --- a/man/addPhase1KG2SampleGDSFromFile.Rd +++ /dev/null @@ -1,48 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/processStudy.R -\encoding{UTF-8} -\name{addPhase1KG2SampleGDSFromFile} -\alias{addPhase1KG2SampleGDSFromFile} -\title{TODO} -\usage{ -addPhase1KG2SampleGDSFromFile( - gdsReference, - pathProfileGDS, - pathGeno, - fileSNPsRDS, - verbose = FALSE -) -} -\arguments{ -\item{gdsReference}{an object of class -\link[gdsfmt]{gds.class} (a GDS file), the opened 1KG GDS file.} - -\item{pathProfileGDS}{a \code{character} string representing the path to -the directory that contains the Profile GDS files. The directory must -exist.} - -\item{pathGeno}{a \code{character} string representing the path to -the directory that contains TODO} - -\item{fileSNPsRDS}{TODO} - -\item{verbose}{a \code{logical} indicating if message information should be -printed. Default: \code{FALSE}.} -} -\value{ -The integer \code{0L} when successful. -} -\description{ -TODO -} -\examples{ - -## Path to the demo pedigree file is located in this package -dataDir <- system.file("extdata", 'RAIDS') - -## TODO - -} -\author{ -Pascal Belleau, Astrid Deschênes and Alex Krasnitz -} diff --git a/man/addPhase1KG2SampleGDSFromGDS.Rd b/man/addPhase1KG2SampleGDSFromGDS.Rd deleted file mode 100644 index d86163380..000000000 --- a/man/addPhase1KG2SampleGDSFromGDS.Rd +++ /dev/null @@ -1,44 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/processStudy.R -\encoding{UTF-8} -\name{addPhase1KG2SampleGDSFromGDS} -\alias{addPhase1KG2SampleGDSFromGDS} -\title{TODO} -\usage{ -addPhase1KG2SampleGDSFromGDS( - gdsReference, - gdsPhase, - pathProfileGDS, - verbose = FALSE -) -} -\arguments{ -\item{gdsReference}{an object of class -\code{\link[SNPRelate:SNPGDSFileClass]{SNPRelate::SNPGDSFileClass}}, a SNP -GDS file.} - -\item{gdsPhase}{TODO} - -\item{pathProfileGDS}{the path of an object of class \code{gds} related to -the sample} - -\item{verbose}{a \code{logical} indicating if message information should be -printed. Default: \code{TRUE}.} -} -\value{ -The integer \code{0} when successful. -} -\description{ -TODO -} -\examples{ - -## Path to the demo pedigree file is located in this package -dataDir <- system.file("extdata", "RAIDS") - -## TODO - -} -\author{ -Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -} diff --git a/man/addRef2GDS1KG.Rd b/man/addRef2GDS1KG.Rd index 0199031e3..ede83943c 100644 --- a/man/addRef2GDS1KG.Rd +++ b/man/addRef2GDS1KG.Rd @@ -3,20 +3,21 @@ \encoding{UTF-8} \name{addRef2GDS1KG} \alias{addRef2GDS1KG} -\title{Add the information about the unrelated patients to the 1KG GDS file} +\title{Add the information about the unrelated patients to the Reference +GDS file} \usage{ addRef2GDS1KG(fileNameGDS, filePart) } \arguments{ \item{fileNameGDS}{a \code{character} string representing the path and file -name of the GDS file that contains the 1KG information. The 1KG GDS file -must contain the SNP information, the genotyping information and -the pedigree information from 1000 Genomes. +name of the GDS file that contains the Reference information. The +Reference GDS file must contain the SNP information, the genotyping +information and the pedigree information from Reference dataset. The extension of the file must be '.gds'.} \item{filePart}{a \code{character} string representing the path and file name of the RDS file that contains the -information about the 1KG patients that are unrelated. +information about the Reference patients that are unrelated. The extension of the file must be '.rds'. The file must exists.} } \value{ @@ -24,7 +25,7 @@ The integer \code{0L} when successful. } \description{ This function adds the information about the unrelated patients -to the 1KG GDS file. More specifically, it creates the field +to the Reference GDS file. More specifically, it creates the field \code{sample.ref} which as the value \code{1} when the sample is unrelated and the value \code{0} otherwise. The \code{sample.ref} is filled based on the information present in the diff --git a/man/addStudyGDSSample.Rd b/man/addStudyGDSSample.Rd index cd55bcd99..332227c4e 100644 --- a/man/addStudyGDSSample.Rd +++ b/man/addStudyGDSSample.Rd @@ -54,30 +54,29 @@ created and then, the information is added. ## Required library library(gdsfmt) -## Create a temporary GDS file in an test directory -dataDir <- system.file("extdata/tests", package="RAIDS") -gdsFilePath <- file.path(dataDir, "GDS_TEMP_11.gds") +## Create a temporary GDS file in an current directory +gdsFilePath <- file.path(tempdir(), "GDS_TEMP_11.gds") ## Create and open the GDS file tmpGDS <- createfn.gds(filename=gdsFilePath) ## Create a PED data frame with sample information ped1KG <- data.frame(Name.ID=c("1KG_sample_01", "1KG_sample_02"), - Case.ID=c("1KG_sample_01", "1KG_sample_02"), - Sample.Type=rep("Reference", 2), Diagnosis=rep("Reference", 2), - Source=rep("IGSR", 2), stringsAsFactors=FALSE) + Case.ID=c("1KG_sample_01", "1KG_sample_02"), + Sample.Type=rep("Reference", 2), Diagnosis=rep("Reference", 2), + Source=rep("IGSR", 2), stringsAsFactors=FALSE) ## Create a Study data frame with information about the study ## All samples are associated to the same study studyInfo <- data.frame(study.id="Ref.1KG", - study.desc="Unrelated samples from 1000 Genomes", - study.platform="GRCh38 1000 genotypes", - stringsAsFactors=FALSE) + study.desc="Unrelated samples from 1000 Genomes", + study.platform="GRCh38 1000 genotypes", + stringsAsFactors=FALSE) ## Add the sample information to the GDS Sample file ## The information for all samples is added (listSamples=NULL) RAIDS:::addStudyGDSSample(gdsProfile=tmpGDS, pedProfile=ped1KG, batch=1, - listSamples=NULL, studyDF=studyInfo, verbose=FALSE) + listSamples=NULL, studyDF=studyInfo, verbose=FALSE) ## Read study information from GDS Sample file read.gdsn(index.gdsn(node=tmpGDS, path="study.list")) @@ -91,6 +90,7 @@ closefn.gds(gdsfile=tmpGDS) ## Delete the temporary GDS file unlink(x=gdsFilePath, force=TRUE) + } \author{ Pascal Belleau, Astrid Deschênes and Alexander Krasnitz diff --git a/man/addUpdateLap.Rd b/man/addUpdateLap.Rd index 180523cc4..2b1174af8 100644 --- a/man/addUpdateLap.Rd +++ b/man/addUpdateLap.Rd @@ -33,9 +33,8 @@ already be present in the GDS file. ## Required library library(gdsfmt) -## Create a temporary GDS file in an test directory -dataDir <- system.file("extdata/tests", package="RAIDS") -gdsFilePath <- file.path(dataDir, "GDS_TEMP.gds") +## Create a temporary GDS file +gdsFilePath <- file.path(tempdir(), "GDS_TEMP.gds") ## Create and open the GDS file gdsFile <- createfn.gds(filename=gdsFilePath) @@ -59,6 +58,7 @@ closefn.gds(gdsfile=gdsFile) ## Delete the temporary GDS file unlink(x=gdsFilePath, force=TRUE) + } \author{ Pascal Belleau, Astrid Deschênes and Alexander Krasnitz diff --git a/man/addUpdateSegment.Rd b/man/addUpdateSegment.Rd index 4839a1f8b..e9444fa79 100644 --- a/man/addUpdateSegment.Rd +++ b/man/addUpdateSegment.Rd @@ -1,18 +1,18 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gdsWrapper.R +% Please edit documentation in R/gdsWrapper_internal.R \encoding{UTF-8} \name{addUpdateSegment} \alias{addUpdateSegment} \title{Add information related to segments associated to the SNV dataset for a specific sample into a GDS file} \usage{ -addUpdateSegment(gdsProfile, snp.seg) +addUpdateSegment(gdsProfile, snpSeg) } \arguments{ \item{gdsProfile}{an object of class \code{\link[gdsfmt]{gds.class}} (a GDS file), a GDS Sample file.} -\item{snp.seg}{a \code{vector} of \code{integer} representing the segment +\item{snpSeg}{a \code{vector} of \code{integer} representing the segment identifiers associated to each SNV selected for the specific sample. The length of the \code{vector} should correspond to the number of SNVs present in the "snp.id" entry of the GDS sample file.} @@ -28,9 +28,11 @@ already exists, the previous information is erased. } \examples{ -## Create a temporary GDS file in an test directory -dataDir <- system.file("extdata/tests", package="RAIDS") -gdsFilePath <- file.path(dataDir, "GDS_TEMP.gds") +## Required library +library(gdsfmt) + +## Temporary GDS file +gdsFilePath <- file.path(tempdir(), "GDS_TEMP.gds") ## Create and open the GDS file GDS_file_tmp <- createfn.gds(filename=gdsFilePath) @@ -39,7 +41,7 @@ GDS_file_tmp <- createfn.gds(filename=gdsFilePath) segments <- c(1L, 1L, 1L, 2L, 2L, 3L, 3L) ## Add segments to the GDS file -RAIDS:::addUpdateSegment(gdsProfile=GDS_file_tmp, snp.seg=segments) +RAIDS:::addUpdateSegment(gdsProfile=GDS_file_tmp, snpSeg=segments) ## Read segments information from GDS file read.gdsn(index.gdsn(node=GDS_file_tmp, path="segment")) @@ -50,6 +52,7 @@ closefn.gds(gdsfile=GDS_file_tmp) ## Delete the temporary GDS file unlink(x=gdsFilePath, force=TRUE) + } \author{ Pascal Belleau, Astrid Deschênes and Alexander Krasnitz diff --git a/man/appendGDSRefSample.Rd b/man/appendGDSRefSample.Rd index c0f67a718..3791bb95e 100644 --- a/man/appendGDSRefSample.Rd +++ b/man/appendGDSRefSample.Rd @@ -49,40 +49,41 @@ addStudyGDSSample() must be used. library(gdsfmt) ## Create a temporary GDS file in an test directory -dataDir <- system.file("extdata/tests", package="RAIDS") -gdsFilePath <- file.path(dataDir, "GDS_TEMP_03.gds") +gdsFilePath <- file.path(tempdir(), "GDS_TEMP_03.gds") ## Create and open the GDS file tmpGDS <- createfn.gds(filename=gdsFilePath) ## Create "sample.id" node (the node must be present) add.gdsn(node=tmpGDS, name="sample.id", val=c("sample_01", - "sample_02")) + "sample_02")) ## Create "sample.annot" node (the node must be present) add.gdsn(node=tmpGDS, name="sample.annot", val=data.frame( - Name.ID=c("sample_01", "sample_02"), - sex=c(1,1), # 1:Male 2: Female - pop.group=c("ACB", "ACB"), - superPop=c("AFR", "AFR"), - batch=c(1, 1), - stringsAsFactors=FALSE)) + Name.ID=c("sample_01", "sample_02"), + sex=c(1,1), # 1:Male 2: Female + pop.group=c("ACB", "ACB"), + superPop=c("AFR", "AFR"), + batch=c(1, 1), + stringsAsFactors=FALSE)) sync.gds(gdsfile=tmpGDS) ## Create a data.frame with information about samples -sample_info <- data.frame(Name.ID=c("sample_04", "sample_05", "sample_06"), - sex=c(1,2,1), # 1:Male 2: Female - pop.group=c("ACB", "ACB", "ACB"), - superPop=c("AFR", "AFR", "AFR"), - stringsAsFactors=FALSE) +sample_info <- data.frame(Name.ID=c("sample_04", "sample_05", + "sample_06"), + sex=c(1,2,1), # 1:Male 2: Female + pop.group=c("ACB", "ACB", "ACB"), + superPop=c("AFR", "AFR", "AFR"), + stringsAsFactors=FALSE) ## The row names must be the sample identifiers rownames(sample_info) <- sample_info$Name.ID ## Add information about 2 samples to the GDS file -RAIDS:::appendGDSRefSample(gdsReference=tmpGDS, dfPedReference=sample_info, - batch=2, listSamples=c("sample_04", "sample_06"), verbose=FALSE) +RAIDS:::appendGDSRefSample(gdsReference=tmpGDS, + dfPedReference=sample_info, + batch=2, listSamples=c("sample_04", "sample_06"), verbose=FALSE) ## Read sample identifier list ## Only "sample_04" and "sample_06" should have been added @@ -98,6 +99,7 @@ closefn.gds(gdsfile=tmpGDS) ## Delete the temporary GDS file unlink(x=gdsFilePath, force=TRUE) + } \author{ Pascal Belleau, Astrid Deschênes and Alexander Krasnitz diff --git a/man/appendGDSSampleOnly.Rd b/man/appendGDSSampleOnly.Rd index 0df21ce48..a561410e8 100644 --- a/man/appendGDSSampleOnly.Rd +++ b/man/appendGDSSampleOnly.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gdsWrapper.R +% Please edit documentation in R/gdsWrapper_internal.R \encoding{UTF-8} \name{appendGDSSampleOnly} \alias{appendGDSSampleOnly} @@ -11,7 +11,7 @@ appendGDSSampleOnly(gds, listSamples) \item{gds}{an object of class \link[gdsfmt]{gds.class} (a GDS file), the opened GDS file.} -\item{listSample}{a \code{vector} of \code{character} string representing +\item{listSamples}{a \code{vector} of \code{character} string representing the sample identifiers to be added to GDS file.} } \value{ @@ -23,22 +23,24 @@ This function append the sample identifiers into the } \examples{ -## Create a temporary GDS file in an test directory -dataDir <- system.file("extdata/tests", package="RAIDS") -gdsFilePath <- file.path(dataDir, "GDS_TEMP_04.gds") +## Required library +library(gdsfmt) + +## Temporary GDS file in current directory +gdsFilePath <- file.path(tempdir(), "GDS_TEMP_04.gds") ## Create and open the GDS file GDS_file_tmp <- createfn.gds(filename=gdsFilePath) ## Create "sample.id" node (the node must be present) add.gdsn(node=GDS_file_tmp, name="sample.id", val=c("sample_01", - "sample_02")) + "sample_02")) sync.gds(gdsfile=GDS_file_tmp) ## Add information about 2 samples to the GDS file RAIDS:::appendGDSSampleOnly(gds=GDS_file_tmp, - listSamples=c("sample_03", "sample_04")) + listSamples=c("sample_03", "sample_04")) ## Read sample identifier list ## Only "sample_03" and "sample_04" should have been added diff --git a/man/appendGDSgenotype.Rd b/man/appendGDSgenotype.Rd index 0de1e1f8e..0b94be5e7 100644 --- a/man/appendGDSgenotype.Rd +++ b/man/appendGDSgenotype.Rd @@ -3,33 +3,94 @@ \encoding{UTF-8} \name{appendGDSgenotype} \alias{appendGDSgenotype} -\title{This function append the field genotype in the gds file} +\title{Append information related to profile genotypes into a Population +Reference GDS file (associated node already present in the GDS)} \usage{ -appendGDSgenotype(gds, listSample, pathGeno, fileSNPsRDS, verbose = FALSE) +appendGDSgenotype(gds, listSample, pathGeno, fileSNPsRDS, verbose) } \arguments{ -\item{gds}{a \code{gds} object.} +\item{gds}{an object of class +\link[gdsfmt]{gds.class} (a GDS file), the opened Population Reference +GDS file.} -\item{pathGeno}{TODO a PATH to a directory with the a file for each -samples with the genotype.} +\item{listSample}{a \code{character} string representing the path and file +name of the RDS file that contains the indexes of the retained SNPs. The +file must exist. The file must be a RDS file.} -\item{fileSNPsRDS}{TODO list of SNP to keep in the file genotype} +\item{pathGeno}{a \code{character} string representing the path where +the reference genotyping files for each sample are located. The name of the +genotyping files must correspond to +the individual identification (Individual.ID) in the pedigree file.} -\item{verbose}{a \code{logical} indicating if the function must print -messages when running. Default: \code{FALSE}.} +\item{fileSNPsRDS}{a \code{character} string representing the path and file +name of the RDS file that contains the indexes of the retained SNPs. The +file must exist. The file must be a RDS file.} -\item{listSamples}{a \code{array} with the sample to keep} +\item{verbose}{a \code{logical} indicating if the function must print +messages when running.} } \value{ The integer \code{0} when successful. } \description{ -TODO +This function appends the genotype fields with the associated +information into the Population Reference GDS file for the selected +profiles. The associated node must already present in the GDS file. } \examples{ -# TODO -gds <- "Demo GDS TODO" +## Required library +library(gdsfmt) + +## Path to the demo pedigree file is located in this package +dataDir <- system.file("extdata", package="RAIDS") + +## Path where the demo genotype CSV files are located +pathGeno <- file.path(dataDir, "demoProfileGenotypes") + +## The RDS file containing the pedigree information +pedigreeFile <- file.path(dataDir, "PedigreeReferenceDemo.rds") + +## The RDS file containing the indexes of the retained SNPs +snpIndexFile <- file.path(dataDir, "listSNPIndexes_Demo.rds") + +## The RDS file containing the filtered SNP information +filterSNVFile <- file.path(dataDir, "mapSNVSelected_Demo.rds") + +## Temporary Reference GDS file +tempRefGDS <- file.path(tempdir(), "Ref_TEMP02.gds") + +## Create temporary Reference GDS file +newGDS <- createfn.gds(tempRefGDS) +put.attr.gdsn(newGDS$root, "FileFormat", "SNP_ARRAY") + +## Read the pedigree file +ped1KG <- readRDS(pedigreeFile) + +## Add information about samples to the Reference GDS file +listSampleGDS <- RAIDS:::generateGDSRefSample(gdsReference=newGDS, + dfPedReference=ped1KG, listSamples=NULL) + +## Add SNV information to the Reference GDS +RAIDS:::generateGDSSNPinfo(gdsReference=newGDS, fileFreq=filterSNVFile, + verbose=FALSE) + +## Add genotype information to the Reference GDS for the 3 first samples +RAIDS:::generateGDSgenotype(gds=newGDS, pathGeno=pathGeno, + fileSNPsRDS=snpIndexFile, listSamples=listSampleGDS[1:3], + verbose=FALSE) + +## Append genotype information to the Reference GDS for the other samples +RAIDS:::appendGDSgenotype(gds=newGDS, pathGeno=pathGeno, + fileSNPsRDS=snpIndexFile, + listSample=listSampleGDS[4:length(listSampleGDS)], + verbose=FALSE) + +## Close file +closefn.gds(newGDS) + +## Remove temporary files +unlink(tempRefGDS, force=TRUE) } \author{ diff --git a/man/appendGDSgenotypeMat.Rd b/man/appendGDSgenotypeMat.Rd index 6e82542fb..850edff77 100644 --- a/man/appendGDSgenotypeMat.Rd +++ b/man/appendGDSgenotypeMat.Rd @@ -36,9 +36,8 @@ correspond to the number of rows of the matrix present in the ## Required library library(gdsfmt) -## Create a temporary GDS file in an test directory -dataDir <- system.file("extdata/tests", package="RAIDS") -gdsFilePath <- file.path(dataDir, "GDS_TEMP_06.gds") +## Create a temporary GDS file +gdsFilePath <- file.path(tempdir(), "GDS_TEMP_06.gds") ## Create and open the GDS file tmpGDS <- createfn.gds(filename=gdsFilePath) @@ -66,6 +65,7 @@ closefn.gds(gdsfile=tmpGDS) ## Delete the temporary GDS file unlink(x=gdsFilePath, force=TRUE) + } \author{ Pascal Belleau, Astrid Deschênes and Alexander Krasnitz diff --git a/man/appendStudy2GDS1KG.Rd b/man/appendStudy2GDS1KG.Rd deleted file mode 100644 index 9dae8e382..000000000 --- a/man/appendStudy2GDS1KG.Rd +++ /dev/null @@ -1,103 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/processStudy.R -\encoding{UTF-8} -\name{appendStudy2GDS1KG} -\alias{appendStudy2GDS1KG} -\title{Create the GDS Sample file(s) for one or multiple specific samples -using the information from a RDS Sample description file and the 1KG -GDS file} -\usage{ -appendStudy2GDS1KG( - pathGeno = file.path("data", "sampleGeno"), - filePedRDS, - fileNameGDS, - batch = 1, - studyDF, - listSamples = NULL, - pathProfileGDS = NULL, - genoSource = c("snp-pileup", "generic"), - verbose = TRUE -) -} -\arguments{ -\item{pathGeno}{a \code{character} string representing the path to the -directory containing the output of SNP-pileup, a VCF Sample file, for -each sample. The -SNP-pileup files must be compressed (gz files) and have the name identifiers -of the samples. A sample with "Name.ID" identifier would have an -associated SNP-pileup file called "Name.ID.txt.gz".} - -\item{filePedRDS}{a \code{character} string representing the path to the -RDS file that contains the information about the sample to analyse. -The RDS file must -include a \code{data.frame} with those mandatory columns: "Name.ID", -"Case.ID", "Sample.Type", "Diagnosis", "Source". All columns must be in -\code{character} strings. The \code{data.frame} -must contain the information for all the samples passed in the -\code{listSamples} parameter.} - -\item{fileNameGDS}{a \code{character} string representing the file name of -the 1KG GDS file. The file must exist.} - -\item{batch}{a single positive \code{integer} representing the current -identifier for the batch. Beware, this field is not stored anymore. -Default: \code{1}.} - -\item{studyDF}{a \code{data.frame} containing the information about the -study associated to the analysed sample(s). The \code{data.frame} must have -those 3 columns: "study.id", "study.desc", "study.platform". All columns -must be in \code{character} strings.} - -\item{listSamples}{a \code{vector} of \code{character} string corresponding -to the sample identifiers that will have a GDS Sample file created. The -sample identifiers must be present in the "Name.ID" column of the RDS file -passed to the \code{filePedRDS} parameter. -If \code{NULL}, all samples in the \code{filePedRDS} are selected. -Default: \code{NULL}.} - -\item{pathProfileGDS}{a \code{character} string representing the path to -the directory where the GDS Sample files will be created. -Default: \code{NULL}.} - -\item{genoSource}{a \code{character} string with two possible values: -'snp-pileup' or 'generic'. It specifies if the genotype files -are generated by snp-pileup (Facets) or are a generic format CSV file -with at least those columns: -'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. -The 'Count' is the depth at the specified position; -'FileR' is the depth of the reference allele and -'File1A' is the depth of the specific alternative allele.} - -\item{verbose}{a \code{logical} indicating if message information should be -printed. Default: \code{TRUE}.} -} -\value{ -The function returns \code{0L} when successful. -} -\description{ -The function uses the information for the 1KG GDS file and the -RDS Sample Description file to create the GDS Sample file. One GDS Sample -file is created per sample. One GDS Sample file will be created for each -entry present in the \code{listSamples} parameter. -} -\examples{ - -## Path to the demo pedigree file is located in this package -dataDir <- system.file("extdata", package="RAIDS") - -## The data.frame containing the information about the study -## The 3 mandatory columns: "study.id", "study.desc", "study.platform" -## The entries should be strings, not factors (stringsAsFactors=FALSE) -studyInfo <- data.frame(study.id="Pancreatic.WES", - study.desc="Pancreatic study", - study.platform="WES", - stringsAsFactors=FALSE) - -## TODO -filePedRDS <- "TODO" - - -} -\author{ -Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -} diff --git a/man/basePCASample.Rd b/man/basePCASample.Rd deleted file mode 100644 index f55b52ddc..000000000 --- a/man/basePCASample.Rd +++ /dev/null @@ -1,58 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/process1KG.R -\encoding{UTF-8} -\name{basePCASample} -\alias{basePCASample} -\title{Compute principal component axes (PCA) on SNV data using the -reference samples} -\usage{ -basePCASample(gds, listSample.Ref = NULL, listSNP = NULL, np = 1L) -} -\arguments{ -\item{gds}{an object of class -\code{\link[SNPRelate:SNPGDSFileClass]{SNPRelate::SNPGDSFileClass}}, a SNP -GDS file.} - -\item{listSample.Ref}{a \code{vector} of \code{character} strings -corresponding to -the sample identifiers that will be used for the PCA.} - -\item{listSNP}{a \code{vector} of \code{character} strings representing -the SNV identifiers retained for the PCA.} - -\item{np}{a single positive \code{integer} representing the number of -threads. Default: \code{1L}.} -} -\value{ -a \code{list} with 3 entries: -\itemize{ -\item{SNP}{ a \code{vector} of \code{character} strings representing the -SNV identifiers used in the PCA.} -\item{pca.unrel}{ an object of class \code{snpgdsPCAClass} as generated -by the -\code{\link[SNPRelate:snpgdsPCA]{SNPRelate::snpgdsPCA}} function. } -\item{snp.load}{ an object of class \code{snpgdsPCASNPLoading} as generated -by the -\code{\link[SNPRelate:snpgdsPCASNPLoading]{SNPRelate::snpgdsPCASNPLoading}} -function. } -} -} -\description{ -The function runs a Principal Component Analysis (PCA) on -the SNv genotype data. The function also loads SNVs into the PCA to -calculate the SNV eigenvectors. Those 2 steps are done with the - \code{\link[SNPRelate]{snpgdsPCA}} and - \code{\link[SNPRelate]{snpgdsPCASNPLoading}} -functions. -} -\examples{ - -## Path to the demo pedigree file is located in this package -dataDir <- system.file("extdata", package="RAIDS") - -## TODO - -} -\author{ -Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -} diff --git a/man/calcAFMLRNA.Rd b/man/calcAFMLRNA.Rd index c9081599e..656eb4608 100644 --- a/man/calcAFMLRNA.Rd +++ b/man/calcAFMLRNA.Rd @@ -3,30 +3,59 @@ \encoding{UTF-8} \name{calcAFMLRNA} \alias{calcAFMLRNA} -\title{TODO} +\title{Compute the log likelihood ratio based on the coverage of +each allele in a specific block (gene in the case of RNA-seq)} \usage{ -calcAFMLRNA(snp.pos.Hetero) +calcAFMLRNA(snpPosHetero) } \arguments{ -\item{snp.pos.Hetero}{For a specific gene (block) a \code{data.frame} with -lap for the SNV heterozygote dataset with -coverage > \code{minCov}. The \code{data.frame} must contain those columns: -'phase', 'cnt.ref', 'cnt.alt'. TODO} +\item{snpPosHetero}{a \code{data.frame} +containing the SNV information for a specific block (gene if RNA-seq). +The \code{data.frame} must contain those columns: +\describe{ +\item{cnt.ref}{ a single \code{integer} representing the coverage for +the reference allele.} +\item{cnt.alt}{ a single \code{integer} representing the coverage for +the alternative allele.} +\item{phase}{ a single \code{integer} indicating the phase of the variant +if known, \code{3} if not known} +}} } \value{ -TODO a \code{list} of \code{numeric} for the gene lR the score -for aFraction different than 0.5 -aFraction allele estimation, nPhase number of SNV phase, -sumAlleleLow number of read overlapping the allele low -sumAlleleHigh number of read overlapping the allele high TODO +a \code{list} for the block with the information +relative to the heterozygotes. +The \code{list} contains: +\describe{ +\item{lR}{ a single \code{numeric} representing the sum of the log of +read depth of the lowest depth divide by the total depth of the position +minus of likelihood of the allelic fraction of 0.5.} +\item{aFraction}{ a single \code{numeric} representing the allele +fraction estimation.} +\item{sumAlleleLow}{ a \code{integer} representing the +sum of the allele read depth of the lowest read allele depth} +\item{sumAlleleHigh}{ a \code{integer} representing the +sum of the allele read depth +of the highest read allele depth} +} } \description{ -TODO +This function sums the log of read depth of the lowest depth +divide by the total depth of the position minus of likelihood of the allelic +fraction of 0.5 for a block. If the phase is known, the SNVs in the same +haplotype are grouped together. } \examples{ -# TODO -gds <- "Demo GDS TODO" +## Loading demo dataset containing SNV information +data(snpPositionDemo) + +## Only use a subset of heterozygote SNVs related to one block +subset <- snpPositionDemo[which(snpPositionDemo$block.id == 2750 & + snpPositionDemo$hetero), c("cnt.ref", "cnt.alt", "phase")] + +result <- RAIDS:::calcAFMLRNA(subset) + +head(result) } \author{ diff --git a/man/computeAlleleFraction.Rd b/man/computeAlleleFraction.Rd index d17fa259e..a8726a577 100644 --- a/man/computeAlleleFraction.Rd +++ b/man/computeAlleleFraction.Rd @@ -1,36 +1,42 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/allelicFraction.R +% Please edit documentation in R/allelicFraction_internal.R \encoding{UTF-8} \name{computeAlleleFraction} \alias{computeAlleleFraction} -\title{TODO} +\title{Compute the allelic fraction for +each imbalanced segment} \usage{ -computeAlleleFraction(snp.pos, w = 10, cutOff = -3) +computeAlleleFraction(snpPos, w = 10, cutOff = -3) } \arguments{ -\item{snp.pos}{a \code{data.frame} containing the genotype information for -a SNV dataset. TODO} +\item{snpPos}{a \code{data.frame} containing the genotype information for +a SNV dataset.} \item{w}{a single positive \code{numeric} representing the size of the window to compute the allelic fraction. Default: \code{10}.} -\item{cutOff}{a single \code{numeric} representing TODO. Default: \code{-3}.} +\item{cutOff}{a \code{numeric} representing the cut-off for considering +a region imbalanced when comparing likelihood to gave allelic fraction +change and likelihood not to have allelic fraction change. +Default: \code{-3}.} } \value{ a \code{matrix} of \code{numeric} with 3 columns where each row represent a segment of imbalanced SNVs. The first column represents the position, in -\code{snp.pos}, of the first +\code{snpPos}, of the first SNV in the segment. The second column represents the position, in the -\code{snp.pos}, of the last SNV in the segment. The third column represents +\code{snpPos}, of the last SNV in the segment. The third column represents the lower allelic frequency of the segment and is \code{NA} when the value cannot be calculated. The value \code{NULL} is returned when none of the SNVs tested positive for the imbalance. } \description{ -TODO +This function computes the allelic fraction for each segment +different than 0.5. The allelic fraction of the segment can be decomposed in +sub-segments. } \examples{ @@ -50,7 +56,7 @@ snpInfo <- data.frame(cnt.tot=c(41, 17, 27, 15, 11, 37, 16, 32), stringAsFactor=FALSE) ## The function returns NULL when there is not imbalanced SNVs -RAIDS:::computeAlleleFraction(snp.pos=snpInfo, w=10, cutOff=-3) +RAIDS:::computeAlleleFraction(snpPos=snpInfo, w=10, cutOff=-3) } diff --git a/man/computeAllelicFractionDNA.Rd b/man/computeAllelicFractionDNA.Rd index 159d8830d..f8428788b 100644 --- a/man/computeAllelicFractionDNA.Rd +++ b/man/computeAllelicFractionDNA.Rd @@ -23,7 +23,7 @@ computeAllelicFractionDNA( } \arguments{ \item{gdsReference}{an object of class \code{\link[gdsfmt]{gds.class}} -(a GDS file), the opened 1KG GDS file.} +(a GDS file), the opened Reference GDS file.} \item{gdsSample}{an object of class \code{\link[gdsfmt]{gds.class}} (a GDS file), the opened Profile GDS file.} @@ -66,25 +66,25 @@ message when running.} a \code{data.frame} containing the allelic information for the pruned SNV dataset with coverage > \code{minCov}. The \code{data.frame} contains those columns: -\itemize{ -\item{cnt.tot} {a \code{integer} representing the total allele count} -\item{cnt.ref} {a \code{integer} representing the reference allele count} -\item{cnt.alt} {a \code{integer} representing the alternative allele count} -\item{snp.pos} {a \code{integer} representing the position on the chromosome} -\item{snp.chr} {a \code{integer} representing the chromosome} -\item{normal.geno} {a \code{integer} representing the genotype +\describe{ +\item{cnt.tot}{ a \code{integer} representing the total allele count} +\item{cnt.ref}{ a \code{integer} representing the reference allele count} +\item{cnt.alt}{ a \code{integer} representing the alternative allele count} +\item{snp.pos}{ a \code{integer} representing the position on the chromosome} +\item{snp.chr}{ a \code{integer} representing the chromosome} +\item{normal.geno}{ a \code{integer} representing the genotype (0=wild-type reference; 1=heterozygote; 2=homozygote alternative; 3=unkown)} -\item{pruned} {a \code{logical} indicating if the SNV is retained after +\item{pruned}{ a \code{logical} indicating if the SNV is retained after pruning} -\item{snp.index} {a \code{integer} representing the index position of the -SNV in the 1KG GDS file that contains all SNVs} -\item{keep} {a \code{logical} indicating if the genotype exists for the SNV} -\item{hetero} {a \code{logical} indicating if the SNV is heterozygote} -\item{homo} {a \code{logical} indicating if the SNV is homozygote} -\item{lap} {a \code{numeric} indicating lower allelic fraction} -\item{LOH} {a \code{integer} indicating if the SNV is in an LOH region +\item{snp.index}{ a \code{integer} representing the index position of the +SNV in the Reference GDS file that contains all SNVs} +\item{keep}{ a \code{logical} indicating if the genotype exists for the SNV} +\item{hetero}{ a \code{logical} indicating if the SNV is heterozygote} +\item{homo}{ a \code{logical} indicating if the SNV is homozygote} +\item{lap}{ a \code{numeric} representing the lower allelic fraction} +\item{LOH}{ a \code{integer} indicating if the SNV is in an LOH region (0=not LOH, 1=in LOH)} -\item{imbAR} {a \code{integer} indicating if the SNV is in an imbalanced +\item{imbAR}{ a \code{integer} indicating if the SNV is in an imbalanced region (-1=not classified as imbalanced or LOH, 0=in LOH; 1=tested positive for imbalance in at least 1 window)} } @@ -95,49 +95,50 @@ allelic fraction for the pruned SNV dataset specific to a DNA-seq profile } \examples{ +## Required library for GDS +library(SNPRelate) + ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata/tests", package="RAIDS") -fileGDS <- file.path(dataDir, "ex1_good_small_1KG_GDS.gds") +fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") + +## Temporary Profile GDS file for one profile in temporary directory +fileProfile <- file.path(tempdir(), "ex1.gds") ## Copy the Profile GDS file demo that has been pruned and annotated -## into a test directory (deleted after the example has been run) -dataDirAllelicFraction <- file.path(system.file("extdata", package="RAIDS"), - "demoAllelicFraction") -dir.create(dataDirAllelicFraction, showWarnings=FALSE, - recursive=FALSE, mode="0777") file.copy(file.path(dataDir, "ex1_demo_with_pruning_and_1KG_annot.gds"), - file.path(dataDirAllelicFraction, "ex1.gds")) + fileProfile) ## Open the reference GDS file (demo version) gds1KG <- snpgdsOpen(fileGDS) -## Profile GDS file for one profile -fileProfile <- file.path(dataDirAllelicFraction, "ex1.gds") +## Open Profile GDS file for one profile profileGDS <- openfn.gds(fileProfile) -## Chromosome length information -## chr23 is chrX, chr24 is chrY and chrM is 25 -chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L, - 181538259L, 170805979L, 159345973L, 145138636L, 138394717L, 133797422L, - 135086622L, 133275309L, 114364328L, 107043718L, 101991189L, 90338345L, - 83257441L, 80373285L, 58617616L, 64444167L, 46709983L, 50818468L, - 156040895L, 57227415L, 16569L) - -## The function returns a data frame containing the allelic fraction info -result <- RAIDS:::computeAllelicFractionDNA(gdsReference=gds1KG, - gdsSample=profileGDS, - currentProfile="ex1", studyID="MYDATA", chrInfo=chrInfo, minCov=10L, - minProb=0.999, eProb=0.001, cutOffLOH=-5, - cutOffHomoScore=-3, wAR=9L, verbose=FALSE) -head(result) - -## Close both GDS files (important) -closefn.gds(profileGDS) -closefn.gds(gds1KG) - -## Unlink Profile GDS file (created for demo purpose) -unlink(file.path(dataDirAllelicFraction, "ex1.gds")) -unlink(dataDirAllelicFraction) +## Required library for this example to run correctly +if (requireNamespace("GenomeInfoDb", quietly=TRUE) && + requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { + + ## Chromosome length information + ## chr23 is chrX, chr24 is chrY and chrM is 25 + chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] + + ## The function returns a data frame containing the allelic fraction info + result <- RAIDS:::computeAllelicFractionDNA(gdsReference=gds1KG, + gdsSample=profileGDS, currentProfile="ex1", studyID="MYDATA", + chrInfo=chrInfo, minCov=10L, + minProb=0.999, eProb=0.001, cutOffLOH=-5, + cutOffHomoScore=-3, wAR=9L, verbose=FALSE) + head(result) + + ## Close both GDS files (important) + closefn.gds(profileGDS) + closefn.gds(gds1KG) + + ## Remove Profile GDS file (created for demo purpose) + unlink(fileProfile, force=TRUE) + +} } \author{ diff --git a/man/computeAllelicFractionRNA.Rd b/man/computeAllelicFractionRNA.Rd index 9891a3a83..1b60d3ebb 100644 --- a/man/computeAllelicFractionRNA.Rd +++ b/man/computeAllelicFractionRNA.Rd @@ -24,13 +24,13 @@ computeAllelicFractionRNA( } \arguments{ \item{gdsReference}{an object of class \code{\link[gdsfmt]{gds.class}} -(a GDS file), the opened 1KG GDS file.} +(a GDS file), the opened Reference GDS file.} \item{gdsSample}{an object of class \code{\link[gdsfmt]{gds.class}} (a GDS file), the opened Profile GDS file.} \item{gdsRefAnnot}{an object of class \code{\link[gdsfmt]{gds.class}} -(a GDS file), the opeoned 1KG SNV Annotation GDS file.} +(a GDS file), the opened Reference SNV Annotation GDS file.} \item{currentProfile}{a \code{character} string corresponding to the sample identifier as used in \code{\link{pruningSample}} function.} @@ -55,7 +55,7 @@ Default: \code{0.999}.} \item{eProb}{a single \code{numeric} between 0 and 1 representing the probability of sequencing error. Default: \code{0.001}.} -\item{cutOffLOH}{a single log of the score to be LOH TODO. +\item{cutOffLOH}{a single \code{numeric} log of the score to be LOH. Default: \code{-5}.} \item{cutOffAR}{a single \code{numeric} representing the cutoff, in @@ -67,20 +67,93 @@ Default: \code{3}.} message when running.} } \value{ -a \code{data.frame} with lap for the pruned SNV dataset with -coverage > \code{minCov}. TODO +a \code{data.frame} containing the allelic information for the +pruned SNV dataset with coverage > \code{minCov}. The \code{data.frame} +contains those columns: +\describe{ +\item{cnt.tot}{ a \code{integer} representing the total allele count} +\item{cnt.ref}{ a \code{integer} representing the reference allele count} +\item{cnt.alt}{ a \code{integer} representing the alternative allele count} +\item{snp.pos}{ a \code{integer} representing the position on the chromosome} +\item{snp.chr}{ a \code{integer} representing the chromosome} +\item{normal.geno}{ a \code{integer} representing the genotype +(0=wild-type reference; 1=heterozygote; 2=homozygote alternative; 3=unkown)} +\item{pruned}{ a \code{logical} indicating if the SNV is retained after +pruning} +\item{snp.index}{ a \code{integer} representing the index position of the +SNV in the Reference GDS file that contains all SNVs} +\item{keep}{ a \code{logical} indicating if the genotype exists for the SNV} +\item{hetero}{ a \code{logical} indicating if the SNV is heterozygote} +\item{homo}{ a \code{logical} indicating if the SNV is homozygote} +\item{block.id}{ a \code{integer} indicating the unique identifier of the +block in the Population Reference Annotation +GDS file that contains the current SNV} +\item{phase}{ a \code{integer} indicating the phase of the variant +if known, \code{3} if not known} +\item{lap}{ a \code{numeric} indicating lower allelic fraction} +\item{LOH}{ a \code{integer} indicating if the SNV is in an LOH region +(0=not LOH, 1=in LOH)} +\item{imbAR}{ a \code{integer} indicating if the SNV is in an imbalanced +region (-1=not classified as imbalanced or LOH, 0=in LOH; 1=tested +positive for imbalance in at least 1 window)} +\item{freq}{ a \code{numeric} indicating the frequency of the variant +in the the reference} +} } \description{ The function creates a \code{data.frame} containing the allelic fraction for the pruned SNV dataset specific to a RNA-seq sample. -TODO } \examples{ -## Path to the demo pedigree file is located in this package -dataDir <- system.file("extdata", package="RAIDS") +## Required library for GDS +library(SNPRelate) + +#' ## Path to the demo 1KG GDS file is located in this package +dataDir <- system.file("extdata/tests", package="RAIDS") +fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") +fileAnnotGDS <- file.path(dataDir, "ex1_good_small_1KG_Annot.gds") + +## Temporary Profile GDS file for one profile in temporary directory +fileProfile <- file.path(tempdir(), "ex1.gds") + +## Copy the Profile GDS file demo that has been pruned and annotated +file.copy(file.path(dataDir, "ex1_demo_with_pruning_and_1KG_annot.gds"), + fileProfile) + +## Open the reference GDS file (demo version) +gds1KG <- snpgdsOpen(fileGDS) +gdsRefAnnot <- openfn.gds(fileAnnotGDS) + +## Open Profile GDS file for one profile +profileGDS <- openfn.gds(fileProfile) -## TODO +## Required library for this example to run correctly +if (requireNamespace("GenomeInfoDb", quietly=TRUE) && + requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { + + ## Chromosome length information + ## chr23 is chrX, chr24 is chrY and chrM is 25 + chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] + + ## The function returns a data frame containing the allelic fraction info + result <- RAIDS:::computeAllelicFractionRNA(gdsReference=gds1KG, + gdsSample=profileGDS, gdsRefAnnot=gdsRefAnnot, + currentProfile="ex1", studyID="MYDATA", + blockID="GeneS.Ensembl.Hsapiens.v86", + chrInfo=chrInfo, minCov=10L, minProb=0.999, eProb=0.001, + cutOffLOH=-5, cutOffAR=3, verbose=FALSE) + head(result) + + ## Close both GDS files (important) + closefn.gds(profileGDS) + closefn.gds(gds1KG) + closefn.gds(gdsRefAnnot) + + ## Remove Profile GDS file (created for demo purpose) + unlink(fileProfile, force=TRUE) + +} } \author{ diff --git a/man/computeAllelicImbDNAChr.Rd b/man/computeAllelicImbDNAChr.Rd index 1d16ed967..7872acdd9 100644 --- a/man/computeAllelicImbDNAChr.Rd +++ b/man/computeAllelicImbDNAChr.Rd @@ -5,34 +5,34 @@ \alias{computeAllelicImbDNAChr} \title{Verify if SNVs are in an imbalance region} \usage{ -computeAllelicImbDNAChr(snp.pos, chr, wAR = 10, cutOffEmptyBox = -3) +computeAllelicImbDNAChr(snpPos, chr, wAR = 10, cutOffEmptyBox = -3) } \arguments{ -\item{snp.pos}{a \code{data.frame} containing the SNV information for the +\item{snpPos}{a \code{data.frame} containing the SNV information for the chromosome specified by the \code{chr} argument. The \code{data.frame} must contain: -\itemize{ -\item{cnt.tot} {a single \code{integer} representing the total coverage for +\describe{ +\item{cnt.tot}{ a single \code{integer} representing the total coverage for the SNV.} -\item{cnt.ref} {a single \code{integer} representing the coverage for +\item{cnt.ref}{ a single \code{integer} representing the coverage for the reference allele.} -\item{cnt.alt} {a single \code{integer} representing the coverage for +\item{cnt.alt}{ a single \code{integer} representing the coverage for the alternative allele.} -\item{snp.pos} {a single \code{integer} representing the SNV position.} -\item{snp.chr} {a single \code{integer} representing the SNV chromosome.} -\item{normal.geno} {a single \code{numeric} indicating the genotype of the +\item{snp.pos}{ a single \code{integer} representing the SNV position.} +\item{snp.chr}{ a single \code{integer} representing the SNV chromosome.} +\item{normal.geno}{ a single \code{numeric} indicating the genotype of the SNV. The possibles are: \code{0} (wild-type homozygote), \code{1} (heterozygote), \code{2} (altenative homozygote), \code{3} indicating that the normal genotype is unknown.} -\item{pruned} {a \code{logical} indicating if the SNV is retained after +\item{pruned}{ a \code{logical} indicating if the SNV is retained after pruning} -\item{snp.index} {a \code{integer} representing the index position of the -SNV in the 1KG GDS file that contains all SNVs} -\item{keep} {a \code{logical} indicating if the genotype exists for the SNV} -\item{hetero} {a \code{logical} indicating if the SNV is heterozygote} -\item{homo} {a \code{logical} indicating if the SNV is homozygote} -\item{lap} {a \code{numeric} indicating lower allelic fraction} -\item{LOH} {a \code{integer} indicating if the SNV is in an LOH region +\item{snp.index}{ a \code{integer} representing the index position of the +SNV in the Reference GDS file that contains all SNVs} +\item{keep}{ a \code{logical} indicating if the genotype exists for the SNV} +\item{hetero}{ a \code{logical} indicating if the SNV is heterozygote} +\item{homo}{ a \code{logical} indicating if the SNV is homozygote} +\item{lap}{ a \code{numeric} indicating lower allelic fraction} +\item{LOH}{ a \code{integer} indicating if the SNV is in an LOH region (0=not LOH, 1=in LOH)} }} @@ -50,7 +50,7 @@ a \code{vector} of \code{integer} indicating if the SNV is in an imbalanced region (-1=not classified as imbalanced or LOH, 0=in LOH; 1=tested positive for imbalance in at least 1 window). The vector as an entry for each SNV present in the -input \code{snp.pos}. +input \code{snpPos}. } \description{ The function verifies, for each SNV present in the data frame, @@ -63,43 +63,46 @@ library(gdsfmt) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata/tests", package="RAIDS") -fileGDS <- file.path(dataDir, "ex1_good_small_1KG_GDS.gds") +fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") ## Open the reference GDS file (demo version) gds1KG <- snpgdsOpen(fileGDS) -## Chromosome length information -## chr23 is chrX, chr24 is chrY and chrM is 25 -chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L, - 181538259L, 170805979L, 159345973L, 145138636L, 138394717L, 133797422L, - 135086622L, 133275309L, 114364328L, 107043718L, 101991189L, 90338345L, - 83257441L, 80373285L, 58617616L, 64444167L, 46709983L, 50818468L, - 156040895L, 57227415L, 16569L) +## Required library for this example to run correctly +if (requireNamespace("GenomeInfoDb", quietly=TRUE) && + requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { -## Data frame with SNV information for the specified chromosome (chr 1) -snpInfo <- data.frame(cnt.tot=c(41, 17, 27, 15, 11, 37, 16, 32), - cnt.ref=c(40, 17, 27, 15, 4, 14, 16, 32), - cnt.alt=c(0, 0, 0, 0, 7, 23, 0, 0), - snp.pos=c(3722256, 3722328, 3767522, 3868160, 3869467, 4712655, - 6085318, 6213145), - snp.chr=c(rep(1, 8)), - normal.geno=c(rep(1, 8)), pruned=c(TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, - TRUE, TRUE), - pruned=c(TRUE, TRUE, FALSE, TRUE, FALSE, rep(TRUE, 3)), - snp.index=c(160, 162, 204, 256, 259, 288, 366, 465), - keep=rep(TRUE, 8), hetero=c(rep(FALSE, 4), TRUE, TRUE, rep(FALSE, 2)), - homo=c(rep(TRUE, 4), FALSE, FALSE, TRUE, TRUE), - lap=rep(-1, 8), LOH=rep(0, 8), imbAR=rep(-1, 8), - stringAsFactor=FALSE) + ## Chromosome length information + ## chr23 is chrX, chr24 is chrY and chrM is 25 + chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] -## The function returns a data frame containing the information about the -## LOH regions in the specified chromosome -result <- RAIDS:::computeAllelicImbDNAChr(snp.pos=snpInfo, chr=1, wAR=10, - cutOffEmptyBox=-3) -head(result) + ## Data frame with SNV information for the specified chromosome (chr 1) + snpInfo <- data.frame(cnt.tot=c(41, 17, 27, 15, 11, 37, 16, 32), + cnt.ref=c(40, 17, 27, 15, 4, 14, 16, 32), + cnt.alt=c(0, 0, 0, 0, 7, 23, 0, 0), + snp.pos=c(3722256, 3722328, 3767522, 3868160, 3869467, 4712655, + 6085318, 6213145), + snp.chr=c(rep(1, 8)), + normal.geno=c(rep(1, 8)), pruned=c(TRUE, TRUE, FALSE, TRUE, + FALSE, TRUE, TRUE, TRUE), + pruned=c(TRUE, TRUE, FALSE, TRUE, FALSE, rep(TRUE, 3)), + snp.index=c(160, 162, 204, 256, 259, 288, 366, 465), + keep=rep(TRUE, 8), + hetero=c(rep(FALSE, 4), TRUE, TRUE, rep(FALSE, 2)), + homo=c(rep(TRUE, 4), FALSE, FALSE, TRUE, TRUE), + lap=rep(-1, 8), LOH=rep(0, 8), imbAR=rep(-1, 8), + stringAsFactor=FALSE) -## Close GDS file (important) -closefn.gds(gds1KG) + ## The function returns a data frame containing the information about + ## the LOH regions in the specified chromosome + result <- RAIDS:::computeAllelicImbDNAChr(snpPos=snpInfo, chr=1, wAR=10, + cutOffEmptyBox=-3) + head(result) + + ## Close GDS file (important) + closefn.gds(gds1KG) + +} } \author{ diff --git a/man/computeAncestryFromSynthetic.Rd b/man/computeAncestryFromSynthetic.Rd new file mode 100644 index 000000000..148d6d685 --- /dev/null +++ b/man/computeAncestryFromSynthetic.Rd @@ -0,0 +1,295 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/processStudy_internal.R +\encoding{UTF-8} +\name{computeAncestryFromSynthetic} +\alias{computeAncestryFromSynthetic} +\title{Select the optimal K and D parameters using the synthetic data and +infer the ancestry of a specific profile} +\usage{ +computeAncestryFromSynthetic( + gdsReference, + gdsProfile, + syntheticKNN, + pedSyn, + currentProfile, + spRef, + studyIDSyn, + np = 1L, + listCatPop = c("EAS", "EUR", "AFR", "AMR", "SAS"), + fieldPopIn1KG = "superPop", + fieldPopInfAnc = "SuperPop", + kList = seq(2, 15, 1), + pcaList = seq(2, 15, 1), + algorithm = c("exact", "randomized"), + eigenCount = 32L, + missingRate = NaN, + verbose = FALSE +) +} +\arguments{ +\item{gdsReference}{an object of class \link[gdsfmt]{gds.class} (a GDS +file), the opened 1KG GDS file.} + +\item{gdsProfile}{an object of class \code{\link[gdsfmt]{gds.class}} +(a GDS file), the opened Profile GDS file.} + +\item{syntheticKNN}{a \code{vector} of \code{character} strings representing +the name of files that contain the results of ancestry inference done on +the synthetic profiles for multiple values of \emph{D} and \emph{K}. The files must +exist.} + +\item{pedSyn}{a \code{data.frame} containing the columns extracted from the +GDS Sample 'study.annot' node with a extra column named as the 'popName' +parameter that has been extracted from the 1KG GDS 'sample.annot' node.} + +\item{currentProfile}{a \code{character} string representing the profile +identifier of the current profile on which ancestry will be inferred.} + +\item{spRef}{a \code{vector} of \code{character} strings representing the +known super population ancestry for the 1KG profiles. The 1KG profile +identifiers are used as names for the \code{vector}.} + +\item{studyIDSyn}{a \code{character} string corresponding to the study +identifier. The study identifier must be present in the GDS Sample file.} + +\item{np}{a single positive \code{integer} representing the number of +threads. Default: \code{1L}.} + +\item{listCatPop}{a \code{vector} of \code{character} string +representing the list of possible ancestry assignations. Default: +\code{("EAS", "EUR", "AFR", "AMR", "SAS")}.} + +\item{fieldPopIn1KG}{a \code{character} string representing the name of the +column that contains the known ancestry for the reference profiles in +the Reference GDS file.} + +\item{fieldPopInfAnc}{a \code{character} string representing the name of +the column that will contain the inferred ancestry for the specified +profiles. Default: \code{"SuperPop"}.} + +\item{kList}{a \code{vector} of \code{integer} representing the list of +values tested for the \emph{K} parameter. The \emph{K} parameter represents the +number of neighbors used in the K-nearest neighbor analysis. If \code{NULL}, +the value \code{seq(2,15,1)} is assigned. +Default: \code{seq(2,15,1)}.} + +\item{pcaList}{a \code{vector} of \code{integer} representing the list of +values tested for the \emph{D} parameter. The \emph{D} parameter represents the +number of dimensions used in the PCA analysis. If \code{NULL}, +the value \code{seq(2,15,1)} is assigned. +Default: \code{seq(2,15,1)}.} + +\item{algorithm}{a \code{character} string representing the algorithm used +to calculate the PCA. The 2 choices are "exact" (traditional exact +calculation) and "randomized" (fast PCA with randomized algorithm +introduced in Galinsky et al. 2016). Default: \code{"exact"}.} + +\item{eigenCount}{a single \code{integer} indicating the number of +eigenvectors that will be in the output of the \link[SNPRelate]{snpgdsPCA} +function; if 'eigenCount' <= 0, then all eigenvectors are returned. +Default: \code{32L}.} + +\item{missingRate}{a \code{numeric} value representing the threshold +missing rate at with the SNVs are discarded; the SNVs are retained in the +\link[SNPRelate]{snpgdsPCA} +with "<= missingRate" only; if \code{NaN}, no missing threshold. +Default: \code{NaN}.} + +\item{verbose}{a \code{logical} indicating if messages should be printed +to show how the different steps in the function. Default: \code{FALSE}.} +} +\value{ +a \code{list} containing 4 entries: +\describe{ +\item{\code{pcaSample}}{ a \code{list} containing the information related +to the eigenvectors. The \code{list} contains those 3 entries: +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current profile.} +\item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric} containing +the eigenvectors for the reference profiles.} +\item{\code{eigenvector}}{ a \code{matrix} of \code{numeric} containing the +eigenvectors for the current profile projected on the PCA from the +reference profiles.} +} +} +\item{\code{paraSample}}{ a \code{list} containing the results with +different \code{D} and \code{K} values that lead to optimal parameter +selection. The \code{list} contains those entries: +\describe{ +\item{\code{dfPCA}}{ a \code{data.frame} containing statistical results +on all combined synthetic results done with a fixed value of \code{D} (the +number of dimensions). The \code{data.frame} contains those columns: +\describe{ +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions).} +\item{\code{median}}{ a \code{numeric} representing the median of the +minimum AUROC obtained (within super populations) for all combination of +the fixed \code{D} value and all tested \code{K} values. } +\item{\code{mad}}{ a \code{numeric} representing the MAD of the minimum +AUROC obtained (within super populations) for all combination of the fixed +\code{D} value and all tested \code{K} values. } +\item{\code{upQuartile}}{ a \code{numeric} representing the upper quartile +of the minimum AUROC obtained (within super populations) for all +combination of the fixed \code{D} value and all tested \code{K} values. } +\item{\code{k}}{ a \code{numeric} representing the optimal \code{K} value +(the number of neighbors) for a fixed \code{D} value. } +} +} +\item{\code{dfPop}}{ a \code{data.frame} containing statistical results on +all combined synthetic results done with different values of \code{D} (the +number of dimensions) and \code{K} (the number of neighbors). +The \code{data.frame} contains those columns: +\describe{ +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions).} +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors).} +\item{\code{AUROC.min}}{ a \code{numeric} representing the minimum accuracy +obtained by grouping all the synthetic results by super-populations, for +the specified values of \code{D} and \code{K}.} +\item{\code{AUROC}}{ a \code{numeric} representing the accuracy obtained +by grouping all the synthetic results for the specified values of \code{D} +and \code{K}.} +\item{\code{Accu.CM}}{ a \code{numeric} representing the value of accuracy +of the confusion matrix obtained by grouping all the synthetic results for +the specified values of \code{D} and \code{K}.} +} +} +\item{\code{dfAUROC}}{ a \code{data.frame} the summary of the results by +super-population. The \code{data.frame} contains +those columns: +\describe{ +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions).} +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors).} +\item{\code{Call}}{ a \code{character} string representing the +super-population.} +\item{\code{L}}{ a \code{numeric} representing the lower value of the 95\% +confidence interval for the AUROC obtained for the fixed values of +super-population, \code{D} and \code{K}.} +\item{\code{AUROC}}{ a \code{numeric} representing the AUROC obtained for the +fixed values of super-population, \code{D} and \code{K}.} +\item{\code{H}}{ a \code{numeric} representing the higher value of the 95\% +confidence interval for the AUROC obtained for the fixed values of +super-population, \code{D} and \code{K}.} +} +} +\item{\code{D}}{ a \code{numeric} representing the optimal \code{D} value +(the number of dimensions) for the specific profile.} +\item{\code{K}}{ a \code{numeric} representing the optimal \code{K} value +(the number of neighbors) for the specific profile.} +\item{\code{listD}}{ a \code{numeric} representing the optimal \code{D} +values (the number of dimensions) for the specific profile. More than one +\code{D} is possible.} +} +} +\item{\code{KNNSample}}{ a \code{list} containing the inferred ancestry +using different \code{D} and \code{K} values. The \code{list} contains +those entries: +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current profile.} +\item{\code{matKNN}}{ a \code{data.frame} containing the inferred ancestry +for different values of \code{K} and \code{D}. The \code{data.frame} +contains those columns: +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current profile.} +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions) used to infer the ancestry. } +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors) used to infer the ancestry. } +\item{\code{SuperPop}}{ a \code{character} string representing the inferred +ancestry for the specified \code{D} and \code{K} values.} +} +} +} +} +\item{\code{Ancestry}}{ a \code{data.frame} containing the inferred +ancestry for the current profile. The \code{data.frame} contains those +columns: +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current profile.} +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions) used to infer the ancestry.} +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors) used to infer the ancestry.} +\item{\code{SuperPop}}{ a \code{character} string representing the inferred +ancestry.} +} +} +} +} +\description{ +The function select the optimal K and D parameters for a +specific profile. The results on the synthetic data are used for the +parameter selection. Once the optimal parameters are selected, the +ancestry is inferred for the specific profile. +} +\examples{ + + +## Required library +library(gdsfmt) + +## Load the known ancestry for the demo 1KG reference profiles +data(demoKnownSuperPop1KG) + +## The Reference GDS file +path1KG <- system.file("extdata/tests", package="RAIDS") + +## Open the Reference GDS file +gdsRef <- snpgdsOpen(file.path(path1KG, "ex1_good_small_1KG.gds")) + +## Path to the demo synthetic results files +## List of the KNN result files from PCA run on synthetic data +dataDirRes <- system.file("extdata/demoAncestryCall/ex1", package="RAIDS") +listFilesName <- dir(file.path(dataDirRes), ".rds") +listFiles <- file.path(file.path(dataDirRes) , listFilesName) +syntheticKNN <- lapply(listFiles, FUN=function(x){return(readRDS(x))}) +syntheticKNN <- do.call(rbind, syntheticKNN) + +# The name of the synthetic study +studyID <- "MYDATA.Synthetic" + +## Path to the demo Profile GDS file is located in this package +dataDir <- system.file("extdata/demoAncestryCall", package="RAIDS") + +## Open the Profile GDS file +gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) +\dontrun{ + pedSyn <- RAIDS:::prepPedSynthetic1KG(gdsReference=gdsRef, + gdsSample=gdsProfile, studyID=studyID, popName="superPop") + + ## Run the ancestry inference on one profile called 'ex1' + ## The values of K and D used for the inference are selected using the + ## synthetic results listFiles=listFiles, + resCall <- RAIDS:::computeAncestryFromSynthetic(gdsReference=gdsRef, + gdsProfile=gdsProfile, + syntheticKNN = syntheticKNN, + pedSyn = pedSyn, + currentProfile=c("ex1"), + spRef=demoKnownSuperPop1KG, + studyIDSyn=studyID, np=1L) + + ## The ancestry called with the optimal D and K values + resCall$Ancestry +} +## Close the GDS files (important) +closefn.gds(gdsProfile) +closefn.gds(gdsRef) + + +} +\references{ +Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, +Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution +of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. +doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} diff --git a/man/computeAncestryFromSyntheticFile.Rd b/man/computeAncestryFromSyntheticFile.Rd index 6eac914ba..337bba3f0 100644 --- a/man/computeAncestryFromSyntheticFile.Rd +++ b/man/computeAncestryFromSyntheticFile.Rd @@ -34,7 +34,7 @@ file), the opened 1KG GDS file.} \item{listFiles}{a \code{vector} of \code{character} strings representing the name of files that contain the results of ancestry inference done on -the synthetic profiles for multiple values of _D_ and _K_. The files must +the synthetic profiles for multiple values of \emph{D} and \emph{K}. The files must exist.} \item{currentProfile}{a \code{character} string representing the profile @@ -63,13 +63,13 @@ the column that will contain the inferred ancestry for the specified profiles. Default: \code{"SuperPop"}.} \item{kList}{a \code{vector} of \code{integer} representing the list of -values tested for the _K_ parameter. The _K_ parameter represents the +values tested for the \emph{K} parameter. The \emph{K} parameter represents the number of neighbors used in the K-nearest neighbor analysis. If \code{NULL}, the value \code{seq(2,15,1)} is assigned. Default: \code{seq(2,15,1)}.} \item{pcaList}{a \code{vector} of \code{integer} representing the list of -values tested for the _D_ parameter. The _D_ parameter represents the +values tested for the \emph{D} parameter. The \emph{D} parameter represents the number of dimensions used in the PCA analysis. If \code{NULL}, the value \code{seq(2,15,1)} is assigned. Default: \code{seq(2,15,1)}.} @@ -95,10 +95,10 @@ to show how the different steps in the function. Default: \code{FALSE}.} } \value{ a \code{list} containing 4 entries: -\itemize{ +\describe{ \item{\code{pcaSample}}{ a \code{list} containing the information related to the eigenvectors. The \code{list} contains those 3 entries: -\itemize{ +\describe{ \item{\code{sample.id}}{ a \code{character} string representing the unique identifier of the current profile.} \item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric} containing @@ -108,15 +108,14 @@ eigenvectors for the current profile projected on the PCA from the reference profiles.} } } - \item{\code{paraSample}}{ a \code{list} containing the results with different \code{D} and \code{K} values that lead to optimal parameter selection. The \code{list} contains those entries: -\itemize{ +\describe{ \item{\code{dfPCA}}{ a \code{data.frame} containing statistical results on all combined synthetic results done with a fixed value of \code{D} (the number of dimensions). The \code{data.frame} contains those columns: -\itemize{ +\describe{ \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the number of dimensions).} \item{\code{median}}{ a \code{numeric} representing the median of the @@ -136,7 +135,7 @@ combination of the fixed \code{D} value and all tested \code{K} values. } all combined synthetic results done with different values of \code{D} (the number of dimensions) and \code{K} (the number of neighbors). The \code{data.frame} contains those columns: -\itemize{ +\describe{ \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the number of dimensions).} \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the @@ -155,19 +154,19 @@ the specified values of \code{D} and \code{K}.} \item{\code{dfAUROC}}{ a \code{data.frame} the summary of the results by super-population. The \code{data.frame} contains those columns: -\itemize{ +\describe{ \item{\code{pcaD}}{ a \code{numeric} representing the value of \code{D} (the number of dimensions).} \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the number of neighbors).} \item{\code{Call}}{ a \code{character} string representing the super-population.} -\item{\code{L}}{ a \code{numeric} representing the lower value of the 95% +\item{\code{L}}{ a \code{numeric} representing the lower value of the 95\% confidence interval for the AUROC obtained for the fixed values of super-population, \code{D} and \code{K}.} \item{\code{AUR}}{ a \code{numeric} representing the AUROC obtained for the fixed values of super-population, \code{D} and \code{K}.} -\item{\code{H}}{ a \code{numeric} representing the higher value of the 95% +\item{\code{H}}{ a \code{numeric} representing the higher value of the 95\% confidence interval for the AUROC obtained for the fixed values of super-population, \code{D} and \code{K}.} } @@ -181,17 +180,16 @@ values (the number of dimensions) for the specific profile. More than one \code{D} is possible.} } } - \item{\code{KNNSample}}{ a \code{list} containing the inferred ancestry using different \code{D} and \code{K} values. The \code{list} contains those entries: -\itemize{ +\describe{ \item{\code{sample.id}}{ a \code{character} string representing the unique identifier of the current profile.} \item{\code{matKNN}}{ a \code{data.frame} containing the inferred ancestry for different values of \code{K} and \code{D}. The \code{data.frame} contains those columns: -\itemize{ +\describe{ \item{\code{sample.id}}{ a \code{character} string representing the unique identifier of the current profile.} \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the @@ -204,11 +202,10 @@ ancestry for the specified \code{D} and \code{K} values.} } } } - \item{\code{Ancestry}}{ a \code{data.frame} containing the inferred ancestry for the current profile. The \code{data.frame} contains those columns: -\itemize{ +\describe{ \item{\code{sample.id}}{ a \code{character} string representing the unique identifier of the current profile.} \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the @@ -233,11 +230,14 @@ ancestry is inferred for the specific profile. ## Required library library(gdsfmt) +## Load the known ancestry for the demo 1KG reference profiles +data(demoKnownSuperPop1KG) + ## The Reference GDS file -path1KG <- system.file("extdata/example/gdsRef", package="RAIDS") +path1KG <- system.file("extdata/tests", package="RAIDS") ## Open the Reference GDS file -gdsRef <- snpgdsOpen(file.path(path1KG, "ex1kg.gds")) +gdsRef <- snpgdsOpen(file.path(path1KG, "ex1_good_small_1KG.gds")) ## Path to the demo synthetic results files ## List of the KNN result files from PCA run on synthetic data @@ -248,10 +248,6 @@ listFiles <- file.path(file.path(dataDirRes) , listFilesName) # The name of the synthetic study studyID <- "MYDATA.Synthetic" -## The known ancestry for the 1KG reference profiles -dataDir1KG <- system.file("extdata/demoKNNSynthetic", package="RAIDS") -refKnownSuperPop <- readRDS(file.path(dataDir1KG, "knownSuperPop1KG.RDS")) - ## Path to the demo Profile GDS file is located in this package dataDir <- system.file("extdata/demoAncestryCall", package="RAIDS") @@ -265,7 +261,7 @@ resCall <- computeAncestryFromSyntheticFile(gdsReference=gdsRef, gdsProfile=gdsProfile, listFiles=listFiles, currentProfile=c("ex1"), - spRef=refKnownSuperPop, + spRef=demoKnownSuperPop1KG, studyIDSyn=studyID, np=1L) ## The ancestry called with the optimal D and K values diff --git a/man/computeKNNRefSample.Rd b/man/computeKNNRefSample.Rd index ec0e79fb0..92a5b5617 100644 --- a/man/computeKNNRefSample.Rd +++ b/man/computeKNNRefSample.Rd @@ -34,35 +34,35 @@ the column that will contain the inferred ancestry for the specified profile. Default: \code{"SuperPop"}.} \item{kList}{a \code{vector} of \code{integer} representing the list of -values tested for the _K_ parameter. The _K_ parameter represents the +values tested for the \emph{K} parameter. The \emph{K} parameter represents the number of neighbors used in the K-nearest neighbor analysis. If \code{NULL}, the value \code{seq(2,15,1)} is assigned. Default: \code{seq(2,15,1)}.} \item{pcaList}{a \code{vector} of \code{integer} representing the list of -values tested for the _D_ parameter. The D parameter represents the +values tested for the \emph{D} parameter. The D parameter represents the number of dimensions used in the PCA analysis. If \code{NULL}, the value \code{seq(2, 15, 1)} is assigned. Default: \code{seq(2, 15, 1)}.} } \value{ a \code{list} containing 4 entries: -\itemize{ -\item{\code{sample.id}} { a \code{vector} of \code{character} strings +\describe{ +\item{\code{sample.id}}{ a \code{vector} of \code{character} strings representing the identifier of the profile analysed.} -\item{\code{matKNN}} { a \code{data.frame} containing the super population +\item{\code{matKNN}}{ a \code{data.frame} containing the super population inference for the profile for different values of PCA dimensions \code{D} and k-neighbors values \code{K}. The fourth column title corresponds to the \code{fieldPopInfAnc} parameter. The \code{data.frame} contains 4 columns: -\itemize{ -\item{\code{sample.id}} {a \code{character} string representing +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the identifier of the profile analysed.} -\item{\code{D}} { a \code{numeric} strings representing +\item{\code{D}}{ a \code{numeric} strings representing the value of the PCA dimension used to infer the ancestry.} -\item{\code{K}} { a \code{numeric} strings representing +\item{\code{K}}{ a \code{numeric} strings representing the value of the k-neighbors used to infer the ancestry..} -\item{\code{fieldPopInfAnc}} { a \code{character} string representing +\item{\code{fieldPopInfAnc}}{ a \code{character} string representing the inferred ancestry.} } } @@ -74,22 +74,23 @@ one specific profile. The function uses the 'knn' package. } \examples{ -## Path to the demo files located in this package -dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") +## Load the demo PCA on the synthetic profiles projected on the +## demo 1KG reference PCA +data(demoPCASyntheticProfiles) + +## Load the known ancestry for the demo 1KG reference profiles +data(demoKnownSuperPop1KG) ## The PCA with 1 profile projected on the 1KG reference PCA ## Only one profile is retained -pca <- readRDS(file.path(dataDir, "pcaSynthetic.RDS")) +pca <- demoPCASyntheticProfiles pca$sample.id <- pca$sample.id[1] pca$eigenvector <- pca$eigenvector[1, , drop=FALSE] -## The known ancestry for the 1KG reference profiles -refKnownSuperPop <- readRDS(file.path(dataDir, "knownSuperPop1KG.RDS")) - ## Projects profile on 1KG PCA results <- computeKNNRefSample(listEigenvector=pca, listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), - spRef=refKnownSuperPop, fieldPopInfAnc="SuperPop", + spRef=demoKnownSuperPop1KG, fieldPopInfAnc="SuperPop", kList=seq(10, 15, 1), pcaList=seq(10, 15, 1)) ## The assigned ancestry to the profile for different values of K and D diff --git a/man/computeKNNRefSynthetic.Rd b/man/computeKNNRefSynthetic.Rd index 919dbd76d..ef47f2bce 100644 --- a/man/computeKNNRefSynthetic.Rd +++ b/man/computeKNNRefSynthetic.Rd @@ -57,28 +57,28 @@ Default: \code{seq(2, 15, 1)}.} } \value{ a \code{list} containing 4 entries: -\itemize{ -\item{\code{sample.id}} {a \code{vector} of \code{character} strings +\describe{ +\item{\code{sample.id}}{ a \code{vector} of \code{character} strings representing the identifiers of the synthetic profiles analysed.} -\item{\code{sample1Kg}} {a \code{vector} of \code{character} strings +\item{\code{sample1Kg}}{ a \code{vector} of \code{character} strings representing the identifiers of the 1KG reference profiles used to generate the synthetic profiles.} -\item{\code{sp}} {a \code{vector} of \code{character} strings representing +\item{\code{sp}}{ a \code{vector} of \code{character} strings representing the known super population ancestry of the 1KG reference profiles used to generate the synthetic profiles.} -\item{\code{matKNN}} {a \code{data.frame} containing the super population +\item{\code{matKNN}}{ a \code{data.frame} containing the super population inference for each synthetic profiles for different values of PCA dimensions \code{D} and k-neighbors values \code{K}. The fourth column title corresponds to the \code{fieldPopInfAnc} parameter. The \code{data.frame} contains 4 columns: -\itemize{ -\item{\code{sample.id}} {a \code{character} string representing +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the identifier of the synthetic profile analysed.} -\item{\code{D}} {a \code{numeric} strings representing +\item{\code{D}}{ a \code{numeric} strings representing the value of the PCA dimension used to infer the super population.} -\item{\code{K}} {a \code{numeric} strings representing +\item{\code{K}}{ a \code{numeric} strings representing the value of the k-neighbors used to infer the super population.} -\item{\code{fieldPopInfAnc} value} {a \code{character} string representing +\item{\code{fieldPopInfAnc} value}{ a \code{character} string representing the inferred ancestry.} } } @@ -93,27 +93,29 @@ subset of the synthetic data set. The function uses the 'knn' package. ## Required library library(gdsfmt) -## Path to the demo Profile GDS file is located in this package -dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") - -# The name of the synthetic study -studyID <- "MYDATA.Synthetic" +## Load the demo PCA on the synthetic profiles projected on the +## demo 1KG reference PCA +data(demoPCASyntheticProfiles) -## The PCA on the synthetic profiles projected on the 1KG reference PCA -pca <- readRDS(file.path(dataDir, "pcaSynthetic.RDS")) +## Load the known ancestry for the demo 1KG reference profiles +data(demoKnownSuperPop1KG) -## The known ancestry for the 1KG reference profiles -refKnownSuperPop <- readRDS(file.path(dataDir, "knownSuperPop1KG.RDS")) +## Path to the demo Profile GDS file is located in this package +dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") ## Open the Profile GDS file gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) +# The name of the synthetic study +studyID <- "MYDATA.Synthetic" + ## Projects synthetic profiles on 1KG PCA -results <- computeKNNRefSynthetic(gdsProfile=gdsProfile, listEigenvector=pca, +results <- computeKNNRefSynthetic(gdsProfile=gdsProfile, + listEigenvector=demoPCASyntheticProfiles, listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), studyIDSyn=studyID, - spRef=refKnownSuperPop) + spRef=demoKnownSuperPop1KG) -## The inferred ancestry for the synthetic profiles for differents values +## The inferred ancestry for the synthetic profiles for different values ## of D and K head(results$matKNN) diff --git a/man/computeKNNSuperPopSample.Rd b/man/computeKNNSuperPopSample.Rd deleted file mode 100644 index 57ef92610..000000000 --- a/man/computeKNNSuperPopSample.Rd +++ /dev/null @@ -1,46 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/processStudy_internal.R -\encoding{UTF-8} -\name{computeKNNSuperPopSample} -\alias{computeKNNSuperPopSample} -\title{Deprecated} -\usage{ -computeKNNSuperPopSample( - gdsSample, - listEigenvector, - name.id, - spRef, - studyIDRef = "Ref.1KG", - kList = seq_len(15), - pcaList = 2:15 -) -} -\arguments{ -\item{listEigenvector}{TODO see return of computePCARefSample} - -\item{spRef}{TODO} - -\item{kList}{a \code{vector} of \code{integer} representing the list of -values tested for the _K_ parameter. The _K_ parameter represents the -number of neighbors used in the K-nearest neighbor analysis. If \code{NULL}, -the value \code{seq_len(15)} is assigned. -Default: \code{seq_len(15)}.} - -\item{pcaList}{TODO array of the pca dimension possible values} - -\item{sample.ref}{TODO} - -\item{study.annot}{a \code{data.frame} with one entry from study.annot in -the gds} -} -\value{ -A \code{list} TODO with the sample.id and eigenvectors -and a table with KNN callfor different K and pca dimension. -} -\description{ -Deprecated -} -\author{ -Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -} -\keyword{internal} diff --git a/man/computeKNNSuperPoprSynthetic.Rd b/man/computeKNNSuperPoprSynthetic.Rd deleted file mode 100644 index d4f087ac5..000000000 --- a/man/computeKNNSuperPoprSynthetic.Rd +++ /dev/null @@ -1,45 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/processStudy_internal.R -\encoding{UTF-8} -\name{computeKNNSuperPoprSynthetic} -\alias{computeKNNSuperPoprSynthetic} -\title{Deprecated} -\usage{ -computeKNNSuperPoprSynthetic( - listEigenvector, - sample.ref, - study.annot, - spRef, - kList = seq_len(15), - pcaList = seq(2, 15, 1) -) -} -\arguments{ -\item{listEigenvector}{TODO see return of computePCAsynthetic} - -\item{sample.ref}{TODO} - -\item{study.annot}{a \code{data.frame} with one entry from study.annot in -the gds} - -\item{spRef}{TODO} - -\item{kList}{a \code{vector} of \code{integer} representing the list of -values tested for the K parameter. The K parameter represents the -number of neighbors used in the K-nearest neighbors analysis. If -\code{NULL}, the value \code{seq_len(15)} is assigned. -Default: \code{seq_len(15)}.} - -\item{pcaList}{TODO array of the pca dimension possible values} -} -\value{ -A \code{list} TODO with the sample.id and eigenvectors -and a table with KNN callfor different K and pca dimension. -} -\description{ -Deprecated -} -\author{ -Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -} -\keyword{internal} diff --git a/man/computeLOHBlocksDNAChr.Rd b/man/computeLOHBlocksDNAChr.Rd index 9a21e5076..d1bf63705 100644 --- a/man/computeLOHBlocksDNAChr.Rd +++ b/man/computeLOHBlocksDNAChr.Rd @@ -5,70 +5,80 @@ \alias{computeLOHBlocksDNAChr} \title{Identify regions of LOH on one chromosome using homozygote SNVs} \usage{ -computeLOHBlocksDNAChr(gdsReference, chrInfo, snp.pos, chr, genoN = 1e-04) +computeLOHBlocksDNAChr(gdsReference, chrInfo, snpPos, chr, genoN = 1e-04) } \arguments{ \item{gdsReference}{an object of class \code{\link[SNPRelate:SNPGDSFileClass]{SNPRelate::SNPGDSFileClass}}, an -opened 1KG GDS file.} +opened Reference GDS file.} \item{chrInfo}{a \code{vector} of \code{integer} representing the length of the chromosomes. As an example, the information ca be obtained from package 'BSgenome.Hsapiens.UCSC.hg38'.} -\item{snp.pos}{a \code{data.frame} containing the SNV information for the +\item{snpPos}{a \code{data.frame} containing the SNV information for the chromosome specified by the \code{chr} argument. The \code{data.frame} must contain: -\itemize{ -\item{cnt.tot} {a single \code{integer} representing the total coverage for +\describe{ +\item{cnt.tot}{ a single \code{integer} representing the total coverage for the SNV.} -\item{cnt.ref} {a single \code{integer} representing the coverage for +\item{cnt.ref}{ a single \code{integer} representing the coverage for the reference allele.} -\item{cnt.alt} {a single \code{integer} representing the coverage for +\item{cnt.alt}{ a single \code{integer} representing the coverage for the alternative allele.} -\item{snp.pos} {a single \code{integer} representing the SNV position.} -\item{snp.chr} {a single \code{integer} representing the SNV chromosome.} -\item{normal.geno} {a single \code{numeric} indicating the genotype of the +\item{snp.pos}{ a single \code{integer} representing the SNV position.} +\item{snp.chr}{ a single \code{integer} representing the SNV chromosome.} +\item{normal.geno}{ a single \code{numeric} indicating the genotype of the SNV. The possibles are: \code{0} (wild-type homozygote), \code{1} (heterozygote), \code{2} (altenative homozygote), \code{3} indicating that the normal genotype is unknown.} -\item{pruned} {a \code{logical} indicating if the SNV is retained after +\item{pruned}{ a \code{logical} indicating if the SNV is retained after pruning} -\item{snp.index} {a \code{integer} representing the index position of the -SNV in the 1KG GDS file that contains all SNVs} -\item{keep} {a \code{logical} indicating if the genotype exists for the SNV} -\item{hetero} {a \code{logical} indicating if the SNV is heterozygote} -\item{homo} {a \code{logical} indicating if the SNV is homozygote} +\item{snp.index}{ a \code{integer} representing the index position of the +SNV in the Reference GDS file that contains all SNVs} +\item{keep}{ a \code{logical} indicating if the genotype exists for the SNV} +\item{hetero}{ a \code{logical} indicating if the SNV is heterozygote} +\item{homo}{ a \code{logical} indicating if the SNV is homozygote} }} \item{chr}{a single positive \code{integer} for the current chromosome. The \code{chrInfo} parameter must contain the value for the specified chromosome.} -\item{genoN}{a single \code{numeric} between 0 and 1 representing TODO. -Default: \code{0.0001}.} +\item{genoN}{a single \code{numeric} between 0 and 1 representing the +probability of sequencing error. Default: \code{0.0001}.} } \value{ a \code{data.frame} with the informations about LOH on a specific chromosome. The \code{data.frame} contains those columns: -\itemize{ -\item{chr} {a \code{integer} representing the current chromosome} -\item{start} {a \code{integer} representing the starting position on the +\describe{ +\item{chr}{ a \code{integer} representing the current chromosome} +\item{start}{ a \code{integer} representing the starting position on the box containing only homozygote SNVs (or not SNV). The first box starts at -position 1.} -\item{end} {a \code{integer} representing the end position on the +position \code{1}.} +\item{end}{ a \code{integer} representing the end position on the box containing only homozygote SNVs (or not SNV). The last box ends at the length of the chromosome.} -\item{logLHR} {TODO} -\item{LH1} {TODO} -\item{LM1} {TODO} -\item{homoScore} {a \code{numeric} representing \code{LH1} - \code{LM1}} -\item{nbSNV} {a \code{integer} representing th number of SNVs in +\item{logLHR}{ a \code{numeric} representing the LOH score basde on +population frequencies. It is the sum of +the log10 of the frequencies of the observed gegenotype minus the +the sum of the log10 of the higher frequent genotype. +(-100 when normal genotype are present)} +\item{LH1}{ a \code{numeric} representing the probability to be +heterozygote based on the coverage of each allele when normal +genotype is present} +\item{LM1}{ a \code{numeric} representing the max probability +for the read coverage at the position} +\item{homoScore}{ a \code{numeric} representing \code{LH1} - \code{LM1}} +\item{nbSNV}{ a \code{integer} representing th number of SNVs in the box} -\item{nbPruned} {a \code{integer} representing th number of pruned SNVs in +\item{nbPruned}{ a \code{integer} representing the number of pruned SNVs in the box} -\item{nbNorm} {TODO} -\item{LOH} {TODO} +\item{nbNorm}{ a \code{integer} representing of the number of +heterozygote genotypes for the normal SNVs in the block} +\item{LOH}{ a \code{integer} representing a flag, if \code{1} it means +the block is satisfying the criteria to be LOH. The value is not assigned +in this function; the value \code{0} is assigned} } } \description{ @@ -78,46 +88,50 @@ chromosome using the homozygote SNVs present on the chromosome. \examples{ ## Required library for GDS -library(gdsfmt) +library(SNPRelate) -## Path to the demo 1KG GDS file is located in this package +## Path to the demo Reference GDS file is located in this package dataDir <- system.file("extdata/tests", package="RAIDS") -fileGDS <- file.path(dataDir, "ex1_good_small_1KG_GDS.gds") +fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") -## Open the reference GDS file (demo version) +## Open the Reference GDS file (demo version) gds1KG <- snpgdsOpen(fileGDS) -## Chromosome length information -## chr23 is chrX, chr24 is chrY and chrM is 25 -chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L, - 181538259L, 170805979L, 159345973L, 145138636L, 138394717L, 133797422L, - 135086622L, 133275309L, 114364328L, 107043718L, 101991189L, 90338345L, - 83257441L, 80373285L, 58617616L, 64444167L, 46709983L, 50818468L, - 156040895L, 57227415L, 16569L) +## Required library for this example to run correctly +if (requireNamespace("GenomeInfoDb", quietly=TRUE) && + requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { -## Data frame with SNV information for the specified chromosome (chr 1) -snpInfo <- data.frame(cnt.tot=c(41, 17, 27, 15, 11, 37, 16, 32), - cnt.ref=c(40, 17, 27, 15, 4, 14, 16, 32), - cnt.alt=c(0, 0, 0, 0, 7, 23, 0, 0), - snp.pos=c(3722256, 3722328, 3767522, 3868160, 3869467, 4712655, - 6085318, 6213145), - snp.chr=c(rep(1, 8)), - normal.geno=c(rep(3, 8)), pruned=c(TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, - TRUE, TRUE), - pruned=c(TRUE, TRUE, FALSE, TRUE, FALSE, rep(TRUE, 3)), - snp.index=c(160, 162, 204, 256, 259, 288, 366, 465), - keep=rep(TRUE, 8), hetero=c(rep(FALSE, 4), TRUE, TRUE, rep(FALSE, 2)), - homo=c(rep(TRUE, 4), FALSE, FALSE, TRUE, TRUE), - stringAsFactor=FALSE) + ## Chromosome length information + ## chr23 is chrX, chr24 is chrY and chrM is 25 + chrInfo <- + GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] -## The function returns a data frame containing the information about the -## LOH regions in the specified chromosome -result <- RAIDS:::computeLOHBlocksDNAChr(gdsReference=gds1KG, - chrInfo=chrInfo, snp.pos=snpInfo, chr=1L, genoN=0.0001) -head(result) + ## Data frame with SNV information for the specified chromosome (chr 1) + snpInfo <- data.frame(cnt.tot=c(41, 17, 27, 15, 11, 37, 16, 32), + cnt.ref=c(40, 17, 27, 15, 4, 14, 16, 32), + cnt.alt=c(0, 0, 0, 0, 7, 23, 0, 0), + snp.pos=c(3722256, 3722328, 3767522, 3868160, 3869467, 4712655, + 6085318, 6213145), + snp.chr=c(rep(1, 8)), + normal.geno=c(rep(3, 8)), pruned=c(TRUE, TRUE, FALSE, TRUE, FALSE, + TRUE, TRUE, TRUE), + pruned=c(TRUE, TRUE, FALSE, TRUE, FALSE, rep(TRUE, 3)), + snp.index=c(160, 162, 204, 256, 259, 288, 366, 465), + keep=rep(TRUE, 8), hetero=c(rep(FALSE, 4), TRUE, + TRUE, rep(FALSE, 2)), + homo=c(rep(TRUE, 4), FALSE, FALSE, TRUE, TRUE), + stringAsFactor=FALSE) -## Close GDS file (important) -closefn.gds(gds1KG) + ## The function returns a data frame containing the information about + ## the LOH regions in the specified chromosome + result <- RAIDS:::computeLOHBlocksDNAChr(gdsReference=gds1KG, + chrInfo=chrInfo, snpPos=snpInfo, chr=1L, genoN=0.0001) + head(result) + + ## Close Reference GDS file (important) + closefn.gds(gds1KG) + +} } \author{ diff --git a/man/computePCAForSamples.Rd b/man/computePCAForSamples.Rd deleted file mode 100644 index 08fd6625a..000000000 --- a/man/computePCAForSamples.Rd +++ /dev/null @@ -1,40 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/processStudy_internal.R -\encoding{UTF-8} -\name{computePCAForSamples} -\alias{computePCAForSamples} -\title{Deprecated - Project patients onto existing principal component -axes (PCA)} -\usage{ -computePCAForSamples(gds, pathProfileGDS, listSamples, np = 1L) -} -\arguments{ -\item{gds}{an object of class -\code{\link[SNPRelate:SNPGDSFileClass]{SNPRelate::SNPGDSFileClass}}, a SNP -GDS file.} - -\item{pathProfileGDS}{the path of an object of class \code{gds} related to -the sample} - -\item{listSamples}{a \code{vector} of string representing the samples for -which compute the PCA.} - -\item{np}{a single positive \code{integer} representing the number of -threads. Default: \code{1L}.} -} -\value{ -The integer \code{0L} when successful. -} -\description{ -This function calculates the patient eigenvectors using -the specified SNP loadings. Deprecated -} -\details{ -More information about the method used to calculate the patient eigenvectors -can be found at the Bioconductor SNPRelate website: -https://bioconductor.org/packages/SNPRelate/ -} -\author{ -Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -} -\keyword{internal} diff --git a/man/computePCAMultiSynthetic.Rd b/man/computePCAMultiSynthetic.Rd index 70da79def..d859c4959 100644 --- a/man/computePCAMultiSynthetic.Rd +++ b/man/computePCAMultiSynthetic.Rd @@ -37,13 +37,13 @@ to show how the different steps in the function. Default: \code{FALSE}.} } \value{ a \code{list} containing 3 entries: -\itemize{ -\item{sample.id} { a \code{vector} of \code{character} strings representing +\describe{ +\item{sample.id}{ a \code{vector} of \code{character} strings representing the identifiers of the synthetic profiles that have been projected onto the 1KG PCA. } -\item{eigenvector.ref} { a \code{matrix} of \code{numeric} with the +\item{eigenvector.ref}{ a \code{matrix} of \code{numeric} with the eigenvectors of the 1KG reference profiles used to generate the PCA.} -\item{eigenvector} { a \code{matrix} of \code{numeric} with the +\item{eigenvector}{ a \code{matrix} of \code{numeric} with the eigenvectors of the synthetic profiles projected onto the 1KG PCA. } } } @@ -58,6 +58,9 @@ been removed from the set of reference profiles. ## Required library library(gdsfmt) +## Loading demo PCA on subset of 1KG reference dataset +data(demoPCA1KG) + ## Path to the demo Profile GDS file is located in this package dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") @@ -73,14 +76,12 @@ names(samplesRM) <- c("GBR", "FIN", "CHS","PUR", "CDX", "CLM", "IBS", "PEL", "PJL", "KHV", "ACB", "GWD", "ESN", "BEB", "MSL", "STU", "ITU", "CEU", "YRI", "CHB", "JPT", "LWK", "ASW", "MXL", "TSI", "GIH") -## The PCA on the 1KG reference profiles -pca <- readRDS(file.path(dataDir, "pca1KG.RDS")) - ## Open the Profile GDS file gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) ## Projects synthetic profiles on 1KG PCA -results <- computePCAMultiSynthetic(gdsProfile=gdsProfile, listPCA=pca, +results <- computePCAMultiSynthetic(gdsProfile=gdsProfile, + listPCA=demoPCA1KG, sampleRef=samplesRM, studyIDSyn=studyID, verbose=FALSE) ## The eigenvectors for the synthetic profiles diff --git a/man/computePCARefRMMulti.Rd b/man/computePCARefRMMulti.Rd index f8205f908..5f5d84ce3 100644 --- a/man/computePCARefRMMulti.Rd +++ b/man/computePCARefRMMulti.Rd @@ -53,7 +53,7 @@ printed.} } \value{ a \code{list} containing 2 entries: -\itemize{ +\describe{ \item{pruned}{ a \code{vector} of SNV identifiers specifying selected SNVs for the PCA analysis.} \item{pca.unrel}{ a \code{snpgdsPCAClass} object containing the eigenvalues @@ -67,8 +67,11 @@ for a list of pruned SNVs present in a Profile GDS file. The } \examples{ -## Path to the demo Profile GDS file is located in this package -dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") +## Required library +library(SNPRelate) + +## Load the known ancestry for the demo 1KG reference profiles +data(demoKnownSuperPop1KG) # The name of the synthetic study studyID <- "MYDATA.Synthetic" @@ -81,8 +84,8 @@ samplesRM <- c("HG00246", "HG00325", "HG00611", "HG01173", "HG02165", "NA12751", "NA19107", "NA18548", "NA19075", "NA19475", "NA19712", "NA19731", "NA20528", "NA20908") -## The known ancestry for the 1KG reference profiles -refKnownSuperPop <- readRDS(file.path(dataDir, "knownSuperPop1KG.RDS")) +## Path to the demo Profile GDS file is located in this package +dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") ## Open the Profile GDS file gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) @@ -90,7 +93,7 @@ gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) ## Compute PCA for the 1KG reference profiles excluding ## the profiles used to generate the synthetic profiles results <- RAIDS:::computePCARefRMMulti(gdsProfile=gdsProfile, - refProfileIDs=names(refKnownSuperPop), listRM=samplesRM, np=1L, + refProfileIDs=names(demoKnownSuperPop1KG), listRM=samplesRM, np=1L, algorithm="exact", eigenCount=32L, missingRate=0.025, verbose=FALSE) ## The PCA on the pruned SNVs data set for selected profiles diff --git a/man/computePCARefSample.Rd b/man/computePCARefSample.Rd index babcdfe64..476afc54e 100644 --- a/man/computePCARefSample.Rd +++ b/man/computePCARefSample.Rd @@ -51,12 +51,12 @@ to show how the different steps in the function. Default: \code{FALSE}.} } \value{ a \code{list} containing 3 entries: -\itemize{ -\item{\code{sample.id}} { a \code{character} string representing the unique +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique identifier of the analyzed profile.} -\item{\code{eigenvector.ref}} { a \code{matrix} of \code{numeric} +\item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric} representing the eigenvectors of the reference profiles. } -\item{\code{eigenvector}} { a \code{matrix} of \code{numeric} representing +\item{\code{eigenvector}}{ a \code{matrix} of \code{numeric} representing the eigenvectors of the analyzed profile. } } } diff --git a/man/computePCAsynthetic.Rd b/man/computePCAsynthetic.Rd deleted file mode 100644 index a168afc32..000000000 --- a/man/computePCAsynthetic.Rd +++ /dev/null @@ -1,56 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/processStudy_internal.R -\encoding{UTF-8} -\name{computePCAsynthetic} -\alias{computePCAsynthetic} -\title{Deprecated Function} -\usage{ -computePCAsynthetic( - gdsSample, - pruned, - sample.id, - sample.ref, - study.annot, - algorithm = "exact", - eigen.cnt = 32L -) -} -\arguments{ -\item{gdsSample}{an object of class \code{gds} opened related to -the sample} - -\item{pruned}{TODO} - -\item{sample.id}{TODO} - -\item{sample.ref}{TODO} - -\item{study.annot}{a \code{data.frame} with one entry from study.annot in -the gds} - -\item{algorithm}{a \code{character} string representing the algorithm used -to calculate the PCA. The 2 choices are "exact" (traditional exact -calculation) and "randomized" (fast PCA with randomized algorithm -introduced in Galinsky et al. 2016). Default: \code{"exact"}.} - -\item{eigen.cnt}{a single \code{integer} indicating the number of -eigenvectors that will be in the output of the \link[SNPRelate]{snpgdsPCA} -function; if 'eigen.cnt' <= 0, then all eigenvectors are returned. -Default: \code{32L}.} -} -\value{ -A \code{list} TODO with the sample.id and eigenvectors. -} -\description{ -Deprecated -} -\references{ -Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, -Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution -of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. -doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. -} -\author{ -Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -} -\keyword{internal} diff --git a/man/computePoolSyntheticAncestry.Rd b/man/computePoolSyntheticAncestry.Rd deleted file mode 100644 index 17bee4431..000000000 --- a/man/computePoolSyntheticAncestry.Rd +++ /dev/null @@ -1,107 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/processStudy.R -\encoding{UTF-8} -\name{computePoolSyntheticAncestry} -\alias{computePoolSyntheticAncestry} -\title{TODO} -\usage{ -computePoolSyntheticAncestry( - gdsReference, - gdsSample, - profileID, - dataRef, - spRef, - studyIDSyn, - np = 1L, - listCatPop = c("EAS", "EUR", "AFR", "AMR", "SAS"), - fieldPopIn1KG = "superPop", - fieldPopInfAnc = "SuperPop", - kList = seq(2, 15, 1), - pcaList = seq(2, 15, 1), - algorithm = "exact", - eigenCount = 32L, - missingRate = 0.025 -) -} -\arguments{ -\item{gdsReference}{an object of class \link[gdsfmt]{gds.class} (a GDS -file), the opened Reference GDS file.} - -\item{gdsSample}{an object of class \link[gdsfmt]{gds.class} (a GDS file), -an opened Profile GDS file.} - -\item{profileID}{a single \code{character} string representing the -profile identifier.} - -\item{dataRef}{a \code{data.frame} containing the information of the -synthetic profiles that will be} - -\item{spRef}{TODO} - -\item{studyIDSyn}{a \code{character} string corresponding to the study -identifier. The study identifier must be present in the Profile GDS file.} - -\item{np}{a single positive \code{integer} representing the number of -threads. Default: \code{1L}.} - -\item{listCatPop}{a \code{vector} of \code{character} string -representing the list of possible ancestry assignations. Default: -\code{("EAS", "EUR", "AFR", "AMR", "SAS")}.} - -\item{fieldPopIn1KG}{a \code{character} string representing TODO . -Default: \code{"superPop"}.} - -\item{fieldPopInfAnc}{a \code{character} string representing the name of -the column that will contain the inferred ancestry for the specified -dataset. Default: \code{"SuperPop"}.} - -\item{kList}{a \code{vector} of \code{integer} representing the list of -values tested for the _K_ parameter. The _K_ parameter represents the -number of neighbors used in the K-nearest neighbors analysis. If -\code{NULL}, the value \code{seq(2,15,1)} is assigned. -Default: \code{seq(2,15,1)}.} - -\item{pcaList}{a \code{vector} of \code{integer} representing the list of -values tested for the _D_ parameter. The _D_ parameter represents the -number of dimensions used in the PCA analysis. If \code{NULL}, -the value \code{seq(2,15,1)} is assigned. -Default: \code{seq(2,15,1)}.} - -\item{algorithm}{a \code{character} string representing the algorithm used -to calculate the PCA. The 2 choices are "exact" (traditional exact -calculation) and "randomized" (fast PCA with randomized algorithm -introduced in Galinsky et al. 2016). Default: \code{"exact"}.} - -\item{eigenCount}{a single \code{integer} indicating the number of -eigenvectors that will be in the output of the \link[SNPRelate]{snpgdsPCA} -function; if 'eigenCount' <= 0, then all eigenvectors are returned. -Default: \code{32L}.} - -\item{missingRate}{a \code{numeric} value representing the threshold -missing rate at with the SNVs are discarded; the SNVs are retained in the -\link[SNPRelate]{snpgdsPCA} -with "<= missingRate" only; if \code{NaN}, no missing threshold. -Default: \code{0.025}.} -} -\value{ -A \code{list} TODO with the sample.id and eigenvectors -and a table with KNN callfor different K and pca dimension. -} -\description{ -TODO -} -\examples{ - -# TODO -listEigenvector <- "TOTO" - -} -\references{ -Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, -Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution -of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. -doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. -} -\author{ -Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -} diff --git a/man/computePoolSyntheticAncestryGr.Rd b/man/computePoolSyntheticAncestryGr.Rd index fbf43ba93..906dd9263 100644 --- a/man/computePoolSyntheticAncestryGr.Rd +++ b/man/computePoolSyntheticAncestryGr.Rd @@ -56,13 +56,13 @@ the column that will contain the inferred ancestry for the specified dataset. Default: \code{"SuperPop"}.} \item{kList}{a \code{vector} of \code{integer} representing the list of -values tested for the _K_ parameter. The _K_ parameter represents the +values tested for the \emph{K} parameter. The \emph{K} parameter represents the number of neighbors used in the K-nearest neighbor analysis. If \code{NULL}, the value \code{seq(2,15,1)} is assigned. Default: \code{seq(2,15,1)}.} \item{pcaList}{a \code{vector} of \code{integer} representing the list of -values tested for the _D_ parameter. The _D_ parameter represents the +values tested for the \emph{D} parameter. The \emph{D} parameter represents the number of dimensions used in the PCA analysis. If \code{NULL}, the value \code{seq(2,15,1)} is assigned. Default: \code{seq(2,15,1)}.} @@ -88,16 +88,16 @@ printed. Default: \code{FALSE}.} } \value{ a \code{list} containing the following entries: -\itemize{ -\item{sample.id}{a \code{vector} of \code{character} strings representing +\describe{ +\item{sample.id}{ a \code{vector} of \code{character} strings representing the identifiers of the synthetic profiles. } -\item{sample1Kg}{a \code{vector} of \code{character} strings representing +\item{sample1Kg}{ a \code{vector} of \code{character} strings representing the identifiers of the reference 1KG profiles used to generate the synthetic profiles. } -\item{sp}{a \code{vector} of \code{character} strings representing the +\item{sp}{ a \code{vector} of \code{character} strings representing the known ancestry for the reference 1KG profiles used to generate the synthetic profiles. } -\item{matKNN}{a \code{data.frame} containing 4 columns. The first column +\item{matKNN}{ a \code{data.frame} containing 4 columns. The first column 'sample.id' contains the name of the synthetic profile. The second column 'D' represents the dimension D used to infer the ancestry. The third column 'K' represents the number of neighbors K used to infer the ancestry. The @@ -118,8 +118,9 @@ analysis using a range of K and D values is done. ## Required library library(gdsfmt) -## Path to the demo Profile GDS file is located in this package -dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") +## Load the known ancestry for the demo 1KG reference profiles +data(demoKnownSuperPop1KG) + # The name of the synthetic study studyID <- "MYDATA.Synthetic" @@ -133,8 +134,8 @@ names(samplesRM) <- c("GBR", "FIN", "CHS","PUR", "CDX", "CLM", "IBS", "PEL", "PJL", "KHV", "ACB", "GWD", "ESN", "BEB", "MSL", "STU", "ITU", "CEU", "YRI", "CHB", "JPT", "LWK", "ASW", "MXL", "TSI", "GIH") -## The known ancestry for the 1KG reference profiles -refKnownSuperPop <- readRDS(file.path(dataDir, "knownSuperPop1KG.RDS")) +## Path to the demo Profile GDS file is located in this package +dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") ## Open the Profile GDS file gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) @@ -142,7 +143,8 @@ gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) ## Run a PCA analysis and a K-nearest neighbors analysis on a small set ## of synthetic data results <- computePoolSyntheticAncestryGr(gdsProfile=gdsProfile, - sampleRM=samplesRM, studyIDSyn=studyID, np=1L, spRef=refKnownSuperPop, + sampleRM=samplesRM, studyIDSyn=studyID, np=1L, + spRef=demoKnownSuperPop1KG, kList=seq(10,15,1), pcaList=seq(10,15,1), eigenCount=15L) ## The ancestry inference for the synthetic data using diff --git a/man/computePrunedPCARef.Rd b/man/computePrunedPCARef.Rd deleted file mode 100644 index 03d1a2fe0..000000000 --- a/man/computePrunedPCARef.Rd +++ /dev/null @@ -1,49 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/processStudy.R -\encoding{UTF-8} -\name{computePrunedPCARef} -\alias{computePrunedPCARef} -\title{Compute principal component axes (PCA) on pruned SNV with the -reference samples} -\usage{ -computePrunedPCARef(gdsProfile, listRef, np = 1L, verbose = FALSE) -} -\arguments{ -\item{gdsProfile}{an object of class -\code{\link[SNPRelate:SNPGDSFileClass]{SNPRelate::SNPGDSFileClass}}, a SNP -GDS file.} - -\item{listRef}{a \code{vector} of string representing the -identifier of the profiles in the reference (unrelated).} - -\item{np}{a single positive \code{integer} representing the number of -threads. Default: \code{1L}.} - -\item{verbose}{a \code{logical} indicating if the PCA functions should be -verbose. Default: \code{FALSE}.} -} -\value{ -listPCA a \code{list} containing two objects -pca.unrel -> \code{snpgdsPCAClass} -and a snp.load -> \code{snpgdsPCASNPLoading} -} -\description{ -This function compute the PCA on pruned SNV with the -reference samples -} -\details{ -More information about the method used to calculate the patient eigenvectors -can be found at the Bioconductor SNPRelate website: -https://bioconductor.org/packages/SNPRelate/ -} -\examples{ - -## Path to the demo pedigree file is located in this package -dataDir <- system.file("extdata", "RAIDS") - -## TODO - -} -\author{ -Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -} diff --git a/man/computeSyntheticConfMat.Rd b/man/computeSyntheticConfMat.Rd index c0cad93d7..ab9604d2c 100644 --- a/man/computeSyntheticConfMat.Rd +++ b/man/computeSyntheticConfMat.Rd @@ -17,7 +17,7 @@ computeSyntheticConfMat( } \arguments{ \item{matKNN}{a \code{data.frame} containing the inferred ancestry results -for fixed values of _D_ and _K_. The \code{data.frame} must contained +for fixed values of \emph{D} and \emph{K}. The \code{data.frame} must contained those columns: "sample.id", "D", "K" and the fourth column name must correspond to the \code{matKNNAncestryColumn} argument.} @@ -41,35 +41,37 @@ the list of possible ancestry assignations.} } \value{ \code{list} containing 2 entries: -\itemize{ -\item{confMat} { a \code{matrix} representing the confusion matrix } -\item{matAccuracy} { a \code{data.frame} containing the statistics +\describe{ +\item{confMat}{ a \code{matrix} representing the confusion matrix } +\item{matAccuracy}{ a \code{data.frame} containing the statistics associated to the confusion matrix} } } \description{ The function calculates the confusion matrix of the inferences -for fixed values of _D_ and _K_ using the inferred ancestry results done +for fixed values of \emph{D} and \emph{K} using the inferred ancestry results done on the synthetic profiles. } \examples{ -dataDirRes <- system.file("extdata/demoAncestryCall", package="RAIDS") +## Loading demo dataset containing pedigree information for synthetic +## profiles and known ancestry of the profiles used to generate the +## synthetic profiles +data(pedSynthetic) + +## Loading demo dataset containing the inferred ancestry results +## for the synthetic data +data(matKNNSynthetic) ## The inferred ancestry results for the synthetic data using ## values of D=6 and K=5 -matKNN <- readRDS(file.path(dataDirRes, "matKNN.RDS")) -matKNN <- matKNN[matKNN$K == 6 & matKNN$D == 5, ] - -## The known ancestry from the reference profiles used to generate the -## synthetic profiles -syntheticInfo <- readRDS(file.path(dataDirRes, "pedSyn.RDS")) +matKNN <- matKNNSynthetic[matKNNSynthetic$K == 6 & matKNNSynthetic$D == 5, ] ## Compile the confusion matrix using the ## synthetic profiles for fixed values of D and K values results <- RAIDS:::computeSyntheticConfMat(matKNN=matKNN, matKNNAncestryColumn="SuperPop", - pedCall=syntheticInfo, pedCallAncestryColumn="superPop", + pedCall=pedSynthetic, pedCallAncestryColumn="superPop", listCall=c("EAS", "EUR", "AFR", "AMR", "SAS")) results$confMat diff --git a/man/computeSyntheticROC.Rd b/man/computeSyntheticROC.Rd index 8ab8bfe88..e14b32ef6 100644 --- a/man/computeSyntheticROC.Rd +++ b/man/computeSyntheticROC.Rd @@ -17,7 +17,7 @@ computeSyntheticROC( } \arguments{ \item{matKNN}{a \code{data.frame} containing the inferred ancestry results -for fixed values of _D_ and _K_. On of the column names of the +for fixed values of \emph{D} and \emph{K}. On of the column names of the \code{data.frame} must correspond to the \code{matKNNAncestryColumn} argument.} @@ -30,7 +30,8 @@ argument.} \item{pedCall}{a \code{data.frame} containing the information about the super-population information from the 1KG GDS file for profiles used to generate the synthetic profiles. The \code{data.frame} -must contained a column named as the \code{pedCallAncestryColumn} argument.} +must contained a column named as the \code{pedCallAncestryColumn} argument. +The row names must correspond to the sample identifiers (mandatory).} \item{pedCallAncestryColumn}{a \code{character} string representing the name of the column that contains the known ancestry for the reference @@ -43,7 +44,7 @@ Default: \code{c("EAS", "EUR", "AFR", "AMR", "SAS")}.} } \value{ \code{list} containing 3 entries: -\itemize{ +\describe{ \item{\code{matAUROC.All}}{ a \code{data.frame} containing the AUROC for all the ancestry results. } \item{\code{matAUROC.Call}}{ a \code{data.frame} containing the AUROC @@ -60,22 +61,24 @@ well as on all the results together. } \examples{ -dataDirRes <- system.file("extdata/demoAncestryCall", package="RAIDS") +## Loading demo dataset containing pedigree information for synthetic +## profiles and known ancestry of the profiles used to generate the +## synthetic profiles +data(pedSynthetic) + +## Loading demo dataset containing the inferred ancestry results +## for the synthetic data +data(matKNNSynthetic) ## The inferred ancestry results for the synthetic data using ## values of D=6 and K=5 -matKNN <- readRDS(file.path(dataDirRes, "matKNN.RDS")) -matKNN <- matKNN[matKNN$K == 6 & matKNN$D == 5, ] - -## The known ancestry from the reference profiles used to generate the -## synthetic profiles -syntheticInfo <- readRDS(file.path(dataDirRes, "pedSyn.RDS")) +matKNN <- matKNNSynthetic[matKNNSynthetic$K == 6 & matKNNSynthetic$D == 5, ] ## Compile statistics from the ## synthetic profiles for fixed values of D and K results <- RAIDS:::computeSyntheticROC(matKNN=matKNN, matKNNAncestryColumn="SuperPop", - pedCall=syntheticInfo, pedCallAncestryColumn="superPop", + pedCall=pedSynthetic, pedCallAncestryColumn="superPop", listCall=c("EAS", "EUR", "AFR", "AMR", "SAS")) results$matAUROC.All diff --git a/man/createAUROCGraph.Rd b/man/createAUROCGraph.Rd new file mode 100644 index 000000000..28efae9a5 --- /dev/null +++ b/man/createAUROCGraph.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/visualization.R +\encoding{UTF-8} +\name{createAUROCGraph} +\alias{createAUROCGraph} +\title{Generate accuracy graph} +\usage{ +createAUROCGraph( + dfAUROC, + title = "", + selectD = c(3, 7, 11), + selectColor = c("#5e688a", "#cd5700", "#CC79A7") +) +} +\arguments{ +\item{dfAUROC}{a \code{data.frame} corresponding to res$paraSample$dfAUROC +where res is the result of inferAncestry() or inferAncestryGeneAware() +functions.} + +\item{title}{a \code{character} string representing the title of the graph. +Default: \code{""}.} + +\item{selectD}{a \code{array} of \code{integer} representing the selected +PCA dimensions to plot. The length of the \code{array} cannot be more than +5 entries. The dimensions must tested by RAIDS (i.e. be present in the +RDS file). Default: \code{c(3,7,11)}.} + +\item{selectColor}{a \code{array} of \code{character} strings representing +the selected colors for the associated PCA dimensions to plot. The length +of the \code{array} must correspond to the length of the \code{selectD} +parameter. In addition, the length of the \code{array} cannot be more than +5 entries. +Default: \code{c("#5e688a", "#cd5700", "#CC79A7")}.} +} +\value{ +a \code{ggplot} object containing the graphic representation of the +accuracy for different values of PCA dimensions and K-neighbors through +all tested ancestries. +} +\description{ +This function extracts the required information from an +output generated by RAIDS to create a graphic representation of the +accuracy for different values of PCA dimensions and K-neighbors through +all tested ancestries. +} +\examples{ + +## Required library +library(ggplot2) + +## Path to RDS file with ancestry information generated by RAIDS (demo file) +dataDir <- system.file("extdata", package="RAIDS") +fileRDS <- file.path(dataDir, "TEST_01.infoCall.RDS") +info <- readRDS(fileRDS) +dfAUROC <- info$paraSample$dfAUROC + +## Some of the column names must be updated to fit new standards +colnames(dfAUROC) <- c("D", "K", "Call", "L", "AUROC", "H") + +## Create accuracy graph +accuracyGraph <- createAUROCGraph(dfAUROC=dfAUROC, title="Test 01", + selectD=c(3, 6, 9, 12, 15), + selectColor=c("steelblue", "darkorange", "violet", "pink", "gray40")) + +accuracyGraph + +} +\author{ +Astrid Deschênes and Pascal Belleau +} diff --git a/man/createAccuracyGraph.Rd b/man/createAccuracyGraph.Rd new file mode 100644 index 000000000..491d67509 --- /dev/null +++ b/man/createAccuracyGraph.Rd @@ -0,0 +1,65 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/visualization.R +\encoding{UTF-8} +\name{createAccuracyGraph} +\alias{createAccuracyGraph} +\title{Generate accuracy graph} +\usage{ +createAccuracyGraph( + fileRDS, + title = "", + selectD = c(3, 7, 11), + selectColor = c("#5e688a", "#cd5700", "#CC79A7") +) +} +\arguments{ +\item{fileRDS}{a \code{character} string representing the path and file +name of the RDS file containing the ancestry information as generated by +RAIDS.} + +\item{title}{a \code{character} string representing the title of the graph. +Default: \code{""}.} + +\item{selectD}{a \code{array} of \code{integer} representing the selected +PCA dimensions to plot. The length of the \code{array} cannot be more than +5 entries. The dimensions must tested by RAIDS (i.e. be present in the +RDS file). Default: \code{c(3,7,11)}.} + +\item{selectColor}{a \code{array} of \code{character} strings representing +the selected colors for the associated PCA dimensions to plot. The length +of the \code{array} must correspond to the length of the \code{selectD} +parameter. In addition, the length of the \code{array} cannot be more than +5 entries. +Default: \code{c("#5e688a", "#cd5700", "#CC79A7")}.} +} +\value{ +a \code{ggplot} object containing the graphic representation of the +accuracy for different values of PCA dimensions and K-neighbors through +all tested ancestries. +} +\description{ +This function extracts the required information from an +output generated by RAIDS to create a graphic representation of the +accuracy for different values of PCA dimensions and K-neighbors through +all tested ancestries. +} +\examples{ + +## Required library +library(ggplot2) + +## Path to RDS file with ancestry information generated by RAIDS (demo file) +dataDir <- system.file("extdata", package="RAIDS") +fileRDS <- file.path(dataDir, "TEST_01.infoCall.RDS") + +## Create accuracy graph +accuracyGraph <- createAccuracyGraph(fileRDS=fileRDS, title="Test 01", + selectD=c(3,6,9,12,15), + selectColor=c("steelblue", "darkorange", "violet", "pink", "gray80")) + +accuracyGraph + +} +\author{ +Astrid Deschênes and Pascal Belleau +} diff --git a/man/createProfile.Rd b/man/createProfile.Rd new file mode 100644 index 000000000..d07cc4e78 --- /dev/null +++ b/man/createProfile.Rd @@ -0,0 +1,149 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/processStudy_internal.R +\encoding{UTF-8} +\name{createProfile} +\alias{createProfile} +\title{Create the Profile GDS file(s) for one or multiple specific profiles +using the information from a RDS Sample description file and the 1KG +GDS file} +\usage{ +createProfile( + profileFile, + profileName, + filePedRDS = NULL, + pedStudy = NULL, + fileNameGDS, + batch = 1, + studyDF, + listProfiles = NULL, + pathProfileGDS = NULL, + genoSource = c("snp-pileup", "generic", "VCF", "bam"), + paramProfile = list(ScanBamParam = NULL, PileupParam = NULL, yieldSize = 1e+07), + verbose = FALSE +) +} +\arguments{ +\item{profileFile}{a \code{character} string representing the path to the +file: with genotype and the allele information of the profile A profile would have an +associated file called +if genoSource is "VCF", then "\emph{vcf.gz", +if genoSource is "generic", then "}.txt.gz" +if genoSource is "snp-pileup", then "\emph{.txt.gz". +if genoSource is "bam", then "}.bam" and "*.bai".} + +\item{profileName}{a \code{character} string representing the the profile Name.ID} + +\item{filePedRDS}{a \code{character} string representing the path to the +RDS file that contains the information about the sample to analyse. +The RDS file must +include a \code{data.frame} with those mandatory columns: "Name.ID", +"Case.ID", "Sample.Type", "Diagnosis", "Source". All columns must be in +\code{character} strings. The \code{data.frame} +must contain the information for all the samples passed in the +\code{listSamples} parameter. Only \code{filePedRDS} or \code{pedStudy} +can be defined.} + +\item{pedStudy}{a \code{data.frame} with those mandatory columns: "Name.ID", +"Case.ID", "Sample.Type", "Diagnosis", "Source". All columns must be in +\code{character} strings (no factor). The \code{data.frame} +must contain the information for all the samples passed in the +\code{listSamples} parameter. Only \code{filePedRDS} or \code{pedStudy} +can be defined.} + +\item{fileNameGDS}{a \code{character} string representing the file name of +the Reference GDS file. The file must exist.} + +\item{batch}{a single positive \code{integer} representing the current +identifier for the batch. Beware, this field is not stored anymore. +Default: \code{1}.} + +\item{studyDF}{a \code{data.frame} containing the information about the +study associated to the analysed sample(s). The \code{data.frame} must have +those 3 columns: "study.id", "study.desc", "study.platform". All columns +must be in \code{character} strings (no factor).} + +\item{listProfiles}{a \code{vector} of \code{character} string corresponding +to the profile identifiers that will have a Profile GDS file created. The +profile identifiers must be present in the "Name.ID" column of the Profile +RDS file passed to the \code{filePedRDS} parameter. +If \code{NULL}, all profiles present in the \code{filePedRDS} are selected. +Default: \code{NULL}.} + +\item{pathProfileGDS}{a \code{character} string representing the path to +the directory where the Profile GDS files will be created. +Default: \code{NULL}.} + +\item{genoSource}{a \code{character} string with two possible values: +'snp-pileup', 'generic' or 'VCF'. It specifies if the genotype files +are generated by snp-pileup (Facets) or are a generic format CSV file +with at least those columns: +'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. +The 'Count' is the depth at the specified position; +'FileR' is the depth of the reference allele and +'File1A' is the depth of the specific alternative allele. +Finally the file can be a VCF file with at least those genotype +fields: GT, AD, DP.} + +\item{verbose}{a \code{logical} indicating if message information should be +printed. Default: \code{FALSE}.} +} +\value{ +The function returns \code{0L} when successful. +} +\description{ +The function uses the information for the Reference GDS file +and the RDS Sample Description file to create the Profile GDS file. One +Profile GDS file is created per profile. One Profile GDS file will be +created for each entry present in the \code{listProfiles} parameter. +} +\examples{ + +## Path to the demo 1KG GDS file is located in this package +dataDir <- system.file("extdata/tests", package="RAIDS") +fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") + +## The data.frame containing the information about the study +## The 3 mandatory columns: "study.id", "study.desc", "study.platform" +## The entries should be strings, not factors (stringsAsFactors=FALSE) +studyDF <- data.frame(study.id = "MYDATA", + study.desc = "Description", + study.platform = "PLATFORM", + stringsAsFactors = FALSE) + +## The data.frame containing the information about the samples +## The entries should be strings, not factors (stringsAsFactors=FALSE) +samplePED <- data.frame(Name.ID=c("ex1"), + Case.ID=c("Patient_h11"), + Diagnosis=rep("Cancer"), + Sample.Type=c("Primary Tumor"), + Source=c("Databank B"), stringsAsFactors=FALSE, + drop=FALSE) +rownames(samplePED) <- samplePED$Name.ID + +## Create the Profile GDS File for samples in 'listSamples' vector +## (in this case, samples "ex1") +## The Profile GDS file is created in the pathProfileGDS directory +result <- RAIDS:::createProfile(profileFile=file.path(dataDir, "ex1.txt.gz"), + profileName="ex1", + pedStudy=samplePED, fileNameGDS=fileGDS, + studyDF=studyDF, listProfiles=c("ex1"), + pathProfileGDS=tempdir(), + genoSource="snp-pileup", + verbose=FALSE) + +## The function returns OL when successful +result + +## The Profile GDS file 'ex1.gds' has been created in the +## specified directory +list.files(tempdir()) + +## Remove Profile GDS file (created for demo purpose) +unlink(file.path(tempdir(), "ex1.gds"), force=TRUE) + + +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} +\keyword{internal} diff --git a/man/createStudy2GDS1KG.Rd b/man/createStudy2GDS1KG.Rd index dcc460070..e8f1d8f60 100644 --- a/man/createStudy2GDS1KG.Rd +++ b/man/createStudy2GDS1KG.Rd @@ -16,7 +16,7 @@ createStudy2GDS1KG( studyDF, listProfiles = NULL, pathProfileGDS = NULL, - genoSource = c("snp-pileup", "generic"), + genoSource = c("snp-pileup", "generic", "VCF"), verbose = FALSE ) } @@ -25,7 +25,10 @@ createStudy2GDS1KG( directory containing the VCF output of SNP-pileup for each sample. The SNP-pileup files must be compressed (gz files) and have the name identifiers of the samples. A sample with "Name.ID" identifier would have an -associated SNP-pileup file called "Name.ID.txt.gz".} +associated file called +if genoSource is "VCF", then "Name.ID.vcf.gz", +if genoSource is "generic", then "Name.ID.generic.txt.gz" +if genoSource is "snp-pileup", then "Name.ID.txt.gz".} \item{filePedRDS}{a \code{character} string representing the path to the RDS file that contains the information about the sample to analyse. @@ -45,7 +48,7 @@ must contain the information for all the samples passed in the can be defined.} \item{fileNameGDS}{a \code{character} string representing the file name of -the 1KG GDS file. The file must exist.} +the Reference GDS file. The file must exist.} \item{batch}{a single positive \code{integer} representing the current identifier for the batch. Beware, this field is not stored anymore. @@ -68,13 +71,15 @@ the directory where the Profile GDS files will be created. Default: \code{NULL}.} \item{genoSource}{a \code{character} string with two possible values: -'snp-pileup' or 'generic'. It specifies if the genotype files +'snp-pileup', 'generic' or 'VCF'. It specifies if the genotype files are generated by snp-pileup (Facets) or are a generic format CSV file with at least those columns: 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. The 'Count' is the depth at the specified position; 'FileR' is the depth of the reference allele and -'File1A' is the depth of the specific alternative allele.} +'File1A' is the depth of the specific alternative allele. +Finally the file can be a VCF file with at least those genotype +fields: GT, AD, DP.} \item{verbose}{a \code{logical} indicating if message information should be printed. Default: \code{FALSE}.} @@ -83,16 +88,16 @@ printed. Default: \code{FALSE}.} The function returns \code{0L} when successful. } \description{ -The function uses the information for the 1KG GDS file and the -RDS Sample Description file to create the Profile GDS file. One Profile GDS -file is created per profile. One Profile GDS file will be created for each -entry present in the \code{listProfiles} parameter. +The function uses the information for the Reference GDS file +and the RDS Sample Description file to create the Profile GDS file. One +Profile GDS file is created per profile. One Profile GDS file will be +created for each entry present in the \code{listProfiles} parameter. } \examples{ ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata/tests", package="RAIDS") -fileGDS <- file.path(dataDir, "ex1_good_small_1KG_GDS.gds") +fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") ## The data.frame containing the information about the study ## The 3 mandatory columns: "study.id", "study.desc", "study.platform" @@ -111,13 +116,13 @@ samplePED <- data.frame(Name.ID=c("ex1", "ex2"), Source=rep("Databank B", 2), stringsAsFactors=FALSE) rownames(samplePED) <- samplePED$Name.ID -## Create the Profile GDS File for sample in listSamples vector +## Create the Profile GDS File for samples in 'listSamples' vector ## (in this case, samples "ex1") ## The Profile GDS file is created in the pathProfileGDS directory result <- createStudy2GDS1KG(pathGeno=dataDir, pedStudy=samplePED, fileNameGDS=fileGDS, studyDF=studyDF, listProfiles=c("ex1"), - pathProfileGDS=dataDir, + pathProfileGDS=tempdir(), genoSource="snp-pileup", verbose=FALSE) @@ -126,10 +131,11 @@ result ## The Profile GDS file 'ex1.gds' has been created in the ## specified directory -list.files(dataDir) +list.files(tempdir()) + +## Remove Profile GDS file (created for demo purpose) +unlink(file.path(tempdir(), "ex1.gds"), force=TRUE) -## Unlink Profile GDS file (created for demo purpose) -unlink(file.path(dataDir, "ex1.gds")) } \author{ diff --git a/man/demoKnownSuperPop1KG.Rd b/man/demoKnownSuperPop1KG.Rd new file mode 100644 index 000000000..055ac3e5c --- /dev/null +++ b/man/demoKnownSuperPop1KG.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RAIDS.R +\docType{data} +\name{demoKnownSuperPop1KG} +\alias{demoKnownSuperPop1KG} +\title{The known super population ancestry of the demo 1KG reference profiles.} +\format{ +The \code{vector} containing the know super population ancestry +for the demo 1KG reference profiles. +} +\usage{ +data(demoKnownSuperPop1KG) +} +\value{ +The \code{vector} containing the know super population ancestry +for the demo 1KG reference profiles. +} +\description{ +The object is a \code{vector}. +} +\details{ +This object can be +used to test the \code{\link{computeKNNRefSynthetic}} and +\code{\link{computePoolSyntheticAncestryGr}} functions. +} +\examples{ + +## Required library +library(gdsfmt) + +## Load the demo PCA on the synthetic profiles projected on the +## demo 1KG reference PCA +data(demoPCASyntheticProfiles) + +## Load the known ancestry for the demo 1KG reference profiles +data(demoKnownSuperPop1KG) + +## Path to the demo Profile GDS file is located in this package +dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") + +## Open the Profile GDS file +gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) + +# The name of the synthetic study +studyID <- "MYDATA.Synthetic" + +## Projects synthetic profiles on 1KG PCA +results <- computeKNNRefSynthetic(gdsProfile=gdsProfile, + listEigenvector=demoPCASyntheticProfiles, + listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), studyIDSyn=studyID, + spRef=demoKnownSuperPop1KG) + +## The inferred ancestry for the synthetic profiles for different values +## of D and K +head(results$matKNN) + +## Close Profile GDS file (important) +closefn.gds(gdsProfile) + +} +\seealso{ +\describe{ +\item{\code{\link{computeKNNRefSynthetic}}}{ for running a k-nearest +neighbors analysis on a subset of the synthetic data set.} +\item{\code{\link{computePoolSyntheticAncestryGr}}}{ for running a +PCA analysis using 1 synthetic profile from each sub-continental +population.} +} +} +\keyword{datasets} diff --git a/man/demoPCA1KG.Rd b/man/demoPCA1KG.Rd new file mode 100644 index 000000000..325b2a276 --- /dev/null +++ b/man/demoPCA1KG.Rd @@ -0,0 +1,78 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RAIDS.R +\docType{data} +\name{demoPCA1KG} +\alias{demoPCA1KG} +\title{The PCA results of the demo 1KG reference dataset for demonstration purpose. +Beware that the PCA has been run on a very small subset of the +1KG reference dataset +and should not be used to call ancestry inference on a real profile.} +\format{ +The \code{list} containing the PCA results for a small subset of +the reference 1KG dataset. The \code{list} contains 2 entries: +\describe{ +\item{pruned}{ a \code{vector} of SNV identifiers specifying selected SNVs +for the PCA analysis.} +\item{pca.unrel}{ a \code{snpgdsPCAClass} object containing the eigenvalues +as generated by \link[SNPRelate]{snpgdsPCA} function.} +} +} +\usage{ +data(demoPCA1KG) +} +\value{ +The \code{list} containing the PCA results for a small subset of +the reference 1KG dataset. The \code{list} contains 2 entries: +\describe{ +\item{pruned}{ a \code{vector} of SNV identifiers specifying selected SNVs +for the PCA analysis.} +\item{pca.unrel}{ a \code{snpgdsPCAClass} object containing the eigenvalues +as generated by \link[SNPRelate]{snpgdsPCA} function.} +} +} +\description{ +The object is a \code{list}. +} +\details{ +This object can be +used to test the \code{\link{computePCAMultiSynthetic}} function. +} +\examples{ + +## Required library +library(gdsfmt) + +## Loading demo PCA on subset of 1KG reference dataset +data(demoPCA1KG) + +## Path to the demo Profile GDS file is located in this package +dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") + +# The name of the synthetic study +studyID <- "MYDATA.Synthetic" + +samplesRM <- c("HG00246", "HG00325", "HG00611", "HG01173", "HG02165", + "HG01112", "HG01615", "HG01968", "HG02658", "HG01850", "HG02013", + "HG02465", "HG02974", "HG03814", "HG03445", "HG03689", "HG03789", + "NA12751", "NA19107", "NA18548", "NA19075", "NA19475", "NA19712", + "NA19731", "NA20528", "NA20908") +names(samplesRM) <- c("GBR", "FIN", "CHS","PUR", "CDX", "CLM", "IBS", + "PEL", "PJL", "KHV", "ACB", "GWD", "ESN", "BEB", "MSL", "STU", "ITU", + "CEU", "YRI", "CHB", "JPT", "LWK", "ASW", "MXL", "TSI", "GIH") + +## Open the Profile GDS file +gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) + +## Projects synthetic profiles on demo 1KG PCA +results <- computePCAMultiSynthetic(gdsProfile=gdsProfile, + listPCA=demoPCA1KG, sampleRef=samplesRM, studyIDSyn=studyID, + verbose=FALSE) + +## The eigenvectors for the synthetic profiles +head(results$eigenvector) + +## Close Profile GDS file (important) +closefn.gds(gdsProfile) + +} +\keyword{datasets} diff --git a/man/demoPCASyntheticProfiles.Rd b/man/demoPCASyntheticProfiles.Rd new file mode 100644 index 000000000..7bcf99a3b --- /dev/null +++ b/man/demoPCASyntheticProfiles.Rd @@ -0,0 +1,87 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RAIDS.R +\docType{data} +\name{demoPCASyntheticProfiles} +\alias{demoPCASyntheticProfiles} +\title{The PCA result of demo synthetic profiles projected on the demo subset +1KG reference PCA.} +\format{ +The \code{list} containing the PCA result of demo synthetic +profiles projected on the demo subset 1KG reference PCA. +The \code{list} contains 3 entries: +\describe{ +\item{sample.id}{ a \code{character} string representing the unique +identifier of the synthetic profiles.} +\item{eigenvector.ref}{ a \code{matrix} of \code{numeric} containing +the eigenvectors for the reference profiles.} +\item{eigenvector}{ a \code{matrix} of \code{numeric} containing the +eigenvectors for the current synthetic profiles projected on the demo +PCA 1KG reference profiles.} +} +} +\usage{ +data(demoPCASyntheticProfiles) +} +\value{ +The \code{list} containing the PCA result of demo synthetic +profiles projected on the demo subset 1KG reference PCA. +The \code{list} contains 3 entries: +\describe{ +\item{sample.id}{ a \code{character} string representing the unique +identifier of the synthetic profiles.} +\item{eigenvector.ref}{ a \code{matrix} of \code{numeric} containing +the eigenvectors for the reference profiles.} +\item{eigenvector}{ a \code{matrix} of \code{numeric} containing the +eigenvectors for the current synthetic profiles projected on the demo +PCA 1KG reference profiles.} +} +} +\description{ +The object is a \code{list}. +} +\details{ +This object can be +used to test the \code{\link{computeKNNRefSynthetic}} function. +} +\examples{ + +## Required library +library(gdsfmt) + +## Load the demo PCA on the synthetic profiles projected on the +## demo 1KG reference PCA +data(demoPCASyntheticProfiles) + +## Load the known ancestry for the demo 1KG reference profiles +data(demoKnownSuperPop1KG) + +## Path to the demo Profile GDS file is located in this package +dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") + +## Open the Profile GDS file +gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds")) + +# The name of the synthetic study +studyID <- "MYDATA.Synthetic" + +## Projects synthetic profiles on 1KG PCA +results <- computeKNNRefSynthetic(gdsProfile=gdsProfile, + listEigenvector=demoPCASyntheticProfiles, + listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), studyIDSyn=studyID, + spRef=demoKnownSuperPop1KG) + +## The inferred ancestry for the synthetic profiles for different values +## of D and K +head(results$matKNN) + +## Close Profile GDS file (important) +closefn.gds(gdsProfile) + +} +\seealso{ +\describe{ +\item{\code{\link{computeKNNRefSynthetic}}}{ for running a k-nearest +neighbors analysis on a subset of the synthetic data set.} +} +} +\keyword{datasets} diff --git a/man/demoPedigreeEx1.Rd b/man/demoPedigreeEx1.Rd new file mode 100644 index 000000000..0dea25bb8 --- /dev/null +++ b/man/demoPedigreeEx1.Rd @@ -0,0 +1,138 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RAIDS.R +\docType{data} +\name{demoPedigreeEx1} +\alias{demoPedigreeEx1} +\title{The pedigree information about a demo profile called 'ex1'.} +\format{ +The \code{data.frame} containing the information about a demo +profile called 'ex1'. the \code{data.frame} has 5 columns: +\describe{ +\item{Name.ID}{ a \code{character} string representing the unique +identifier of the profile.} +\item{Case.ID}{ a \code{character} string representing the unique +identifier of the case associated to the profile.} +\item{Sample.Type}{ a \code{character} string describing the type of +profile.} +\item{Diagnosis}{ a \code{character} string describing the diagnosis of the +profile.} +\item{Source}{ a \code{character} string describing the source of the +profile.} +} +} +\usage{ +data(demoPedigreeEx1) +} +\value{ +The \code{data.frame} containing the information about a demo +profile called 'ex1'. the \code{data.frame} has 5 columns: +\describe{ +\item{Name.ID}{ a \code{character} string representing the unique +identifier of the profile.} +\item{Case.ID}{ a \code{character} string representing the unique +identifier of the case associated to the profile.} +\item{Sample.Type}{ a \code{character} string describing the type of +profile.} +\item{Diagnosis}{ a \code{character} string describing the diagnosis of the +profile.} +\item{Source}{ a \code{character} string describing the source of the +profile.} +} +} +\description{ +The object is a \code{data.frame}. +} +\details{ +This object can be +used to test the \code{\link{runExomeAncestry}} function. +} +\examples{ + + +## Required library for GDS +library(SNPRelate) + +## Path to the demo 1KG GDS file is located in this package +dataDir <- system.file("extdata", package="RAIDS") + +################################################################# +## Load the information about the profile +################################################################# +data(demoPedigreeEx1) +head(demoPedigreeEx1) + +################################################################# +## The 1KG GDS file and the 1KG SNV Annotation GDS file +## need to be located in the same directory +## Note that the 1KG GDS file used for this example is a +## simplified version and CANNOT be used for any real analysis +################################################################# +path1KG <- file.path(dataDir, "tests") + +fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") +fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") + +################################################################# +## The Sample SNP pileup files (one per sample) need +## to be located in the same directory. +################################################################# +pathGeno <- file.path(dataDir, "example", "snpPileup") + +################################################################# +## The path where the Profile GDS Files (one per sample) +## will be created need to be specified. +################################################################# +pathProfileGDS <- file.path(tempdir(), "out.tmp") + +pathOut <- file.path(tempdir(), "res.out") + +################################################################# +## A data frame containing general information about the study +## is also required. The data frame must have +## those 3 columns: "studyID", "study.desc", "study.platform" +################################################################# +studyDF <- data.frame(study.id="MYDATA", + study.desc="Description", + study.platform="PLATFORM", + stringsAsFactors=FALSE) + +#################################################################### +## Fix seed to ensure reproducible results +#################################################################### +set.seed(2043) + +gds1KG <- snpgdsOpen(fileReferenceGDS) +dataRef <- select1KGPop(gds1KG, nbProfiles=2L) +closefn.gds(gds1KG) + +## Required library for this example to run correctly +if (requireNamespace("GenomeInfoDb", quietly=TRUE) && + requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { + + ## Chromosome length information + ## chr23 is chrX, chr24 is chrY and chrM is 25 + chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] + + \donttest{ + runExomeAncestry(pedStudy=demoPedigreeEx1, studyDF=studyDF, + pathProfileGDS=pathProfileGDS, + pathGeno=pathGeno, pathOut=pathOut, + fileReferenceGDS=fileReferenceGDS, + fileReferenceAnnotGDS=fileAnnotGDS, + chrInfo=chrInfo, syntheticRefDF=dataRef, + genoSource="snp-pileup") + + unlink(pathProfileGDS, recursive=TRUE, force=TRUE) + unlink(pathOut, recursive=TRUE, force=TRUE) + } +} + +} +\seealso{ +\describe{ +\item{\code{\link{runExomeAncestry}}}{ for running runs most +steps leading to the ancestry inference call on a specific exome +profile.} +} +} +\keyword{datasets} diff --git a/man/estimateAllelicFraction.Rd b/man/estimateAllelicFraction.Rd index e4fd8a310..953737d63 100644 --- a/man/estimateAllelicFraction.Rd +++ b/man/estimateAllelicFraction.Rd @@ -27,7 +27,7 @@ estimateAllelicFraction( } \arguments{ \item{gdsReference}{an object of class \code{\link[gdsfmt]{gds.class}} -(a GDS file), the opened 1KG GDS file.} +(a GDS file), the opened Reference GDS file.} \item{gdsProfile}{an object of class \code{\link[gdsfmt]{gds.class}} (a GDS file), the opened Profile GDS file.} @@ -49,7 +49,8 @@ way the estimation of the allelic fraction is done. Default: \code{"DNA"}.} \item{minCov}{a single positive \code{integer} representing the minimum required coverage. Default: \code{10L}.} -\item{minProb}{a single \code{numeric} between 0 and 1 representing TODO. +\item{minProb}{a single \code{numeric} between 0 and 1 representing the +probability that the calculated genotype call is correct. Default: \code{0.999}.} \item{eProb}{a single \code{numeric} between 0 and 1 representing the @@ -71,12 +72,12 @@ log score, that the SNVs in a gene are allelic fraction different 0.5 Default: \code{3}.} \item{gdsRefAnnot}{an object of class \code{\link[gdsfmt]{gds.class}} -(a GDS file), the1 1KG Annotation GDS file. +(a GDS file), the opened Reference SNV Annotation GDS file. This parameter is RNA specific. Default: \code{NULL}.} \item{blockID}{a \code{character} string corresponding to the block -identifier in \code{gdsRefAnnot}. **This parameter is RNA specific.** +identifier in \code{gdsRefAnnot}. \strong{This parameter is RNA specific.} Default: \code{NULL}} \item{verbose}{a \code{logicial} indicating if the function should print @@ -87,92 +88,77 @@ The integer \code{0L} when successful. } \description{ The function estimates the allelic fraction of the -SNVs for a specific prfile and add the information to the associated +SNVs for a specific profile and add the information to the associated Profile GDS file. The allelic fraction estimation method is adapted to the type of study (DNA or RNA). } \details{ -The `chrInfo` parameter contains the length of the chromosomes. The +The \code{chrInfo} parameter contains the length of the chromosomes. The length of the chromosomes can be obtain through the -\code{\link[BSgenome]{BSgenome-class}} +\code{\link[GenomeInfoDb]{seqlengths}} library. -As example: +As example, for hg38 genome: -``` +\if{html}{\out{
}}\preformatted{ +if (requireNamespace("GenomeInfoDb", quietly=TRUE) && + requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) \{ + chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] +\} -library(BSgenome.Hsapiens.UCSC.hg38) - -chrInfo <- integer(25L) - -for(i in seq_len(22L)){ chrInfo[i] <- length(Hsapiens[[paste0("chr", i)]])} - -chrInfo[23] <- length(Hsapiens[["chrX"]]) -chrInfo[24] <- length(Hsapiens[["chrY"]]) -chrInfo[25] <- length(Hsapiens[["chrM"]]) - -``` +}\if{html}{\out{
}} } \examples{ ## Required library for GDS library(gdsfmt) -## Path to the demo 1KG GDS file is located in this package +## Path to the demo 1KG GDS file located in this package dataDir <- system.file("extdata/tests", package="RAIDS") -fileGDS <- file.path(dataDir, "ex1_good_small_1KG_GDS.gds") +fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") + +## Profile GDS file for one profile +fileProfile <- file.path(tempdir(), "ex1.gds") ## Copy the Profile GDS file demo that has been pruned and annotated -## into a test directory (deleted after the example has been run) -dataDirAllelicFraction <- file.path(system.file("extdata", package="RAIDS"), - "demoAllelicFraction") -dir.create(dataDirAllelicFraction, showWarnings=FALSE, - recursive=FALSE, mode="0777") +## into current directory file.copy(file.path(dataDir, "ex1_demo_with_pruning_and_1KG_annot.gds"), - file.path(dataDirAllelicFraction, "ex1.gds")) + fileProfile) ## Open the reference GDS file (demo version) gds1KG <- snpgdsOpen(fileGDS) ## Profile GDS file for one profile -fileProfile <- file.path(dataDirAllelicFraction, "ex1.gds") profileGDS <- openfn.gds(fileProfile, readonly=FALSE) -## Chromosome length information -## chr23 is chrX, chr24 is chrY and chrM is 25 -chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L, - 181538259L, 170805979L, 159345973L, 145138636L, 138394717L, 133797422L, - 135086622L, 133275309L, 114364328L, 107043718L, 101991189L, 90338345L, - 83257441L, 80373285L, 58617616L, 64444167L, 46709983L, 50818468L, - 156040895L, 57227415L, 16569L) - -## A formal way to get the chormosome length information -## library(BSgenome.Hsapiens.UCSC.hg38) -## chrInfo <- integer(25L) -## for(i in seq_len(22L)){ chrInfo[i] <- -## length(Hsapiens[[paste0("chr", i)]])} -## chrInfo[23] <- length(Hsapiens[["chrX"]]) -## chrInfo[24] <- length(Hsapiens[["chrY"]]) -## chrInfo[25] <- length(Hsapiens[["chrM"]]) - -## Estimate the allelic fraction of the pruned SNVs -estimateAllelicFraction(gdsReference=gds1KG, gdsProfile=profileGDS, - currentProfile="ex1", studyID="MYDATA", chrInfo=chrInfo, - studyType="DNA", minCov=10L, minProb=0.999, eProb=0.001, - cutOffLOH=-5, cutOffHomoScore=-3, wAR=9, cutOffAR=3, - gdsRefAnnot=NULL, blockID=NULL) - -## The allelic fraction is saved in the 'lap' node of the Profile GDS file -## The 'lap' entry should be present -profileGDS - -## Close both GDS files (important) -closefn.gds(profileGDS) -closefn.gds(gds1KG) - -## Unlink Profile GDS file (created for demo purpose) -unlink(file.path(dataDirAllelicFraction, "ex1.gds")) -unlink(dataDirAllelicFraction) +## Required library for this example to run correctly +if (requireNamespace("GenomeInfoDb", quietly=TRUE) && + requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { + + ## Chromosome length information + ## chr23 is chrX, chr24 is chrY and chrM is 25 + chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] + + ## Estimate the allelic fraction of the pruned SNVs + estimateAllelicFraction(gdsReference=gds1KG, gdsProfile=profileGDS, + currentProfile="ex1", studyID="MYDATA", chrInfo=chrInfo, + studyType="DNA", minCov=10L, minProb=0.999, eProb=0.001, + cutOffLOH=-5, cutOffHomoScore=-3, wAR=9, cutOffAR=3, + gdsRefAnnot=NULL, blockID=NULL) + + ## The allelic fraction is saved in the 'lap' node of Profile GDS file + ## The 'lap' entry should be present + profileGDS + + ## Close both GDS files (important) + closefn.gds(profileGDS) + closefn.gds(gds1KG) + + ## Remove Profile GDS file (created for demo purpose) + unlink(fileProfile, force=TRUE) + +} + } \author{ diff --git a/man/extractNucleotide.Rd b/man/extractNucleotide.Rd new file mode 100644 index 000000000..3d17a7708 --- /dev/null +++ b/man/extractNucleotide.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tools_internal.R +\encoding{UTF-8} +\name{extractNucleotide} +\alias{extractNucleotide} +\title{Filtering the read counts for a specific nucleotide} +\usage{ +extractNucleotide(nucleotide, count, curNucleo) +} +\arguments{ +\item{nucleotide}{a \code{vector} of a \code{character} strings +representing the nucleotides (ex: A, C, G or T).} + +\item{count}{a \code{vector} of \code{numeric} representing the counts for +each nucleotide listed in \code{nucleotide} parameter.} + +\item{curNucleo}{a \code{character} strings representing the nucleotide +that will be retained (ex: A, C, G or T).} +} +\value{ +a \code{numeric} representing the counts for the selected +nucleotide. The default value is \code{0}. +} +\description{ +The function returns the read counts for the specific +nucleotide or zero when read counts are not available. +} +\examples{ + +## Nucleotides vector +nuc <- c("A", "G", "C", "T") + +## Count vector +cnt <- c(100, 200, 4, 32) + +## Return the count for the nucleotide "G" +RAIDS:::extractNucleotide(nucleotide=nuc, count=cnt, curNucleo="G") + +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} +\keyword{internal} diff --git a/man/gds2tfam.Rd b/man/gds2tfam.Rd deleted file mode 100644 index 3b361b420..000000000 --- a/man/gds2tfam.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gdsWrapper.R -\encoding{UTF-8} -\name{gds2tfam} -\alias{gds2tfam} -\title{create a file tfam file for plink from the gdsReference file} -\usage{ -gds2tfam(gdsReference, listSample, pedOUT) -} -\arguments{ -\item{gdsReference}{an object of class -\link[gdsfmt]{gds.class} (a GDS file), the opened 1KG GDS file.} - -\item{listSample}{a \code{array} with the sample to keep TODO} - -\item{pedOUT}{TODO a PATH and file name to the output file} -} -\value{ -TODO a \code{vector} of \code{numeric} -} -\description{ -TODO -} -\examples{ - -# TODO -gds <- "Demo GDS TODO" - -} -\author{ -Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -} -\keyword{export} diff --git a/man/gds2tfamSample.Rd b/man/gds2tfamSample.Rd deleted file mode 100644 index 33c9d736f..000000000 --- a/man/gds2tfamSample.Rd +++ /dev/null @@ -1,36 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gdsWrapper.R -\encoding{UTF-8} -\name{gds2tfamSample} -\alias{gds2tfamSample} -\title{create a file tfam file for plink from the gdsProfile file} -\usage{ -gds2tfamSample(gdsProfile, listSample, sampleANNO, pedOUT) -} -\arguments{ -\item{gdsProfile}{an object of class \link[gdsfmt]{gds.class} (a GDS -file), the open Profile GDS file.} - -\item{listSample}{a \code{array} with the sample to keep} - -\item{sampleANNO}{a \code{data.frame} with at least column sex and the name -must be sample.id} - -\item{pedOUT}{TODO a PATH and file name to the output file} -} -\value{ -TODO a \code{vector} of \code{numeric} -} -\description{ -TODO -} -\examples{ - -# TODO -gds <- "Demo GDS TODO" - -} -\author{ -Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -} -\keyword{export} diff --git a/man/gds2tped.Rd b/man/gds2tped.Rd deleted file mode 100644 index be72b7509..000000000 --- a/man/gds2tped.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gdsWrapper.R -\encoding{UTF-8} -\name{gds2tped} -\alias{gds2tped} -\title{create a file tped file for plink from the gds file} -\usage{ -gds2tped(gds, listSample, listSNP, pedOUT) -} -\arguments{ -\item{gds}{a \code{gds} object.} - -\item{listSample}{a \code{array} with the sample to keep} - -\item{listSNP}{a \code{array} with the snp.id to keep} - -\item{pedOUT}{TODO a PATH and file name to the output file} -} -\value{ -TODO a \code{vector} of \code{numeric} -} -\description{ -TODO -} -\examples{ - -# TODO -gds <- "Demo GDS TODO" - -} -\author{ -Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -} -\keyword{export} diff --git a/man/generateGDS1KG.Rd b/man/generateGDS1KG.Rd index eba1b9cd2..fff2a4c71 100644 --- a/man/generateGDS1KG.Rd +++ b/man/generateGDS1KG.Rd @@ -3,8 +3,8 @@ \encoding{UTF-8} \name{generateGDS1KG} \alias{generateGDS1KG} -\title{Generate the GDS file that will contain the information from 1KG -data set (reference data set)} +\title{Generate the GDS file that will contain the information from +Reference data set (reference data set)} \usage{ generateGDS1KG( pathGeno = file.path("data", "sampleGeno"), @@ -54,8 +54,8 @@ The integer \code{0L} when successful. } \description{ This function generates the GDS file that will contain the -information from 1KG. The function also add the samples information, the -SNP information and the genotyping information into the GDS file. +information from Reference. The function also add the samples information, +the SNP information and the genotyping information into the GDS file. } \details{ More information about GDS file format can be found at the Bioconductor @@ -64,14 +64,14 @@ https://bioconductor.org/packages/gdsfmt/ } \examples{ -## Required package -library(withr) - ## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") +## Path to the CSV genoytype files +pathGeno <- file.path(dataDir, "demoProfileGenotypes") + ## The RDS file containing the pedigree information -pedigreeFile <- file.path(dataDir, "PedigreeDemo.rds") +pedigreeFile <- file.path(dataDir, "PedigreeReferenceDemo.rds") ## The RDS file containing the indexes of the retained SNPs snpIndexFile <- file.path(dataDir, "listSNPIndexes_Demo.rds") @@ -79,16 +79,17 @@ snpIndexFile <- file.path(dataDir, "listSNPIndexes_Demo.rds") ## The RDS file containing the filtered SNP information filterSNVFile <- file.path(dataDir, "mapSNVSelected_Demo.rds") -## Temporary GDS file containing 1KG information -gdsFile <- local_file(file.path(dataDir, "1KG_TEMP.gds")) +## Temporary Reference GDS file +tempRefGDS <- file.path(tempdir(), "1KG_TEMP.gds") -## Create a temporary GDS file containing information from 1KG -generateGDS1KG(pathGeno=dataDir, filePedRDS=pedigreeFile, - fileSNVIndex=snpIndexFile, fileSNVSelected=filterSNVFile, - fileNameGDS=gdsFile, listSamples=NULL) +## Create a temporary Reference GDS file +generateGDS1KG(pathGeno=pathGeno, filePedRDS=pedigreeFile, + fileSNVIndex=snpIndexFile, fileSNVSelected=filterSNVFile, + fileNameGDS=tempRefGDS, listSamples=NULL) ## Remove temporary files -deferred_run() +unlink(tempRefGDS, force=TRUE) + } \author{ diff --git a/man/generateGDS1KGgenotypeFromSNPPileup.Rd b/man/generateGDS1KGgenotypeFromSNPPileup.Rd index e553576be..d18b8579f 100644 --- a/man/generateGDS1KGgenotypeFromSNPPileup.Rd +++ b/man/generateGDS1KGgenotypeFromSNPPileup.Rd @@ -23,9 +23,14 @@ generateGDS1KGgenotypeFromSNPPileup( ) } \arguments{ -\item{pathGeno}{a \code{character} string representing the path to a -directory with the genotype files for the profiles, as generated by -snp-pileup. The path must exist.} +\item{pathGeno}{a \code{character} string representing the path to the +directory containing the VCF output of SNP-pileup for each sample. The +SNP-pileup files must be compressed (gz files) and have the name identifiers +of the samples. A sample with "Name.ID" identifier would have an +associated file called +if genoSource is "VCF", then "Name.ID.vcf.gz", +if genoSource is "generic", then "Name.ID.generic.txt.gz" +if genoSource is "snp-pileup", then "Name.ID.txt.gz".} \item{listSamples}{a \code{vector} of \code{character} string corresponding to the sample identifiers that will have a Profile GDS file created. The @@ -53,8 +58,8 @@ representing the sequencing error rate. Default: \code{0.001}.} \item{dfPedProfile}{a \code{data.frame} with the information about the sample(s). Those are mandatory columns: "Name.ID", -"Case.ID", "Sample.Type", "Diagnosis", "Source". All columns must be in -\code{character} strings. The \code{data.frame} +"Case.ID", "Sample.Type", "Diagnosis" and "Source". All columns must be in +\code{character} strings format. The \code{data.frame} must contain the information for all the samples passed in the \code{listSamples} parameter.} @@ -70,13 +75,15 @@ must be in \code{character} strings.} the directory where the GDS Sample files will be created.} \item{genoSource}{a \code{character} string with two possible values: -'snp-pileup' or 'generic'. It specifies if the genotype files +'snp-pileup', 'generic' or 'VCF'. It specifies if the genotype files are generated by snp-pileup (Facets) or are a generic format CSV file with at least those columns: 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. The 'Count' is the depth at the specified position; 'FileR' is the depth of the reference allele and -'File1A' is the depth of the specific alternative allele.} +'File1A' is the depth of the specific alternative allele. +Finally the file can be a VCF file with at least those genotype +fields: GT, AD, DP.} \item{verbose}{a \code{logical} indicating if the function must print messages when running.} @@ -91,8 +98,12 @@ from a SNV file as generated by SNP-pileup or other tools. } \examples{ -## Path to the files in this package -dataDir <- system.file("extdata/tests", package="RAIDS") +## Current directory +dataDir <- file.path(tempdir()) + +## Copy required file into current directory +file.copy(from=file.path(system.file("extdata/tests", package="RAIDS"), + "ex1.txt.gz"), to=dataDir) ## The data.frame containing the information about the study ## The 3 mandatory columns: "study.id", "study.desc", "study.platform" @@ -113,8 +124,8 @@ rownames(samplePED) <- samplePED$Name.ID ## List of SNV positions listPositions <- data.frame(snp.chromosome=c(rep(1, 10)), - snp.position=c(3467333, 3467428, 3469375, 3469387, 3469502, 3469527, - 3469737, 3471497, 3471565, 3471618)) + snp.position=c(3467333, 3467428, 3469375, 3469387, 3469502, 3469527, + 3469737, 3471497, 3471565, 3471618)) ## Append genotype information to the Profile GDS file result <- RAIDS:::generateGDS1KGgenotypeFromSNPPileup(pathGeno=dataDir, @@ -133,6 +144,8 @@ list.files(dataDir) ## Unlink Profile GDS file (created for demo purpose) unlink(file.path(dataDir, "ex1.gds")) +unlink(file.path(dataDir, "ex1.txt.gz")) + } \author{ diff --git a/man/generateGDSRefSample.Rd b/man/generateGDSRefSample.Rd index a9a7d6290..9955c2c90 100644 --- a/man/generateGDSRefSample.Rd +++ b/man/generateGDSRefSample.Rd @@ -41,28 +41,27 @@ the \code{data.frame} passed to the function. The nodes "sample.id" and ## Required library library(gdsfmt) -## Create a temporary GDS file in an test directory -dataDir <- system.file("extdata/tests", package="RAIDS") -gdsFilePath <- file.path(dataDir, "GDS_TEMP_10.gds") +## Temporary GDS file in current directory +gdsFilePath <- file.path(tempdir(), "GDS_TEMP_10.gds") ## Create and open the GDS file tmpGDS <- createfn.gds(filename=gdsFilePath) ## Create "sample.annot" node (the node must be present) pedInformation <- data.frame(sample.id=c("sample_01", "sample_02"), - Name.ID=c("sample_01", "sample_02"), - sex=c(1,1), # 1:Male 2: Female - pop.group=c("ACB", "ACB"), - superPop=c("AFR", "AFR"), - batch=c(1, 1), - stringsAsFactors=FALSE) + Name.ID=c("sample_01", "sample_02"), + sex=c(1,1), # 1:Male 2: Female + pop.group=c("ACB", "ACB"), + superPop=c("AFR", "AFR"), + batch=c(1, 1), + stringsAsFactors=FALSE) ## The row names must be the sample identifiers rownames(pedInformation) <- pedInformation$Name.ID ## Add information about 2 samples to the GDS file RAIDS:::generateGDSRefSample(gdsReference=tmpGDS, - dfPedReference=pedInformation, listSamples=NULL) + dfPedReference=pedInformation, listSamples=NULL) ## Read sample identifier list read.gdsn(index.gdsn(node=tmpGDS, path="sample.id")) @@ -76,6 +75,7 @@ closefn.gds(gdsfile=tmpGDS) ## Delete the temporary GDS file unlink(x=gdsFilePath, force=TRUE) + } \author{ Pascal Belleau, Astrid Deschênes and Alexander Krasnitz diff --git a/man/generateGDSSNPinfo.Rd b/man/generateGDSSNPinfo.Rd index c13e993cc..31bfe77d9 100644 --- a/man/generateGDSSNPinfo.Rd +++ b/man/generateGDSSNPinfo.Rd @@ -3,15 +3,15 @@ \encoding{UTF-8} \name{generateGDSSNPinfo} \alias{generateGDSSNPinfo} -\title{Add information related to SNVs into a Reference GDS file} +\title{Add information related to SNVs into a Population Reference GDS file} \usage{ -generateGDSSNPinfo(gdsReference, fileFREQ, verbose) +generateGDSSNPinfo(gdsReference, fileFreq, verbose) } \arguments{ \item{gdsReference}{an object of class \link[gdsfmt]{gds.class} (a GDS file), the opened Reference GDS file.} -\item{fileFREQ}{a \code{character} string representing the path and file +\item{fileFreq}{a \code{character} string representing the path and file name of the RDS file with the filtered SNP information.} \item{verbose}{a \code{logical} indicating if messages should be printed @@ -21,33 +21,31 @@ to show how the different steps in the function.} The integer \code{0L} when successful. } \description{ -the function adds the SNV information into a Reference -GDS file. +The function adds the SNV information into a Population +Reference GDS file. } \examples{ ## Required package -library(withr) +library(gdsfmt) -## Path to the demo pedigree file is located in this package +## The RDS file containing the filtered SNP information dataDir <- system.file("extdata", package="RAIDS") +fileFilerterSNVs <- file.path(dataDir, "mapSNVSelected_Demo.rds") -## Temporary Reference GDS file -file1KG <- local_file(file.path(dataDir, "1KG_TEMP_002.gds")) +## Temporary Reference GDS file in temporary directory +file1KG <- file.path(tempdir(), "1KG_TEMP_002.gds") filenewGDS <- createfn.gds(file1KG) -## The RDS file containing the filtered SNP information -fileFilerterSNVs <- file.path(dataDir, "mapSNVSelected_Demo.rds") - ## Add SNV information to Reference GDS RAIDS:::generateGDSSNPinfo(gdsReference=filenewGDS, - fileFREQ=fileFilerterSNVs, verbose=TRUE) + fileFreq=fileFilerterSNVs, verbose=TRUE) ## Close GDS file (important) closefn.gds(filenewGDS) -## Remove temporary files -deferred_run() +## Remove temporary 1KG_TEMP_002.gds file +unlink(file1KG, force=TRUE) } \author{ diff --git a/man/generateGDSgenotype.Rd b/man/generateGDSgenotype.Rd index 10d8006e1..a9bfa0cf0 100644 --- a/man/generateGDSgenotype.Rd +++ b/man/generateGDSgenotype.Rd @@ -3,16 +3,18 @@ \encoding{UTF-8} \name{generateGDSgenotype} \alias{generateGDSgenotype} -\title{Add information related to profile genotype into a Reference GDS file} +\title{Add information related to profile genotypes into a Population +Reference GDS file} \usage{ generateGDSgenotype(gds, pathGeno, fileSNPsRDS, listSamples, verbose) } \arguments{ \item{gds}{an object of class -\link[gdsfmt]{gds.class} (a GDS file), the opened Reference GDS file.} +\link[gdsfmt]{gds.class} (a GDS file), the opened Population Reference +GDS file.} \item{pathGeno}{a \code{character} string representing the path where -the 1K genotyping files for each sample are located. The name of the +the reference genotyping files for each sample are located. The name of the genotyping files must correspond to the individual identification (Individual.ID) in the pedigree file.} @@ -32,11 +34,56 @@ The integer \code{0L} when successful. } \description{ This function adds the genotype fields with the associated -information into the Reference GDS file for the selected profiles. +information into the Population Reference GDS file for the selected +profiles. } \examples{ -# TODO +## Required library +library(gdsfmt) + +## Path to the demo pedigree file is located in this package +dataDir <- system.file("extdata", package="RAIDS") + +## Path where the demo genotype CSV files are located +pathGeno <- file.path(dataDir, "demoProfileGenotypes") + +## The RDS file containing the pedigree information +pedigreeFile <- file.path(dataDir, "PedigreeReferenceDemo.rds") + +## The RDS file containing the indexes of the retained SNPs +snpIndexFile <- file.path(dataDir, "listSNPIndexes_Demo.rds") + +## The RDS file containing the filtered SNP information +filterSNVFile <- file.path(dataDir, "mapSNVSelected_Demo.rds") + +## Temporary Reference GDS file in temporary directory +tempRefGDS <- file.path(tempdir(), "Ref_TEMP01.gds") + +## Create temporary Reference GDS file +newGDS <- createfn.gds(tempRefGDS) +put.attr.gdsn(newGDS$root, "FileFormat", "SNP_ARRAY") + +## Read the pedigree file +ped1KG <- readRDS(pedigreeFile) + +## Add information about samples to the Reference GDS file +listSampleGDS <- RAIDS:::generateGDSRefSample(gdsReference=newGDS, + dfPedReference=ped1KG, listSamples=NULL) + +## Add SNV information to the Reference GDS +RAIDS:::generateGDSSNPinfo(gdsReference=newGDS, fileFreq=filterSNVFile, + verbose=FALSE) + +## Add genotype information to the Reference GDS +RAIDS:::generateGDSgenotype(gds=newGDS, pathGeno=pathGeno, + fileSNPsRDS=snpIndexFile, listSamples=listSampleGDS, verbose=FALSE) + +## Close file +closefn.gds(newGDS) + +## Remove temporary files +unlink(tempRefGDS, force=TRUE) } \author{ diff --git a/man/generateGeneBlock.Rd b/man/generateGeneBlock.Rd index 4f53b12f5..f369846eb 100644 --- a/man/generateGeneBlock.Rd +++ b/man/generateGeneBlock.Rd @@ -5,7 +5,7 @@ \alias{generateGeneBlock} \title{Generate two indexes based on gene annotation for gdsAnnot1KG block} \usage{ -generateGeneBlock(gdsReference, winSize = 10000, EnsDb) +generateGeneBlock(gdsReference, winSize = 10000, ensDb) } \arguments{ \item{gdsReference}{an object of class @@ -15,19 +15,24 @@ generateGeneBlock(gdsReference, winSize = 10000, EnsDb) size of the window to use to group the SNVs when the SNVs are in a non-coding region. Default: \code{10000}.} -\item{EnsDb}{An object of class \code{EnsDb} with the Ensembl genome +\item{ensDb}{An object of class \code{EnsDb} with the Ensembl genome annotation. By default, the \code{EnsDb.Hsapiens.v86} class has been used.} } \value{ a \code{data.frame} with those columns: -\itemize{ -\item{chr} {} -\item{pos} {} -\item{snp.allele} {} -\item{Exon} {} -\item{GName} {} -\item{Gene} {} -\item{GeneS} {} +\describe{ +\item{chr}{ a single \code{integer} representing the SNV chromosome.} +\item{pos}{ a single \code{integer} representing the SNV position.} +\item{snp.allele}{ a \code{character} string representing the reference allele +and alternative allele for each of the SNV} +\item{Exon}{ a \code{character} with the ensembl GeneId(s) if the SNV is in +one exon. If more than one GeneId they are separted by ':'} +\item{GName}{ a \code{character} with the ensembl GeneId(s) if the SNV is in +the gene. If more than one GeneId they are separted by ':'} +\item{Gene}{ a single \code{integer} specific to the SNVs that share +at least one genes} +\item{GeneS}{ a single \code{integer} specific to the SNVs that share +a unique combination of genes} } "chr", "pos", "snp.allele", "Exon", "GName", "Gene", "GeneS" Example for GName and the two indexes "Gene", "GeneS" @@ -43,14 +48,42 @@ GName Gene GeneS 493 ENSG00000230021:ENSG00000228794 17 3825 } \description{ -TODO +Generate two indexes based on gene annotation for +gdsAnnot1KG block } \examples{ +## Required library +library(SNPRelate) + ## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") -# TODO +## Required library +if (requireNamespace("EnsDb.Hsapiens.v86", quietly=TRUE)) { + + ## Making a "short cut" on the ensDb object + edb <- EnsDb.Hsapiens.v86::EnsDb.Hsapiens.v86 + + path1KG <- file.path(dataDir, "tests") + + ## Reference GDS file + fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") + + \donttest{ + ## Open the reference GDS file (demo version) + gds1KG <- snpgdsOpen(fileReferenceGDS) + + ## The function returns a data.frame containing + ## gene block information + matGeneBlock <- RAIDS:::generateGeneBlock(gdsReference=gds1KG, + ensDb=edb) + print(head(matGeneBlock[grep("ENSG00000157152", + matGeneBlock$GName),])) + + closefn.gds(gds1KG) + } +} } \author{ diff --git a/man/generateMapSnvSel.Rd b/man/generateMapSnvSel.Rd index 2ff5932d0..67a161329 100644 --- a/man/generateMapSnvSel.Rd +++ b/man/generateMapSnvSel.Rd @@ -12,8 +12,8 @@ generateMapSnvSel(cutOff = 0.01, fileSNV, fileSNPsRDS, fileFREQ) for the frequency in at least one super population. Default: \code{0.01}.} \item{fileSNV}{a \code{character} string representing the path and -file name of the bulk SNP information file from 1KG. The file must be in -text format. The file must exist.} +file name of the bulk SNP information file from Reference. The file must +be in text format. The file must exist.} \item{fileSNPsRDS}{a \code{character} string representing the path and file name of the RDS file that will contain the indexes of the retained @@ -36,30 +36,27 @@ indexes of the retained SNP is also created. \details{ The filtered SNP information RDS file (parameter \code{fileFREQ}), contains a \code{data.frame} with those columns: -\itemize{ -\item{CHROM} {a \code{character} string representing the chromosome where +\describe{ +\item{CHROM}{ a \code{character} string representing the chromosome where the SNV is located.} -\item{POS} {a \code{character} string representing the SNV position on the +\item{POS}{ a \code{character} string representing the SNV position on the chromosome.} -\item{REF} {a \code{character} string representing the reference DNA base +\item{REF}{ a \code{character} string representing the reference DNA base for the SNV.} -\item{ALT} {a \code{character} string representing the alternative DNA base +\item{ALT}{ a \code{character} string representing the alternative DNA base for the SNV.}\ -\item{EAS_AF} {a \code{character} string representing the allele frequency +\item{EAS_AF}{ a \code{character} string representing the allele frequency of the EAS super population.} -\item{AFR_AF} {a \code{character} string representing the allele frequency +\item{AFR_AF}{ a \code{character} string representing the allele frequency of the AFR super population.} -\item{AMR_AF} {a \code{character} string representing the allele frequency +\item{AMR_AF}{ a \code{character} string representing the allele frequency of the AMR super population.} -\item{SAS_AF} {a \code{character} string representing the allele frequency +\item{SAS_AF}{ a \code{character} string representing the allele frequency of the SAS super population.} } } \examples{ -## Needed package -library(withr) - ## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") @@ -69,16 +66,17 @@ snvFile <- file.path(dataDir, "matFreqSNV_Demo.txt.bz2") ## Temporary output files ## The first file contains the indexes of the retained SNPs ## The second file contains the filtered SNP information -snpIndexFile <- local_file(file.path(dataDir, "listSNP_TEMP.rds")) -filterSNVFile <- local_file(file.path(dataDir, "mapSNVSel_TEMP.rds")) +snpIndexFile <- file.path(tempdir(), "listSNP_TEMP.rds") +filterSNVFile <- file.path(tempdir(), "mapSNVSel_TEMP.rds") ## Create a data.frame containing the information of the retained ## samples (samples with existing genotyping files) generateMapSnvSel(cutOff=0.01, fileSNV=snvFile, - fileSNPsRDS=snpIndexFile, fileFREQ=filterSNVFile) + fileSNPsRDS=snpIndexFile, fileFREQ=filterSNVFile) ## Remove temporary files -deferred_run() +unlink(snpIndexFile, force=TRUE) +unlink(filterSNVFile, force=TRUE) } \author{ diff --git a/man/generatePhase1KG2GDS.Rd b/man/generatePhase1KG2GDS.Rd index dec5438ed..d35d53f9f 100644 --- a/man/generatePhase1KG2GDS.Rd +++ b/man/generatePhase1KG2GDS.Rd @@ -9,7 +9,7 @@ generatePhase1KG2GDS( gdsReference, gdsReferencePhase, pathGeno, - fileSNPsRDS, + fileSNVIndex, verbose = FALSE ) } @@ -26,7 +26,7 @@ genotyping files must correspond to the individual identification (Individual.ID) in the pedigree file. Default: \code{"./data/sampleGeno"}.} -\item{fileSNPsRDS}{a \code{character} string representing the path and file +\item{fileSNVIndex}{a \code{character} string representing the path and file name of the RDS file that contains the indexes of the retained SNPs. The file must exist. The file must be a RDS file.} @@ -34,7 +34,7 @@ file must exist. The file must be a RDS file.} print messages when running. Default: \code{FALSE}.} } \value{ -The function returns \code{0L} when succesful. +The function returns \code{0L} when successful. } \description{ The function is adding the phase information into the @@ -45,13 +45,16 @@ GDS file and is added into a Reference Phase GDS file. An entry called \examples{ ## Required package -library(withr) +library(gdsfmt) ## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") +## Path where the demo genotype CSV files are located +pathGeno <- file.path(dataDir, "demoProfileGenotypes") + ## The RDS file containing the pedigree information -pedigreeFile <- file.path(dataDir, "PedigreeDemo.rds") +pedigreeFile <- file.path(dataDir, "PedigreeReferenceDemo.rds") ## The RDS file containing the indexes of the retained SNPs snpIndexFile <- file.path(dataDir, "listSNPIndexes_Demo.rds") @@ -59,16 +62,16 @@ snpIndexFile <- file.path(dataDir, "listSNPIndexes_Demo.rds") ## The RDS file containing the filtered SNP information filterSNVFile <- file.path(dataDir, "mapSNVSelected_Demo.rds") -## Temporary Reference GDS file containing 1KG information -fileReferenceGDS <- local_file(file.path(dataDir, "1KG_TEMP_02.gds")) +## Temporary Reference GDS file containing reference information +fileReferenceGDS <- file.path(tempdir(), "1KG_TEMP_02.gds") ## Create a temporary Reference GDS file containing information from 1KG -generateGDS1KG(pathGeno=dataDir, filePedRDS=pedigreeFile, - fileSNVIndex=snpIndexFile, fileSNVSelected=filterSNVFile, - fileNameGDS=fileReferenceGDS, listSamples=NULL) +generateGDS1KG(pathGeno=pathGeno, filePedRDS=pedigreeFile, + fileSNVIndex=snpIndexFile, fileSNVSelected=filterSNVFile, + fileNameGDS=fileReferenceGDS, listSamples=NULL) ## Temporary Phase GDS file that will contain the 1KG Phase information -fileRefPhaseGDS <- local_file(file.path(dataDir, "1KG_TEMP_Phase_02.gds")) +fileRefPhaseGDS <- file.path(tempdir(), "1KG_TEMP_Phase_02.gds") ## Create Reference Phase GDS file gdsPhase <- createfn.gds(fileRefPhaseGDS) @@ -76,20 +79,23 @@ gdsPhase <- createfn.gds(fileRefPhaseGDS) ## Open Reference GDS file gdsRef <- openfn.gds(fileReferenceGDS) -\dontrun{ ## Fill temporary Reference Phase GDS file -generatePhase1KG2GDS(gdsReference=gdsRef, gdsReferencePhase=gdsPhase, - pathGeno=dataDir, fileSNPsRDS=filterSNVFile, verbose=FALSE) +if (FALSE) { + generatePhase1KG2GDS(gdsReference=gdsRef, + gdsReferencePhase=gdsPhase, + pathGeno=pathGeno, fileSNVIndex=snpIndexFile, + verbose=FALSE) } -## Close 1KG Phase information file +## Close Reference Phase information file closefn.gds(gdsPhase) ## Close Reference information file closefn.gds(gdsRef) ## Remove temporary files -deferred_run() +unlink(fileReferenceGDS, force=TRUE) +unlink(fileRefPhaseGDS, force=TRUE) } \author{ diff --git a/man/generatePhaseRef.Rd b/man/generatePhaseRef.Rd new file mode 100644 index 000000000..7397aea8c --- /dev/null +++ b/man/generatePhaseRef.Rd @@ -0,0 +1,91 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/process1KG.R +\encoding{UTF-8} +\name{generatePhaseRef} +\alias{generatePhaseRef} +\title{Adding the phase information into the Reference GDS file} +\usage{ +generatePhaseRef( + fileReferenceGDS, + fileReferenceAnnotGDS, + pathGeno, + fileSNVIndex, + verbose = FALSE +) +} +\arguments{ +\item{fileReferenceGDS}{a \code{character} string representing the file +name of the Reference GDS file. The file must exist.} + +\item{fileReferenceAnnotGDS}{a \code{character} string representing the +file name of the Population Reference GDS Annotation file. The file +must exist.} + +\item{pathGeno}{a \code{character} string representing the path where +the 1K genotyping files for each sample are located. The name of the +genotyping files must correspond to +the individual identification (Individual.ID) in the pedigree file. +Default: \code{"./data/sampleGeno"}.} + +\item{fileSNVIndex}{a \code{character} string representing the path and file +name of the RDS file that contains the indexes of the retained SNPs. The +file must exist. The file must be a RDS file.} + +\item{verbose}{a \code{logicial} indicating if the function should +print messages when running. Default: \code{FALSE}.} +} +\value{ +The function returns \code{0L} when successful. +} +\description{ +The function is adding the phase information into the +Reference Phase GDS file. The phase information is extracted from a Reference +GDS file and is added into a Reference Phase GDS file. An entry called +'phase' is added to the Reference Phase GDS file. +} +\examples{ + +## Path to the demo pedigree file is located in this package +dataDir <- system.file("extdata", package="RAIDS") + +## Path where the demo genotype CSV files are located +pathGeno <- file.path(dataDir, "demoProfileGenotypes") + +## The RDS file containing the pedigree information +pedigreeFile <- file.path(dataDir, "PedigreeReferenceDemo.rds") + +## The RDS file containing the indexes of the retained SNPs +snpIndexFile <- file.path(dataDir, "listSNPIndexes_Demo.rds") + +## The RDS file containing the filtered SNP information +filterSNVFile <- file.path(dataDir, "mapSNVSelected_Demo.rds") + +## Temporary Reference GDS file containing reference information +fileReferenceGDS <- file.path(tempdir(), "1KG_TEMP_02.gds") + +## Create a temporary Reference GDS file containing information from 1KG +generateGDS1KG(pathGeno=pathGeno, filePedRDS=pedigreeFile, + fileSNVIndex=snpIndexFile, fileSNVSelected=filterSNVFile, + fileNameGDS=fileReferenceGDS, listSamples=NULL) + +## Temporary Phase GDS file that will contain the 1KG Phase information +fileRefPhaseGDS <- file.path(tempdir(), "1KG_TEMP_Phase_02.gds") + + +## Fill temporary Reference Phase GDS file +if (FALSE) { + generatePhaseRef(fileReferenceGDS=fileReferenceGDS, + fileReferenceAnnotGDS=fileRefPhaseGDS, + pathGeno=pathGeno, fileSNVIndex=snpIndexFile, + verbose=FALSE) +} + + +## Remove temporary files +unlink(fileReferenceGDS, force=TRUE) +unlink(fileRefPhaseGDS, force=TRUE) + +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} diff --git a/man/generateProfileGDS.Rd b/man/generateProfileGDS.Rd new file mode 100644 index 000000000..bfa95cd9f --- /dev/null +++ b/man/generateProfileGDS.Rd @@ -0,0 +1,149 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gdsWrapper_internal.R +\encoding{UTF-8} +\name{generateProfileGDS} +\alias{generateProfileGDS} +\title{Append the genotype information from a profile into the associated +Profile GDS File} +\usage{ +generateProfileGDS( + profileFile, + profileName, + listPos, + offset, + minCov = 10, + minProb = 0.999, + seqError = 0.001, + dfPedProfile, + batch, + studyDF, + pathProfileGDS, + genoSource, + paramProfileGDS, + verbose +) +} +\arguments{ +\item{profileFile}{a \code{character} string representing the path and the +file name of the genotype file or the bam if genoSource is snp-pileup the +fine extension must be .txt.gz, if VCF the extension must be .vcf.gz} + +\item{profileName}{a \code{character} string representing the profileName} + +\item{listPos}{a \code{data.frame} containing 2 columns. The first column, +called "snp.chromosome" contains the name of the chromosome where the +SNV is located. The second column, called "snp.position" contains the +position of the SNV on the chromosome.} + +\item{offset}{a \code{integer} to adjust if the genome start at 0 or 1.} + +\item{minCov}{a single positive \code{integer} representing the minimum +coverage needed to keep the SNVs in the analysis. Default: \code{10}.} + +\item{minProb}{a single positive \code{numeric} between 0 and 1 +representing the probability that the base change at the SNV position +is not an error. +Default: \code{0.999}.} + +\item{seqError}{a single positive \code{numeric} between 0 and 1 +representing the sequencing error rate. Default: \code{0.001}.} + +\item{dfPedProfile}{a \code{data.frame} with the information about +the sample(s). +Those are mandatory columns: "Name.ID", +"Case.ID", "Sample.Type", "Diagnosis" and "Source". All columns must be in +\code{character} strings format. The \code{data.frame} +must contain the information for all the samples passed in the +\code{listSamples} parameter.} + +\item{batch}{a single positive \code{integer} representing the current +identifier for the batch. Beware, this field is not stored anymore.} + +\item{studyDF}{a \code{data.frame} containing the information about the +study associated to the analysed sample(s). The \code{data.frame} must have +those 3 columns: "study.id", "study.desc", "study.platform". All columns +must be in \code{character} strings.} + +\item{pathProfileGDS}{a \code{character} string representing the path to +the directory where the GDS Sample files will be created.} + +\item{genoSource}{a \code{character} string with two possible values: +'snp-pileup', 'generic' or 'VCF'. It specifies if the genotype files +are generated by snp-pileup (Facets) or are a generic format CSV file +with at least those columns: +'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. +The 'Count' is the depth at the specified position; +'FileR' is the depth of the reference allele and +'File1A' is the depth of the specific alternative allele. +Finally the file can be a VCF file with at least those genotype +fields: GT, AD, DP.} + +\item{paramProfileGDS}{a \code{list} parameters ...} + +\item{verbose}{a \code{logical} indicating if the function must print +messages when running.} +} +\value{ +The function returns \code{0L} when successful. +} +\description{ +This function append the genotype information from a specific +profile into the Profile GDS file. The genotype information is extracted +from a SNV file as generated by SNP-pileup or other tools. +} +\examples{ + +## Current directory +dataDir <- file.path(tempdir()) + +## Copy required file into current directory +file.copy(from=file.path(system.file("extdata/tests", package="RAIDS"), + "ex1.txt.gz"), to=dataDir) + +## The data.frame containing the information about the study +## The 3 mandatory columns: "study.id", "study.desc", "study.platform" +## The entries should be strings, not factors (stringsAsFactors=FALSE) +studyDF <- data.frame(study.id = "MYDATA", + study.desc = "Description", + study.platform = "PLATFORM", + stringsAsFactors = FALSE) + +## The data.frame containing the information about the samples +## The entries should be strings, not factors (stringsAsFactors=FALSE) +samplePED <- data.frame(Name.ID=c("ex1", "ex2"), + Case.ID=c("Patient_h11", "Patient_h12"), + Diagnosis=rep("Cancer", 2), + Sample.Type=rep("Primary Tumor", 2), + Source=rep("Databank B", 2), stringsAsFactors=FALSE) +rownames(samplePED) <- samplePED$Name.ID + +## List of SNV positions +listPositions <- data.frame(snp.chromosome=c(rep(1, 10)), + snp.position=c(3467333, 3467428, 3469375, 3469387, 3469502, 3469527, + 3469737, 3471497, 3471565, 3471618)) + +## Append genotype information to the Profile GDS file +result <- RAIDS:::generateProfileGDS(profileFile=file.path(dataDir, "ex1.txt.gz"), + profileName="ex1", listPos=listPositions, + offset=-1, minCov=10, minProb=0.999, seqError=0.001, + dfPedProfile=samplePED, batch=1, studyDF=studyDF, + pathProfileGDS=dataDir, genoSource="snp-pileup", + verbose=FALSE) + +## The function returns OL when successful +result + +## The Profile GDS file 'ex1.gds' has been created in the +## specified directory +list.files(dataDir) + +## Unlink Profile GDS file (created for demo purpose) +unlink(file.path(dataDir, "ex1.gds")) +unlink(file.path(dataDir, "ex1.txt.gz")) + + +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} +\keyword{internal} diff --git a/man/getBlockIDs.Rd b/man/getBlockIDs.Rd new file mode 100644 index 000000000..60228ae23 --- /dev/null +++ b/man/getBlockIDs.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gdsWrapper_internal.R +\encoding{UTF-8} +\name{getBlockIDs} +\alias{getBlockIDs} +\title{Extract the block identifiers for a list of SNVs} +\usage{ +getBlockIDs(gdsRefAnnot, snpIndex, blockTypeID) +} +\arguments{ +\item{gdsRefAnnot}{an object of class \code{\link[gdsfmt]{gds.class}} +(a GDS file), the opened Reference SNV Annotation GDS file.} + +\item{snpIndex}{a \code{vectcor} of \code{integer} representing the +indexes of the SNVs of interest.} + +\item{blockTypeID}{a \code{character} string corresponding to the block +type used to extract the block identifiers. The block type must be +present in the GDS Reference Annotation file.} +} +\value{ +a \code{vector} of \code{numeric} corresponding to the +block identifiers for the SNVs of interest. +} +\description{ +The function uses the GDS Reference Annotation file to extract +the unique block identifiers for a list of SNVs. The block type that is +going to be used to extract the information has to be provided by the +user. +} +\examples{ + +# Required library +library(gdsfmt) + +## Path to the demo 1KG Annotation GDS file located in this package +dataDir <- system.file("extdata", package="RAIDS") + +path1KG <- file.path(dataDir, "tests") +fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") + +gdsRefAnnotation <- openfn.gds(fileAnnotGDS) + +## The indexes for the SNVs of interest +snpIndex <- c(1,3,5,6,9) + +## Extract the block identifiers for the SNVs represented by their indexes +## for the block created using the genes from Hsapiens Ensembl v86 +RAIDS:::getBlockIDs(gdsRefAnnot=gdsRefAnnotation, snpIndex=snpIndex, + blockTypeID="GeneS.Ensembl.Hsapiens.v86") + +closefn.gds(gdsRefAnnotation) + + +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} +\keyword{internal} diff --git a/man/getGeneBlock.Rd b/man/getGeneBlock.Rd deleted file mode 100644 index 4e682eef6..000000000 --- a/man/getGeneBlock.Rd +++ /dev/null @@ -1,37 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gdsWrapper.R -\encoding{UTF-8} -\name{getGeneBlock} -\alias{getGeneBlock} -\title{Get the block number for each SNV in snp.index} -\usage{ -getGeneBlock(gdsRefAnnot, snp.index, blockID) -} -\arguments{ -\item{gdsRefAnnot}{an object of class \code{\link[gdsfmt]{gds.class}} -(a GDS file), the opened 1KG SNV Annotation GDS file. RNA specific -Default: \code{NULL}.} - -\item{snp.index}{TODO} - -\item{blockID}{a \code{character} string corresponding to the block -identifier in \code{gdsRefAnnot}. RNA specific -Default: \code{NULL}} -} -\value{ -TODO a \code{vector} of \code{numeric} corresponding to the -block identifier -} -\description{ -TODO -} -\examples{ - -# TODO -gds <- "Demo GDS TODO" - -} -\author{ -Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -} -\keyword{internal} diff --git a/man/getRef1KGPop.Rd b/man/getRef1KGPop.Rd index 13972760c..f6e226a3e 100644 --- a/man/getRef1KGPop.Rd +++ b/man/getRef1KGPop.Rd @@ -10,12 +10,12 @@ getRef1KGPop(gdsReference, popName = "superPop") } \arguments{ \item{gdsReference}{an object of class -\link[gdsfmt]{gds.class} (a GDS file), the opened 1KG GDS file.} +\link[gdsfmt]{gds.class} (a GDS file), the opened Reference GDS file.} \item{popName}{a \code{character} string representing the name of the column -that will be fetched in the \code{data.frame} present in the 1KG GDS +that will be fetched in the \code{data.frame} present in the Reference GDS "sample.ref" node. The column must be present in the \code{data.frame}. - Default: \code{"superPop"}.} +Default: \code{"superPop"}.} } \value{ \code{vector} of \code{character} strings representing the content @@ -25,18 +25,21 @@ identifiers are used as names for the \code{vector}. } \description{ The function extract the specified column for the 'sample.ref' -node present in the 1KG GDS file. The column must be present in the +node present in the Reference GDS file. The column must be present in the \code{data.frame} saved in the 'sample.ref' node. Only the information for the reference profiles is returned. The values represent the known ancestry assignation. } \examples{ +## Required library +library(gdsfmt) + ## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") -## Open existing 1K GDS file with "sample.ref" node -nameFileGDS <- file.path(dataDir, "1KG_Demo_with_sampleREF.gds") +## Open existing demo 1K GDS file with "sample.ref" node +nameFileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") fileGDS <- snpgdsOpen(nameFileGDS) ## Extract super population information for the 1KG profiles diff --git a/man/getRefSuperPop.Rd b/man/getRefSuperPop.Rd new file mode 100644 index 000000000..b61c7511a --- /dev/null +++ b/man/getRefSuperPop.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/process1KG.R +\encoding{UTF-8} +\name{getRefSuperPop} +\alias{getRefSuperPop} +\title{Extract the from the 1KG GDS 'sample.ref' node +for the reference profiles (real ancestry assignation)} +\usage{ +getRefSuperPop(fileReferenceGDS) +} +\arguments{ +\item{fileReferenceGDS}{a \code{character} string representing the file +name of the Reference GDS file. The file must exist.} +} +\value{ +\code{vector} of \code{character} strings representing the content +of the extracted column for the 1KG GDS 'sample.ref' node. The values +represent the known ancestry assignation. The profile +identifiers are used as names for the \code{vector}. +} +\description{ +The function extract the specified column for the 'sample.ref' +node present in the Reference GDS file. The column must be present in the +\code{data.frame} saved in the 'sample.ref' node. Only the information for +the reference profiles is returned. The values +represent the known ancestry assignation. +} +\examples{ + + +## Path to the demo pedigree file is located in this package +dataDir <- system.file("extdata", package="RAIDS") + +## Open existing demo 1K GDS file with "sample.ref" node +nameFileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") + +## Extract super population information for the 1KG profiles +getRefSuperPop(fileReferenceGDS=nameFileGDS) + + +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} diff --git a/man/getTableSNV.Rd b/man/getTableSNV.Rd index acf2b2165..7f0ef426c 100644 --- a/man/getTableSNV.Rd +++ b/man/getTableSNV.Rd @@ -4,7 +4,7 @@ \name{getTableSNV} \alias{getTableSNV} \title{Extract the genotype information for a SNV dataset using -the Profile GDS file and the 1KG GDS file} +the Profile GDS file and the Reference GDS file} \usage{ getTableSNV( gdsReference, @@ -19,7 +19,7 @@ getTableSNV( } \arguments{ \item{gdsReference}{an object of class \code{\link[gdsfmt]{gds.class}} (a -GDS file), the opened 1KG GDS file.} +GDS file), the opened Reference GDS file.} \item{gdsSample}{an object of class \code{\link[gdsfmt]{gds.class}} (a GDS file), the opened Profile GDS file.} @@ -45,32 +45,32 @@ when the function is running.} } \value{ a \code{data.frame} containing: -\itemize{ -\item{cnt.tot} {a single \code{integer} representing the total coverage for +\describe{ +\item{cnt.tot}{ a single \code{integer} representing the total coverage for the SNV.} -\item{cnt.ref} {a single \code{integer} representing the coverage for +\item{cnt.ref}{ a single \code{integer} representing the coverage for the reference allele.} -\item{cnt.alt} {a single \code{integer} representing the coverage for +\item{cnt.alt}{ a single \code{integer} representing the coverage for the alternative allele.} -\item{snp.pos} {a single \code{integer} representing the SNV position.} -\item{snp.chr} {a single \code{integer} representing the SNV chromosome.} -\item{normal.geno} {a single \code{numeric} indicating the genotype of the +\item{snpPos}{ a single \code{integer} representing the SNV position.} +\item{snp.chr}{ a single \code{integer} representing the SNV chromosome.} +\item{normal.geno}{ a single \code{numeric} indicating the genotype of the SNV. The possibles are: \code{0} (wild-type homozygote), \code{1} (heterozygote), \code{2} (altenative homozygote), \code{3} indicating that the normal genotype is unknown.} -\item{pruned} { a \code{logical}} -\item{snp.index} {a \code{vector} of \code{integer} representing the -position of the SNVs in the 1KG GDS file.} -\item{keep} {a \code{logical} } -\item{hetero} {a \code{logical} } -\item{homo} {a \code{logical} } +\item{pruned}{ a \code{logical}} +\item{snp.index}{ a \code{vector} of \code{integer} representing the +position of the SNVs in the Reference GDS file.} +\item{keep}{ a \code{logical} } +\item{hetero}{ a \code{logical} } +\item{homo}{ a \code{logical} } } } \description{ The function generates a \code{data.frame} containing the genotype information from a initial list of SNVs associated to a specific -profile. The function uses the information present in the 1KG GDS file -(reference) and the Profile GDS file. +profile. The function uses the information present in the Reference GDS file +and the Profile GDS file. } \examples{ @@ -79,37 +79,34 @@ library(gdsfmt) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata/tests", package="RAIDS") -fileGDS <- file.path(dataDir, "ex1_good_small_1KG_GDS.gds") +fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") + +## Temporary Profile GDS file for one profile in temporary directory +fileProfile <- file.path(tempdir(), "ex1.gds") ## Copy the Profile GDS file demo that has been pruned and annotated -## into a test directory (deleted after the example has been run) -dataDirAllelicFraction <- file.path(system.file("extdata", package="RAIDS"), - "demoAllelicFraction") -dir.create(dataDirAllelicFraction, showWarnings=FALSE, - recursive=FALSE, mode="0777") file.copy(file.path(dataDir, "ex1_demo_with_pruning_and_1KG_annot.gds"), - file.path(dataDirAllelicFraction, "ex1.gds")) + fileProfile) ## Open the reference GDS file (demo version) gds1KG <- snpgdsOpen(fileGDS) -## Profile GDS file for one profile -fileProfile <- file.path(dataDirAllelicFraction, "ex1.gds") +## Open Profile GDS file for one profile profileGDS <- openfn.gds(fileProfile) ## The function returns a data frame containing the SNVs information result <- RAIDS:::getTableSNV(gdsReference=gds1KG, gdsSample=profileGDS, - currentProfile="ex1", studyID="MYDATA", minCov=10L, minProb=0.999, - eProb=0.001, verbose=FALSE) + currentProfile="ex1", studyID="MYDATA", minCov=10L, minProb=0.999, + eProb=0.001, verbose=FALSE) head(result) ## Close both GDS files (important) closefn.gds(profileGDS) closefn.gds(gds1KG) -## Unlink Profile GDS file (created for demo purpose) -unlink(file.path(dataDirAllelicFraction, "ex1.gds")) -unlink(dataDirAllelicFraction) +## Remove Profile GDS file (created for demo purpose) +unlink(fileProfile, force=TRUE) + } \author{ diff --git a/man/groupChr1KGSNV.Rd b/man/groupChr1KGSNV.Rd index a31cf95df..82a9f806f 100644 --- a/man/groupChr1KGSNV.Rd +++ b/man/groupChr1KGSNV.Rd @@ -19,7 +19,7 @@ the merged genotyping files for each sample will be created. The path must exists.} } \value{ -The integer \code{0L} when successful. +The integer \code{0L} when successful or \code{FALSE} if not. } \description{ This function merge all the genotyping files associated to one @@ -30,10 +30,40 @@ files for all samples present in the input directory. } \examples{ -## Path to the demo pedigree file is located in this package +## Path to the demo vcf files in this package dataDir <- system.file("extdata", package="RAIDS") +pathGenoTar <- file.path(dataDir, "demoGenoChr", "demoGenoChr.tar") -## TODO +## Path where the chromosomes files will be located +pathGeno <- file.path(tempdir(), "tempGeno") +dir.create(pathGeno, showWarnings=FALSE) + +## Untar the file that contains the VCF files for 3 samples split by +## chromosome (one directory per chromosome) +untar(tarfile=pathGenoTar, exdir=pathGeno) + +## Path where the output VCF file will be created is +## the same where the split VCF are (pathGeno) + +## The files must not exist +if (!file.exists(file.path(pathGeno, "NA12003.csv.bz2")) && + !file.exists(file.path(pathGeno, "NA12004.csv.bz2")) && + !file.exists(file.path(pathGeno, "NA12005.csv.bz2"))) { + + ## Return 0 when successful + ## The files "NA12003.csv.bz2", "NA12004.csv.bz2" and + ## "NA12005.csv.bz2" should not be present in the current directory + groupChr1KGSNV(pathGenoChr=pathGeno, pathOut=pathGeno) + + ## Validate that files have been created + file.exists(file.path(pathGeno, "NA12003.csv.bz2")) + file.exists(file.path(pathGeno, "NA12004.csv.bz2")) + file.exists(file.path(pathGeno, "NA12005.csv.bz2")) + +} + +## Remove temporary directory +unlink(pathGeno, recursive=TRUE, force=TRUE) } \author{ diff --git a/man/groupChrPruning.Rd b/man/groupChrPruning.Rd deleted file mode 100644 index 9f987e76d..000000000 --- a/man/groupChrPruning.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/tools.R -\encoding{UTF-8} -\name{groupChrPruning} -\alias{groupChrPruning} -\title{Merge the pruning files by chromosome in one file} -\usage{ -groupChrPruning(pathPrunedGDS, filePref, fileOUT) -} -\arguments{ -\item{pathPrunedGDS}{TODO} - -\item{filePref}{TODO} - -\item{fileOUT}{TODO} -} -\value{ -TODO a \code{vector} of \code{numeric} -} -\description{ -TODO -} -\examples{ - -## Path to the demo pedigree file is located in this package -dataDir <- system.file("extdata", package="RAIDS") - -## TODO - -} -\author{ -Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -} -\keyword{internal} diff --git a/man/identifyRelative.Rd b/man/identifyRelative.Rd index be12f2791..29aad31d6 100644 --- a/man/identifyRelative.Rd +++ b/man/identifyRelative.Rd @@ -3,14 +3,14 @@ \encoding{UTF-8} \name{identifyRelative} \alias{identifyRelative} -\title{Identify genetically unrelated patients in GDS 1KG file} +\title{Identify genetically unrelated patients in GDS Reference file} \usage{ identifyRelative(gds, maf = 0.05, thresh = 2^(-11/2), fileIBD, filePart) } \arguments{ \item{gds}{an object of class \code{\link[SNPRelate:SNPGDSFileClass]{SNPRelate::SNPGDSFileClass}}, the -1KG GDS file.} +Reference GDS file.} \item{maf}{a single \code{numeric} representing the threshold for the minor allele frequency. Only the SNPs with ">= maf" will be used. @@ -27,7 +27,7 @@ The extension of the file must be '.rds'.} \item{filePart}{a \code{character} string representing the path and file name of the RDS file that will be created. The RDS file will contain the -information about the 1KG patients that are unrelated. The file will +information about the Reference patients that are unrelated. The file will contains two lists: the \code{list} of related samples, called \code{rels} and the list of unrelated samples, called \code{unrels}. The extension of the file must be '.rds'.} @@ -37,40 +37,61 @@ The extension of the file must be '.rds'.} } \description{ The function identify patients that are genetically related in -the 1KG file. It generates a first RDS file with the list of unrelated +the Reference file. It generates a first RDS file with the list of unrelated patient. It also generates a second RDS file with the kinship coefficient between the patients. } \examples{ -## Needed packages -library(withr) +## Required package library(gdsfmt) ## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") -## Open existing 1K GDS file -fileGDS <- file.path(dataDir, "1KG_Demo.gds") +## Open existing demo Reference GDS file +fileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") tmpGDS <- snpgdsOpen(fileGDS) ## Temporary output files ## The first RDS file will contain the list of unrelated patients ## The second RDS file will contain the kinship information between patients -patientTmpFile <- local_file(file.path(dataDir, - "unrelatedPatients_TEMP.rds")) -ibdTmpFile <- local_file(file.path(dataDir,"ibd_TEMP.rds")) +patientTmpFile <- "unrelatedPatients_TEMP.rds" +ibdTmpFile <- "ibd_TEMP.rds" -## Identify unrelated patients in 1KG GDS file -identifyRelative(gds=tmpGDS, maf=0.05, thresh=2^(-11/2), - fileIBD=ibdTmpFile, filePart=patientTmpFile) +## Different code depending of the withr package availability +if (requireNamespace("withr", quietly=TRUE)) { -## Close 1K GDS file -closefn.gds(tmpGDS) + ## Temporary output files + ## The first RDS file will contain the list of unrelated patients + ## The second RDS file will contain the kinship information + ## between patients + patientTmpFileLocal <- withr::local_file(patientTmpFile) + ibdTmpFileLocal <- withr::local_file(ibdTmpFile) -## Remove temporary files -deferred_run() + ## Identify unrelated patients in demo Reference GDS file + identifyRelative(gds=tmpGDS, maf=0.05, thresh=2^(-11/2), + fileIBD=ibdTmpFileLocal, filePart=patientTmpFileLocal) + ## Close demo Reference GDS file + closefn.gds(tmpGDS) + + ## Remove temporary files + withr::deferred_run() + +} else { + + ## Identify unrelated patients in demo Reference GDS file + identifyRelative(gds=tmpGDS, maf=0.05, thresh=2^(-11/2), + fileIBD=ibdTmpFile, filePart=patientTmpFile) + + ## Close demo Reference GDS file + closefn.gds(tmpGDS) + + ## Remove temporary files + unlink(patientTmpFile, force=TRUE) + unlink(ibdTmpFile, force=TRUE) +} } \author{ diff --git a/man/identifyRelativeRef.Rd b/man/identifyRelativeRef.Rd new file mode 100644 index 000000000..fc04f768e --- /dev/null +++ b/man/identifyRelativeRef.Rd @@ -0,0 +1,96 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/process1KG.R +\encoding{UTF-8} +\name{identifyRelativeRef} +\alias{identifyRelativeRef} +\title{Identify genetically unrelated patients in GDS Reference file} +\usage{ +identifyRelativeRef( + fileReferenceGDS, + maf = 0.05, + thresh = 2^(-11/2), + fileIBD, + filePart +) +} +\arguments{ +\item{fileReferenceGDS}{a \code{character} string representing the file +name of the Reference GDS file. The file must exist.} + +\item{maf}{a single \code{numeric} representing the threshold for the minor +allele frequency. Only the SNPs with ">= maf" will be used. +Default: \code{0.05}.} + +\item{thresh}{a single \code{numeric} representing the threshold value used +to decide if a pair of individuals is ancestrally divergent. +Default: \code{2^(-11/2)}.} + +\item{fileIBD}{a \code{character} string representing the path and file +name of the RDS file that will be created. The RDS file will contain the +kinship coefficient between the patients. +The extension of the file must be '.rds'.} + +\item{filePart}{a \code{character} string representing the path and file +name of the RDS file that will be created. The RDS file will contain the +information about the Reference patients that are unrelated. The file will +contains two lists: the \code{list} of related samples, called \code{rels} +and the list of unrelated samples, called \code{unrels}. +The extension of the file must be '.rds'.} +} +\value{ +\code{NULL} invisibly. +} +\description{ +The function identify patients that are genetically related in +the Reference file. It generates a first RDS file with the list of unrelated +patient. It also generates a second RDS file with the kinship coefficient +between the patients. +} +\examples{ + + +## Path to the demo pedigree file is located in this package +dataDir <- system.file("extdata", package="RAIDS") + +## Open existing demo Reference GDS file +fileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") + + +## Temporary output files +## The first RDS file will contain the list of unrelated patients +## The second RDS file will contain the kinship information between patients +patientTmpFile <- "unrelatedPatients_TEMP.rds" +ibdTmpFile <- "ibd_TEMP.rds" + +## Different code depending of the withr package availability +if (requireNamespace("withr", quietly=TRUE)) { + + ## Temporary output files + ## The first RDS file will contain the list of unrelated patients + ## The second RDS file will contain the kinship information + ## between patients + patientTmpFileLocal <- withr::local_file(patientTmpFile) + ibdTmpFileLocal <- withr::local_file(ibdTmpFile) + + ## Identify unrelated patients in demo Reference GDS file + identifyRelativeRef(fileReferenceGDS=fileGDS, maf=0.05, thresh=2^(-11/2), + fileIBD=ibdTmpFileLocal, filePart=patientTmpFileLocal) + + ## Remove temporary files + withr::deferred_run() + +} else { + + ## Identify unrelated patients in demo Reference GDS file + identifyRelativeRef(fileReferenceGDS=fileGDS, maf=0.05, thresh=2^(-11/2), + fileIBD=ibdTmpFile, filePart=patientTmpFile) + + ## Remove temporary files + unlink(patientTmpFile, force=TRUE) + unlink(ibdTmpFile, force=TRUE) +} + +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} diff --git a/man/inferAncestry.Rd b/man/inferAncestry.Rd new file mode 100644 index 000000000..a139c6f27 --- /dev/null +++ b/man/inferAncestry.Rd @@ -0,0 +1,281 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/processStudy.R +\encoding{UTF-8} +\name{inferAncestry} +\alias{inferAncestry} +\title{Run most steps leading to the ancestry inference call on a specific +DNA profile} +\usage{ +inferAncestry( + profileFile, + pathProfileGDS, + fileReferenceGDS, + fileReferenceAnnotGDS, + chrInfo, + syntheticRefDF, + genoSource = c("snp-pileup", "generic", "VCF", "bam"), + np = 1L, + verbose = FALSE +) +} +\arguments{ +\item{profileFile}{a \code{character} string representing the path and the +file name of the genotype file or the bam if genoSource is snp-pileup the +fine extension must be .txt.gz, if VCF the extension must be .vcf.gz} + +\item{pathProfileGDS}{a \code{character} string representing the path to +the directory where the GDS Profile files will be created. +Default: \code{NULL}.} + +\item{fileReferenceGDS}{a \code{character} string representing the file +name of the Population Reference GDS file. The file must exist.} + +\item{fileReferenceAnnotGDS}{a \code{character} string representing the +file name of the Population Reference GDS Annotation file. The file +must exist.} + +\item{chrInfo}{a \code{vector} of positive \code{integer} values +representing the length of the chromosomes. See 'details' section.} + +\item{syntheticRefDF}{a \code{data.frame} containing a subset of +reference profiles for each sub-population present in the Reference GDS +file. The \code{data.frame} must have those columns: +\describe{ +\item{sample.id}{ a \code{character} string representing the sample +identifier. } +\item{pop.group}{ a \code{character} string representing the +subcontinental population assigned to the sample. } +\item{superPop}{ a \code{character} string representing the +super-population assigned to the sample. } +}} + +\item{genoSource}{a \code{character} string with four possible values: +'snp-pileup', 'generic', 'VCF' or 'bam'. It specifies if the genotype files +are generated by snp-pileup (Facets) or are a generic format CSV file +with at least those columns: +'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. +The 'Count' is the depth at the specified position; +'FileR' is the depth of the reference allele and +'File1A' is the depth of the specific alternative allele. +Finally the file can be a VCF file with at least those genotype +fields: GT, AD, DP.} + +\item{np}{a single positive \code{integer} specifying the number of +threads to be used. Default: \code{1L}.} + +\item{verbose}{a \code{logical} indicating if messages should be printed +to show how the different steps in the function. Default: \code{FALSE}.} +} +\value{ +a \code{list} containing 4 entries: +\describe{ +\item{\code{pcaSample}}{ a \code{list} containing the information related +to the eigenvectors. The \code{list} contains those 3 entries: +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current profile.} +\item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric} containing +the eigenvectors for the reference profiles.} +\item{\code{eigenvector}}{ a \code{matrix} of \code{numeric} containing the +eigenvectors for the current profile projected on the PCA from the +reference profiles.} +} +} +\item{\code{paraSample}}{ a \code{list} containing the results with +different \code{D} and \code{K} values that lead to optimal parameter +selection. The \code{list} contains those entries: +\describe{ +\item{\code{dfPCA}}{ a \code{data.frame} containing statistical results +on all combined synthetic results done with a fixed value of \code{D} (the +number of dimensions). The \code{data.frame} contains those columns: +\describe{ +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions).} +\item{\code{median}}{ a \code{numeric} representing the median of the +minimum AUROC obtained (within super populations) for all combination of +the fixed \code{D} value and all tested \code{K} values. } +\item{\code{mad}}{ a \code{numeric} representing the MAD of the minimum +AUROC obtained (within super populations) for all combination of the fixed +\code{D} value and all tested \code{K} values. } +\item{\code{upQuartile}}{ a \code{numeric} representing the upper quartile +of the minimum AUROC obtained (within super populations) for all +combination of the fixed \code{D} value and all tested \code{K} values. } +\item{\code{k}}{ a \code{numeric} representing the optimal \code{K} value +(the number of neighbors) for a fixed \code{D} value. } +} +} +\item{\code{dfPop}}{ a \code{data.frame} containing statistical results on +all combined synthetic results done with different values of \code{D} (the +number of dimensions) and \code{K} (the number of neighbors). +The \code{data.frame} contains those columns: +\describe{ +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions).} +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors).} +\item{\code{AUROC.min}}{ a \code{numeric} representing the minimum accuracy +obtained by grouping all the synthetic results by super-populations, for +the specified values of \code{D} and \code{K}.} +\item{\code{AUROC}}{ a \code{numeric} representing the accuracy obtained +by grouping all the synthetic results for the specified values of \code{D} +and \code{K}.} +\item{\code{Accu.CM}}{ a \code{numeric} representing the value of accuracy +of the confusion matrix obtained by grouping all the synthetic results for +the specified values of \code{D} and \code{K}.} +} +} +\item{\code{dfAUROC}}{ a \code{data.frame} the summary of the results by +super-population. The \code{data.frame} contains +those columns: +\describe{ +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions).} +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors).} +\item{\code{Call}}{ a \code{character} string representing the +super-population.} +\item{\code{L}}{ a \code{numeric} representing the lower value of the 95\% +confidence interval for the AUROC obtained for the fixed values of +super-population, \code{D} and \code{K}.} +\item{\code{AUROC}}{ a \code{numeric} representing the AUROC obtained for the +fixed values of super-population, \code{D} and \code{K}.} +\item{\code{H}}{ a \code{numeric} representing the higher value of the 95\% +confidence interval for the AUROC obtained for the fixed values of +super-population, \code{D} and \code{K}.} +} +} +\item{\code{D}}{ a \code{numeric} representing the optimal \code{D} value +(the number of dimensions) for the specific profile.} +\item{\code{K}}{ a \code{numeric} representing the optimal \code{K} value +(the number of neighbors) for the specific profile.} +\item{\code{listD}}{ a \code{numeric} representing the optimal \code{D} +values (the number of dimensions) for the specific profile. More than one +\code{D} is possible.} +} +} +\item{\code{KNNSample}}{ a \code{data.frame} containing the inferred ancestry +for different values of \code{K} and \code{D}. The \code{data.frame} +contains those columns: +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current profile.} +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions) used to infer the ancestry. } +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors) used to infer the ancestry. } +\item{\code{SuperPop}}{ a \code{character} string representing the inferred +ancestry for the specified \code{D} and \code{K} values.} +} +} +\item{\code{KNNSynthetic}}{ a \code{data.frame} containing the inferred ancestry +for each synthetic data for different values of \code{K} and \code{D}. +The \code{data.frame} +contains those columns: "sample.id", "D", "K", "infer.superPop", "ref.superPop" +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current synthetic data.} +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions) used to infer the ancestry. } +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors) used to infer the ancestry. } +\item{\code{infer.superPop}}{ a \code{character} string representing the inferred +ancestry for the specified \code{D} and \code{K} values.} +\item{\code{ref.superPop}}{ a \code{character} string representing the known +ancestry from the reference} +} +} +\item{\code{Ancestry}}{ a \code{data.frame} containing the inferred +ancestry for the current profile. The \code{data.frame} contains those +columns: +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current profile.} +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions) used to infer the ancestry.} +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors) used to infer the ancestry.} +\item{\code{SuperPop}}{ a \code{character} string representing the inferred +ancestry.} +} +} +} +} +\description{ +This function runs most steps leading to the ancestry inference +call on a specific RNA profile. First, the function creates the +Profile GDS file for the specific profile using the information from a +RDS Sample description file and the Population Reference GDS file. +} +\examples{ + +## Required library for GDS +library(SNPRelate) + +## Path to the demo 1KG GDS file is located in this package +dataDir <- system.file("extdata", package="RAIDS") + +################################################################# +## The 1KG GDS file and the 1KG SNV Annotation GDS file +## need to be located in the same directory +## Note that the 1KG GDS file used for this example is a +## simplified version and CANNOT be used for any real analysis +################################################################# +path1KG <- file.path(dataDir, "tests") + +fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") +fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") + +################################################################# +## The Sample SNP pileup files (one per sample) need +## to be located in the same directory. +################################################################# +demoProfileEx1 <- file.path(dataDir, "example", "snpPileup", "ex1.txt.gz") + +################################################################# +## The path where the Profile GDS Files (one per sample) +## will be created need to be specified. +################################################################# +pathProfileGDS <- file.path(tempdir(), "out.tmp") + +#################################################################### +## Fix seed to ensure reproducible results +#################################################################### +set.seed(3043) + +gds1KG <- snpgdsOpen(fileReferenceGDS) +dataRef <- select1KGPop(gds1KG, nbProfiles=2L) +closefn.gds(gds1KG) + +## Required library for this example to run correctly +if (requireNamespace("GenomeInfoDb", quietly=TRUE) && + requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { + + ## Chromosome length information + ## chr23 is chrX, chr24 is chrY and chrM is 25 + chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] + + \donttest{ + + res <- inferAncestry(profileFile=demoProfileEx1, + pathProfileGDS=pathProfileGDS, + fileReferenceGDS=fileReferenceGDS, + fileReferenceAnnotGDS=fileAnnotGDS, + chrInfo=chrInfo, + syntheticRefDF=dataRef, + genoSource="snp-pileup") + + unlink(pathProfileGDS, recursive=TRUE, force=TRUE) + + } +} + +} +\references{ +Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, +Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution +of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. +doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} diff --git a/man/inferAncestryDNA.Rd b/man/inferAncestryDNA.Rd new file mode 100644 index 000000000..e5aeb0ba5 --- /dev/null +++ b/man/inferAncestryDNA.Rd @@ -0,0 +1,281 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/processStudy.R +\encoding{UTF-8} +\name{inferAncestryDNA} +\alias{inferAncestryDNA} +\title{Run most steps leading to the ancestry inference call on a specific +DNA profile (alias for inferAncestry )} +\usage{ +inferAncestryDNA( + profileFile, + pathProfileGDS, + fileReferenceGDS, + fileReferenceAnnotGDS, + chrInfo, + syntheticRefDF, + genoSource = c("snp-pileup", "generic", "VCF", "bam"), + np = 1L, + verbose = FALSE +) +} +\arguments{ +\item{profileFile}{a \code{character} string representing the path and the +file name of the genotype file or the bam if genoSource is snp-pileup the +fine extension must be .txt.gz, if VCF the extension must be .vcf.gz} + +\item{pathProfileGDS}{a \code{character} string representing the path to +the directory where the GDS Profile files will be created. +Default: \code{NULL}.} + +\item{fileReferenceGDS}{a \code{character} string representing the file +name of the Population Reference GDS file. The file must exist.} + +\item{fileReferenceAnnotGDS}{a \code{character} string representing the +file name of the Population Reference GDS Annotation file. The file +must exist.} + +\item{chrInfo}{a \code{vector} of positive \code{integer} values +representing the length of the chromosomes. See 'details' section.} + +\item{syntheticRefDF}{a \code{data.frame} containing a subset of +reference profiles for each sub-population present in the Reference GDS +file. The \code{data.frame} must have those columns: +\describe{ +\item{sample.id}{ a \code{character} string representing the sample +identifier. } +\item{pop.group}{ a \code{character} string representing the +subcontinental population assigned to the sample. } +\item{superPop}{ a \code{character} string representing the +super-population assigned to the sample. } +}} + +\item{genoSource}{a \code{character} string with four possible values: +'snp-pileup', 'generic', 'VCF' or 'bam'. It specifies if the genotype files +are generated by snp-pileup (Facets) or are a generic format CSV file +with at least those columns: +'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. +The 'Count' is the depth at the specified position; +'FileR' is the depth of the reference allele and +'File1A' is the depth of the specific alternative allele. +Finally the file can be a VCF file with at least those genotype +fields: GT, AD, DP.} + +\item{np}{a single positive \code{integer} specifying the number of +threads to be used. Default: \code{1L}.} + +\item{verbose}{a \code{logical} indicating if messages should be printed +to show how the different steps in the function. Default: \code{FALSE}.} +} +\value{ +a \code{list} containing 4 entries: +\describe{ +\item{\code{pcaSample}}{ a \code{list} containing the information related +to the eigenvectors. The \code{list} contains those 3 entries: +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current profile.} +\item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric} containing +the eigenvectors for the reference profiles.} +\item{\code{eigenvector}}{ a \code{matrix} of \code{numeric} containing the +eigenvectors for the current profile projected on the PCA from the +reference profiles.} +} +} +\item{\code{paraSample}}{ a \code{list} containing the results with +different \code{D} and \code{K} values that lead to optimal parameter +selection. The \code{list} contains those entries: +\describe{ +\item{\code{dfPCA}}{ a \code{data.frame} containing statistical results +on all combined synthetic results done with a fixed value of \code{D} (the +number of dimensions). The \code{data.frame} contains those columns: +\describe{ +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions).} +\item{\code{median}}{ a \code{numeric} representing the median of the +minimum AUROC obtained (within super populations) for all combination of +the fixed \code{D} value and all tested \code{K} values. } +\item{\code{mad}}{ a \code{numeric} representing the MAD of the minimum +AUROC obtained (within super populations) for all combination of the fixed +\code{D} value and all tested \code{K} values. } +\item{\code{upQuartile}}{ a \code{numeric} representing the upper quartile +of the minimum AUROC obtained (within super populations) for all +combination of the fixed \code{D} value and all tested \code{K} values. } +\item{\code{k}}{ a \code{numeric} representing the optimal \code{K} value +(the number of neighbors) for a fixed \code{D} value. } +} +} +\item{\code{dfPop}}{ a \code{data.frame} containing statistical results on +all combined synthetic results done with different values of \code{D} (the +number of dimensions) and \code{K} (the number of neighbors). +The \code{data.frame} contains those columns: +\describe{ +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions).} +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors).} +\item{\code{AUROC.min}}{ a \code{numeric} representing the minimum accuracy +obtained by grouping all the synthetic results by super-populations, for +the specified values of \code{D} and \code{K}.} +\item{\code{AUROC}}{ a \code{numeric} representing the accuracy obtained +by grouping all the synthetic results for the specified values of \code{D} +and \code{K}.} +\item{\code{Accu.CM}}{ a \code{numeric} representing the value of accuracy +of the confusion matrix obtained by grouping all the synthetic results for +the specified values of \code{D} and \code{K}.} +} +} +\item{\code{dfAUROC}}{ a \code{data.frame} the summary of the results by +super-population. The \code{data.frame} contains +those columns: +\describe{ +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions).} +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors).} +\item{\code{Call}}{ a \code{character} string representing the +super-population.} +\item{\code{L}}{ a \code{numeric} representing the lower value of the 95\% +confidence interval for the AUROC obtained for the fixed values of +super-population, \code{D} and \code{K}.} +\item{\code{AUROC}}{ a \code{numeric} representing the AUROC obtained for the +fixed values of super-population, \code{D} and \code{K}.} +\item{\code{H}}{ a \code{numeric} representing the higher value of the 95\% +confidence interval for the AUROC obtained for the fixed values of +super-population, \code{D} and \code{K}.} +} +} +\item{\code{D}}{ a \code{numeric} representing the optimal \code{D} value +(the number of dimensions) for the specific profile.} +\item{\code{K}}{ a \code{numeric} representing the optimal \code{K} value +(the number of neighbors) for the specific profile.} +\item{\code{listD}}{ a \code{numeric} representing the optimal \code{D} +values (the number of dimensions) for the specific profile. More than one +\code{D} is possible.} +} +} +\item{\code{KNNSample}}{ a \code{data.frame} containing the inferred ancestry +for different values of \code{K} and \code{D}. The \code{data.frame} +contains those columns: +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current profile.} +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions) used to infer the ancestry. } +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors) used to infer the ancestry. } +\item{\code{SuperPop}}{ a \code{character} string representing the inferred +ancestry for the specified \code{D} and \code{K} values.} +} +} +\item{\code{KNNSynthetic}}{ a \code{data.frame} containing the inferred ancestry +for each synthetic data for different values of \code{K} and \code{D}. +The \code{data.frame} +contains those columns: "sample.id", "D", "K", "infer.superPop", "ref.superPop" +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current synthetic data.} +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions) used to infer the ancestry. } +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors) used to infer the ancestry. } +\item{\code{infer.superPop}}{ a \code{character} string representing the inferred +ancestry for the specified \code{D} and \code{K} values.} +\item{\code{ref.superPop}}{ a \code{character} string representing the known +ancestry from the reference} +} +} +\item{\code{Ancestry}}{ a \code{data.frame} containing the inferred +ancestry for the current profile. The \code{data.frame} contains those +columns: +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current profile.} +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions) used to infer the ancestry.} +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors) used to infer the ancestry.} +\item{\code{SuperPop}}{ a \code{character} string representing the inferred +ancestry.} +} +} +} +} +\description{ +This function runs most steps leading to the ancestry inference +call on a specific RNA profile. First, the function creates the +Profile GDS file for the specific profile using the information from a +RDS Sample description file and the Population Reference GDS file. +} +\examples{ + +## Required library for GDS +library(SNPRelate) + +## Path to the demo 1KG GDS file is located in this package +dataDir <- system.file("extdata", package="RAIDS") + +################################################################# +## The 1KG GDS file and the 1KG SNV Annotation GDS file +## need to be located in the same directory +## Note that the 1KG GDS file used for this example is a +## simplified version and CANNOT be used for any real analysis +################################################################# +path1KG <- file.path(dataDir, "tests") + +fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") +fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") + +################################################################# +## The Sample SNP pileup files (one per sample) need +## to be located in the same directory. +################################################################# +demoProfileEx1 <- file.path(dataDir, "example", "snpPileup", "ex1.txt.gz") + +################################################################# +## The path where the Profile GDS Files (one per sample) +## will be created need to be specified. +################################################################# +pathProfileGDS <- file.path(tempdir(), "out.tmp") + +#################################################################### +## Fix seed to ensure reproducible results +#################################################################### +set.seed(3043) + +gds1KG <- snpgdsOpen(fileReferenceGDS) +dataRef <- select1KGPop(gds1KG, nbProfiles=2L) +closefn.gds(gds1KG) + +## Required library for this example to run correctly +if (requireNamespace("GenomeInfoDb", quietly=TRUE) && + requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { + + ## Chromosome length information + ## chr23 is chrX, chr24 is chrY and chrM is 25 + chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] + + \donttest{ + + res <- inferAncestryDNA(profileFile=demoProfileEx1, + pathProfileGDS=pathProfileGDS, + fileReferenceGDS=fileReferenceGDS, + fileReferenceAnnotGDS=fileAnnotGDS, + chrInfo=chrInfo, + syntheticRefDF=dataRef, + genoSource="snp-pileup") + + unlink(pathProfileGDS, recursive=TRUE, force=TRUE) + + } +} + +} +\references{ +Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, +Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution +of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. +doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} diff --git a/man/inferAncestryGeneAware.Rd b/man/inferAncestryGeneAware.Rd new file mode 100644 index 000000000..27c7f9c09 --- /dev/null +++ b/man/inferAncestryGeneAware.Rd @@ -0,0 +1,304 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/processStudy.R +\encoding{UTF-8} +\name{inferAncestryGeneAware} +\alias{inferAncestryGeneAware} +\title{Run most steps leading to the ancestry inference call on a specific +RNA profile} +\usage{ +inferAncestryGeneAware( + profileFile, + pathProfileGDS, + fileReferenceGDS, + fileReferenceAnnotGDS, + chrInfo, + syntheticRefDF, + genoSource = c("snp-pileup", "generic", "VCF", "bam"), + np = 1L, + blockTypeID, + verbose = FALSE +) +} +\arguments{ +\item{profileFile}{a \code{character} string representing the path and the +file name of the genotype file or the bam if genoSource is snp-pileup the +fine extension must be .txt.gz, if VCF the extension must be .vcf.gz} + +\item{pathProfileGDS}{a \code{character} string representing the path to +the directory where the GDS Profile files will be created. +Default: \code{NULL}.} + +\item{fileReferenceGDS}{a \code{character} string representing the file +name of the Population Reference GDS file. The file must exist.} + +\item{fileReferenceAnnotGDS}{a \code{character} string representing the +file name of the Population Reference GDS Annotation file. The file +must exist.} + +\item{chrInfo}{a \code{vector} of positive \code{integer} values +representing the length of the chromosomes. See 'details' section.} + +\item{syntheticRefDF}{a \code{data.frame} containing a subset of +reference profiles for each sub-population present in the Reference GDS +file. The \code{data.frame} must have those columns: +\describe{ +\item{sample.id}{ a \code{character} string representing the sample +identifier. } +\item{pop.group}{ a \code{character} string representing the +subcontinental population assigned to the sample. } +\item{superPop}{ a \code{character} string representing the +super-population assigned to the sample. } +}} + +\item{genoSource}{a \code{character} string with four possible values: +'snp-pileup', 'generic', 'VCF' or 'bam'. It specifies if the genotype files +are generated by snp-pileup (Facets) or are a generic format CSV file +with at least those columns: +'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. +The 'Count' is the depth at the specified position; +'FileR' is the depth of the reference allele and +'File1A' is the depth of the specific alternative allele. +Finally the file can be a VCF file with at least those genotype +fields: GT, AD, DP.} + +\item{np}{a single positive \code{integer} specifying the number of +threads to be used. Default: \code{1L}.} + +\item{blockTypeID}{a \code{character} string corresponding to the block +type used to extract the block identifiers. The block type must be +present in the GDS Reference Annotation file.} + +\item{verbose}{a \code{logical} indicating if messages should be printed +to show how the different steps in the function. Default: \code{FALSE}.} +} +\value{ +a \code{list} containing 4 entries: +\describe{ +\item{\code{pcaSample}}{ a \code{list} containing the information related +to the eigenvectors. The \code{list} contains those 3 entries: +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current profile.} +\item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric} containing +the eigenvectors for the reference profiles.} +\item{\code{eigenvector}}{ a \code{matrix} of \code{numeric} containing the +eigenvectors for the current profile projected on the PCA from the +reference profiles.} +} +} +\item{\code{paraSample}}{ a \code{list} containing the results with +different \code{D} and \code{K} values that lead to optimal parameter +selection. The \code{list} contains those entries: +\describe{ +\item{\code{dfPCA}}{ a \code{data.frame} containing statistical results +on all combined synthetic results done with a fixed value of \code{D} (the +number of dimensions). The \code{data.frame} contains those columns: +\describe{ +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions).} +\item{\code{median}}{ a \code{numeric} representing the median of the +minimum AUROC obtained (within super populations) for all combination of +the fixed \code{D} value and all tested \code{K} values. } +\item{\code{mad}}{ a \code{numeric} representing the MAD of the minimum +AUROC obtained (within super populations) for all combination of the fixed +\code{D} value and all tested \code{K} values. } +\item{\code{upQuartile}}{ a \code{numeric} representing the upper quartile +of the minimum AUROC obtained (within super populations) for all +combination of the fixed \code{D} value and all tested \code{K} values. } +\item{\code{k}}{ a \code{numeric} representing the optimal \code{K} value +(the number of neighbors) for a fixed \code{D} value. } +} +} +\item{\code{dfPop}}{ a \code{data.frame} containing statistical results on +all combined synthetic results done with different values of \code{D} (the +number of dimensions) and \code{K} (the number of neighbors). +The \code{data.frame} contains those columns: +\describe{ +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions).} +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors).} +\item{\code{AUROC.min}}{ a \code{numeric} representing the minimum accuracy +obtained by grouping all the synthetic results by super-populations, for +the specified values of \code{D} and \code{K}.} +\item{\code{AUROC}}{ a \code{numeric} representing the accuracy obtained +by grouping all the synthetic results for the specified values of \code{D} +and \code{K}.} +\item{\code{Accu.CM}}{ a \code{numeric} representing the value of accuracy +of the confusion matrix obtained by grouping all the synthetic results for +the specified values of \code{D} and \code{K}.} +} +} +\item{\code{dfAUROC}}{ a \code{data.frame} the summary of the results by +super-population. The \code{data.frame} contains +those columns: +\describe{ +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions).} +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors).} +\item{\code{Call}}{ a \code{character} string representing the +super-population.} +\item{\code{L}}{ a \code{numeric} representing the lower value of the 95\% +confidence interval for the AUROC obtained for the fixed values of +super-population, \code{D} and \code{K}.} +\item{\code{AUROC}}{ a \code{numeric} representing the AUROC obtained for +the fixed values of super-population, \code{D} and \code{K}.} +\item{\code{H}}{ a \code{numeric} representing the higher value of the 95\% +confidence interval for the AUROC obtained for the fixed values of +super-population, \code{D} and \code{K}.} +} +} +\item{\code{D}}{ a \code{numeric} representing the optimal \code{D} value +(the number of dimensions) for the specific profile.} +\item{\code{K}}{ a \code{numeric} representing the optimal \code{K} value +(the number of neighbors) for the specific profile.} +\item{\code{listD}}{ a \code{numeric} representing the optimal \code{D} +values (the number of dimensions) for the specific profile. More than one +\code{D} is possible.} +} +} +\item{\code{KNNSample}}{ a \code{data.frame} containing the inferred +ancestry for different values of \code{K} and \code{D}. The +\code{data.frame} contains those columns: +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current profile.} +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions) used to infer the ancestry. } +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors) used to infer the ancestry. } +\item{\code{SuperPop}}{ a \code{character} string representing the inferred +ancestry for the specified \code{D} and \code{K} values.} +} +} +\item{\code{KNNSynthetic}}{ a \code{data.frame} containing the inferred +ancestry for each synthetic data for different values of \code{K} and +\code{D}. +The \code{data.frame} +contains those columns: +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current synthetic data.} +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions) used to infer the ancestry. } +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors) used to infer the ancestry. } +\item{\code{infer.superPop}}{ a \code{character} string representing the +inferred ancestry for the specified \code{D} and \code{K} values.} +\item{\code{ref.superPop}}{ a \code{character} string representing the known +ancestry from the reference} +} +} +\item{\code{Ancestry}}{ a \code{data.frame} containing the inferred +ancestry for the current profile. The \code{data.frame} contains those +columns: +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current profile.} +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions) used to infer the ancestry.} +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors) used to infer the ancestry.} +\item{\code{SuperPop}}{ a \code{character} string representing the inferred +ancestry.} +} +} +} +} +\description{ +This function runs most steps leading to the ancestry inference +call on a specific RNA profile. First, the function creates the +Profile GDS file for the specific profile using the information from a +RDS Sample description file and the Population Reference GDS file. +} +\details{ +The runExomeAncestry() function generates 3 types of files +in the OUTPUT directory. +\describe{ +\item{Ancestry Inference}{ The ancestry inference CSV file +(".Ancestry.csv" file)} +\item{Inference Informaton}{ The inference information RDS file +(".infoCall.rds" file)} +\item{Synthetic Information}{ The parameter information RDS files +from the synthetic inference ("KNN.synt.*.rds" files in a sub-directory)} +} + +In addition, a sub-directory (named using the profile ID) is +also created. +} +\examples{ + +## Required library for GDS +library(SNPRelate) + +## Path to the demo 1KG GDS file is located in this package +dataDir <- system.file("extdata", package="RAIDS") + + +################################################################# +## The 1KG GDS file and the 1KG SNV Annotation GDS file +## need to be located in the same directory +## Note that the 1KG GDS file used for this example is a +## simplified version and CANNOT be used for any real analysis +################################################################# +path1KG <- file.path(dataDir, "tests") + +fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") +fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") + +################################################################# +## The Sample SNP pileup files (one per sample) need +## to be located in the same directory. +################################################################# +demoProfileEx1 <- file.path(dataDir, "example", "snpPileup", "ex1.txt.gz") + +################################################################# +## The path where the Profile GDS Files (one per sample) +## will be created need to be specified. +################################################################# +pathProfileGDS <- file.path(tempdir(), "out.tmp") + +#################################################################### +## Fix seed to ensure reproducible results +#################################################################### +set.seed(3043) + +gds1KG <- snpgdsOpen(fileReferenceGDS) +dataRef <- select1KGPop(gds1KG, nbProfiles=2L) +closefn.gds(gds1KG) + +## Required library for this example to run correctly +if (requireNamespace("GenomeInfoDb", quietly=TRUE) && + requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { + + ## Chromosome length information + ## chr23 is chrX, chr24 is chrY and chrM is 25 + chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] + + \donttest{ + + res <- inferAncestryGeneAware(profileFile=demoProfileEx1, + pathProfileGDS=pathProfileGDS, + fileReferenceGDS=fileReferenceGDS, + fileReferenceAnnotGDS=fileAnnotGDS, + chrInfo=chrInfo, + syntheticRefDF=dataRef, + blockTypeID="GeneS.Ensembl.Hsapiens.v86", + genoSource="snp-pileup") + + unlink(pathProfileGDS, recursive=TRUE, force=TRUE) + + } +} + +} +\references{ +Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, +Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution +of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. +doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} diff --git a/man/matKNNSynthetic.Rd b/man/matKNNSynthetic.Rd new file mode 100644 index 000000000..02c9630aa --- /dev/null +++ b/man/matKNNSynthetic.Rd @@ -0,0 +1,79 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RAIDS.R +\docType{data} +\name{matKNNSynthetic} +\alias{matKNNSynthetic} +\title{A small \code{data.frame} containing the +inferred ancestry on the synthetic profiles.} +\format{ +The \code{data.frame} containing the information about the +synthetic profiles. The \code{data.frame} contains 4 columns: +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +synthetic profile identifier.} +\item{\code{D}}{ a \code{numeric} representing the number of dimensions used +to infer the ancestry of the synthetic profile.} +\item{\code{K}}{ a \code{numeric} representing the number of neighbors used +to infer the ancestry of the synthetic profile.} +\item{\code{SuperPop}}{ a \code{character} string representing the +inferred ancestry of the synthetic profile for the specific D and K values.} +} +} +\usage{ +data(matKNNSynthetic) +} +\value{ +The \code{data.frame} containing the information about the +synthetic profiles. The \code{data.frame} contains 4 columns: +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +synthetic profile identifier.} +\item{\code{D}}{ a \code{numeric} representing the number of dimensions used +to infer the ancestry of the synthetic profile.} +\item{\code{K}}{ a \code{numeric} representing the number of neighbors used +to infer the ancestry of the synthetic profile.} +\item{\code{SuperPop}}{ a \code{character} string representing the +inferred ancestry of the synthetic profile for the specific D and K values.} +} +} +\description{ +The object is a \code{data.frame} with 4 columns. +} +\details{ +This dataset can be +used to test the \code{\link{computeSyntheticROC}} function. +} +\examples{ + +## Loading demo dataset containing pedigree information for synthetic +## profiles +data(pedSynthetic) + +## Loading demo dataset containing the inferred ancestry results +## for the synthetic data +data(matKNNSynthetic) + +## Retain one K and one D value +matKNN <- matKNNSynthetic[matKNNSynthetic$D == 5 & matKNNSynthetic$K == 4, ] + +## Compile statistics from the +## synthetic profiles for fixed values of D and K +results <- RAIDS:::computeSyntheticROC(matKNN=matKNN, + matKNNAncestryColumn="SuperPop", + pedCall=pedSynthetic, pedCallAncestryColumn="superPop", + listCall=c("EAS", "EUR", "AFR", "AMR", "SAS")) + +results$matAUROC.All +results$matAUROC.Call +results$listROC.Call + + +} +\seealso{ +\describe{ +\item{\code{\link{computeSyntheticROC}}}{ for calculating the AUROC of +the inferences for specific values of D and K using the inferred +ancestry results from the synthetic profiles} +} +} +\keyword{datasets} diff --git a/man/pedSynthetic.Rd b/man/pedSynthetic.Rd new file mode 100644 index 000000000..4e8fad85e --- /dev/null +++ b/man/pedSynthetic.Rd @@ -0,0 +1,99 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RAIDS.R +\docType{data} +\name{pedSynthetic} +\alias{pedSynthetic} +\title{A small \code{data.frame} containing the information related to +synthetic profiles. The ancestry of the profiles used to generate the +synthetic profiles must be present.} +\format{ +The \code{data.frame} containing the information about the +synthetic profiles. The row names of +the \code{data.frame} correspond to the profile unique identifiers. +The \code{data.frame} contains 7 columns: +\describe{ +\item{\code{data.id}}{ a \code{character} string representing the unique +synthetic profile identifier.} +\item{\code{case.id}}{ a \code{character} string representing the unique +profile identifier that was used to generate the synthetic profile.} +\item{\code{sample.type}}{ a \code{character} string representing the type +of profile. } +\item{\code{diagnosis}}{ a \code{character} string representing the +diagnosis of profile that was used to generate the synthetic profile. } +\item{\code{source}}{ a \code{character} string representing the +source of the synthetic profile. } +\item{\code{study.id}}{ a \code{character} string representing the +name of the study to which the synthetic profile is associated. } +\item{\code{superPop}}{ a \code{character} string representing the +super population of the profile that was used to generate the synthetic +profile. } +} +} +\usage{ +data(pedSynthetic) +} +\value{ +The \code{data.frame} containing the information about the +synthetic profiles. The row names of +the \code{data.frame} correspond to the profile unique identifiers. +The \code{data.frame} contains 7 columns: +\describe{ +\item{\code{data.id}}{ a \code{character} string representing the unique +synthetic profile identifier.} +\item{\code{case.id}}{ a \code{character} string representing the unique +profile identifier that was used to generate the synthetic profile.} +\item{\code{sample.type}}{ a \code{character} string representing the type +of profile.} +\item{\code{diagnosis}}{ a \code{character} string representing the +diagnosis of profile that was used to generate the synthetic profile. } +\item{\code{source}}{ a \code{character} string representing the +source of the synthetic profile. } +\item{\code{study.id}}{ a \code{character} string representing the +name of the study to which the synthetic profile is associated. } +\item{\code{superPop}}{ a \code{character} string representing the +super population of the profile that was used to generate the synthetic +profile. } +} +} +\description{ +The object is a \code{data.frame} with 7 columns. The row names of +the \code{data.frame} must be the profile unique identifiers. +} +\details{ +This dataset can be +used to test the \code{\link{computeSyntheticROC}} function. +} +\examples{ + +## Loading demo dataset containing pedigree information for synthetic +## profiles +data(pedSynthetic) + +## Loading demo dataset containing the inferred ancestry results +## for the synthetic data +data(matKNNSynthetic) + +## Retain one K and one D value +matKNN <- matKNNSynthetic[matKNNSynthetic$D == 5 & matKNNSynthetic$K == 4, ] + +## Compile statistics from the +## synthetic profiles for fixed values of D and K +results <- RAIDS:::computeSyntheticROC(matKNN=matKNN, + matKNNAncestryColumn="SuperPop", + pedCall=pedSynthetic, pedCallAncestryColumn="superPop", + listCall=c("EAS", "EUR", "AFR", "AMR", "SAS")) + +results$matAUROC.All +results$matAUROC.Call +results$listROC.Call + + +} +\seealso{ +\describe{ +\item{\code{\link{computeSyntheticROC}}}{ for calculating the AUROC of +the inferences for specific values of D and K using the inferred +ancestry results from the synthetic profiles} +} +} +\keyword{datasets} diff --git a/man/prepPed1KG.Rd b/man/prepPed1KG.Rd index d467cccd6..c7902315a 100644 --- a/man/prepPed1KG.Rd +++ b/man/prepPed1KG.Rd @@ -3,44 +3,45 @@ \encoding{UTF-8} \name{prepPed1KG} \alias{prepPed1KG} -\title{Prepare the pedigree file using pedigree information from 1KG} +\title{Prepare the pedigree file using pedigree information from Reference} \usage{ prepPed1KG(filePed, pathGeno = file.path("data", "sampleGeno"), batch = 0L) } \arguments{ \item{filePed}{a \code{character} string representing the path and file name of the pedigree file (PED file) that contains the information -related to the profiles present in the 1KG GDS file. The PED file must +related to the profiles present in the Reference GDS file. The PED file must exist.} \item{pathGeno}{a \code{character} string representing the path where -the 1KG genotyping files for each profile are located. Only the profiles -with associated genotyping files are retained in the creation of the final -\code{data.frame}. The name of the genotyping files must correspond to -the individual identification (Individual.ID) in the pedigree file -(PED file). +the Reference genotyping files for each profile are located. Only the +profiles with associated genotyping files are retained in the creation of +the final \code{data.frame}. The name of the genotyping files must +correspond to the individual identification (Individual.ID) in the +pedigree file (PED file). Default: \code{"./data/sampleGeno"}.} \item{batch}{a\code{integer} that uniquely identifies the source of the -pedigree information. The 1KG is usually \code{0L}. Default: \code{0L}.} +pedigree information. The Reference is usually \code{0L}. +Default: \code{0L}.} } \value{ a \code{data.frame} containing the needed pedigree information -from 1KG. The \code{data.frame} contains those columns: -\itemize{ -\item{sample.id}{a \code{character} string representing the profile unique +from Reference. The \code{data.frame} contains those columns: +\describe{ +\item{sample.id}{ a \code{character} string representing the profile unique ID.} -\item{Name.ID}{a \code{character} string representing the profile name.} +\item{Name.ID}{ a \code{character} string representing the profile name.} \item{sex}{a \code{character} string representing the sex of the profile.} -\item{pop.group}{a \code{character} string representing the +\item{pop.group}{ a \code{character} string representing the sub-continental ancestry of the profile.} -\item{superPop }{a \code{character} string representing the continental +\item{superPop }{ a \code{character} string representing the continental ancestry of the profile.} -\item{superPop }{a \code{integer} representing the batch of the profile.} +\item{superPop }{ a \code{integer} representing the batch of the profile.} } } \description{ -Using the pedigree file from 1KG, this function extracts +Using the pedigree file from Reference, this function extracts needed information and formats it into a \code{data.frame} so in can be used in following steps of the ancestry inference process. The function also requires that the genotyping files associated to each @@ -51,12 +52,15 @@ sample be available in a specified directory. ## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") +## Path where the demo genotype CSV files are located +pathGeno <- file.path(dataDir, "demoProfileGenotypes") + ## Demo pedigree file pedDemoFile <- file.path(dataDir, "PedigreeDemo.ped") ## Create a data.frame containing the information of the retained ## samples (samples with existing genotyping files) -prepPed1KG(filePed=pedDemoFile, pathGeno=dataDir, batch=0L) +prepPed1KG(filePed=pedDemoFile, pathGeno=pathGeno, batch=0L) } diff --git a/man/prepPedSynthetic1KG.Rd b/man/prepPedSynthetic1KG.Rd index 6fb8ed77c..193d7f528 100644 --- a/man/prepPedSynthetic1KG.Rd +++ b/man/prepPedSynthetic1KG.Rd @@ -58,7 +58,7 @@ library(gdsfmt) ## The open 1KG GDS file is required (this is a demo file) dataDir <- system.file("extdata", package="RAIDS") -gds_1KG_file <- file.path(dataDir, "1KG_Demo.gds") +gds_1KG_file <- file.path(dataDir, "PopulationReferenceDemo.gds") gds1KG <- openfn.gds(gds_1KG_file) fileSampleGDS <- file.path(dataDir, "GDS_Sample_with_study_demo.gds") diff --git a/man/prepSynthetic.Rd b/man/prepSynthetic.Rd index 10d487c61..6d2df3d98 100644 --- a/man/prepSynthetic.Rd +++ b/man/prepSynthetic.Rd @@ -63,7 +63,8 @@ the Profile GDS file "study.annot" node. Both the "Source" and the "Sample.Type" entries are always set to 'Synthetic'. The synthetic profiles are assigned unique names by combining: -[prefix].[data.id.profile].[listSampleRef].[simulation number(1 to nbSim)] +\code{prefix}.\code{data.id.profile}.\code{listSampleRef}.\code{simulation +number(1 to nbSim)} } \examples{ @@ -73,28 +74,23 @@ library(gdsfmt) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata/tests", package="RAIDS") -## Copy the Profile GDS file demo that has been pruned and annotated -## into a test directory (deleted after the example has been run) -dataDirAllelicFraction <- file.path(system.file("extdata", package="RAIDS"), - "demoAllelicFraction") -dir.create(dataDirAllelicFraction, showWarnings=FALSE, - recursive=FALSE, mode="0777") - -## Profile GDS file -fileNameGDS <- file.path(dataDirAllelicFraction, "ex1.gds") +## Temporary Profile GDS file +fileNameGDS <- file.path(tempdir(), "ex1.gds") +## Copy the Profile GDS file demo that has been pruned and annotated file.copy(file.path(dataDir, "ex1_demo_with_pruning_and_1KG_annot.gds"), fileNameGDS) ## Information about the synthetic data set syntheticStudyDF <- data.frame(study.id="MYDATA.Synthetic", - study.desc="MYDATA synthetic data", study.platform="PLATFORM", - stringsAsFactors=FALSE) + study.desc="MYDATA synthetic data", study.platform="PLATFORM", + stringsAsFactors=FALSE) ## Add information related to the synthetic profiles into the Profile GDS prepSynthetic(fileProfileGDS=fileNameGDS, - listSampleRef=c("HG00243", "HG00150"), profileID="ex1", - studyDF=syntheticStudyDF, nbSim=1L, prefix="synthetic", verbose=FALSE) + listSampleRef=c("HG00243", "HG00150"), profileID="ex1", + studyDF=syntheticStudyDF, nbSim=1L, prefix="synthetic", + verbose=FALSE) ## Open Profile GDS file profileGDS <- openfn.gds(fileNameGDS) @@ -102,15 +98,16 @@ profileGDS <- openfn.gds(fileNameGDS) ## The synthetic profiles should be added in the 'study.annot' entry tail(read.gdsn(index.gdsn(profileGDS, "study.annot"))) -## The synthetic study information should be added to the 'study.list' entry +## The synthetic study information should be added to +## the 'study.list' entry tail(read.gdsn(index.gdsn(profileGDS, "study.list"))) ## Close GDS file (important) closefn.gds(profileGDS) -## Unlink Profile GDS file (created for demo purpose) -unlink(file.path(dataDirAllelicFraction, "ex1.gds")) -unlink(dataDirAllelicFraction) +## Remove Profile GDS file (created for demo purpose) +unlink(fileNameGDS, force=TRUE) + } \author{ diff --git a/man/processBlockChr.Rd b/man/processBlockChr.Rd index a0e39ac5d..95e71bcf0 100644 --- a/man/processBlockChr.Rd +++ b/man/processBlockChr.Rd @@ -3,28 +3,50 @@ \encoding{UTF-8} \name{processBlockChr} \alias{processBlockChr} -\title{TODO} +\title{The function create a vector of integer representing the linkage +disequilibrium block for each SNV in the in the same order +than the variant in Population reference dataset.} \usage{ -processBlockChr(snp.keep, PATHBLOCK, superPop, chr) +processBlockChr(fileReferenceGDS, fileBlock) } \arguments{ -\item{snp.keep}{TODO} +\item{fileReferenceGDS}{a \code{character} string representing the file +name of the Reference GDS file. The file must exist.} -\item{PATHBLOCK}{TODO} - -\item{superPop}{TODO} - -\item{chr}{TODO} +\item{fileBlock}{a \code{character} string representing the file +name of output file det from the plink block command for a chromosome.} } \value{ -the a \code{array} with the sample from pedDF keept +a \code{list} containing 2 entries: +\describe{ +\item{\code{chr}}{ a \code{integer} representing a the chromosome from +fileBlock. +} +\item{\code{block.snp}}{ a \code{array} of integer +representing the linkage disequilibrium block for +each SNV in the in the same order than the variant +in Population reference dataset. +} +} } \description{ -TODO +The function create a vector of integer representing the linkage +disequilibrium block for each SNV in the in the same order +than the variant in Population reference dataset. } \examples{ -# TODO +## Path to the demo pedigree file is located in this package +dataDir <- system.file("extdata", package="RAIDS") + +## Demo of Reference GDS file containing reference information +fileReferenceGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") + +## Demo of of output file det from the plink block +## command for chromosome 1 +fileLdBlock <- file.path(dataDir, "block.sp.EUR.Ex.chr1.blocks.det") + +listLdBlock <- RAIDS:::processBlockChr(fileReferenceGDS, fileLdBlock) } \author{ diff --git a/man/processPileupChrBin.Rd b/man/processPileupChrBin.Rd new file mode 100644 index 000000000..924aaf126 --- /dev/null +++ b/man/processPileupChrBin.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tools_internal.R +\encoding{UTF-8} +\name{processPileupChrBin} +\alias{processPileupChrBin} +\title{Extract SNV information from pileup file for a selected chromosome} +\usage{ +processPileupChrBin(chr, resPileup, varDf, verbose) +} +\arguments{ +\item{chr}{a \code{character} string representing the name of the +chromosome to keep} + +\item{resPileup}{a \code{data.frame} as generated by the \code{pileup} +function from \code{Rsamtools} package} + +\item{varDf}{a \code{list} containing a \code{data.frame} representing +the positions to keep for each chromosome.} + +\item{verbose}{a \code{logical} indicating if messages should be printed} +} +\value{ +a \code{data.frame} containing at least: +\describe{ +\item{seqnames}{ a \code{character} representing the name of the chromosome} +\item{pos}{ a \code{numeric} representing the position on the chromosome} +\item{REF}{ a \code{character} string representing the reference nucleotide} +\item{ALT}{ a \code{character} string representing the alternative +nucleotide} +\item{A}{ a \code{numeric} representing the count for the A nucleotide} +\item{C}{ a \code{numeric} representing the count for the C nucleotide} +\item{G}{ a \code{numeric} representing the count for the G nucleotide} +\item{T}{ a \code{numeric} representing the count for the T nucleotide} +\item{count}{ a \code{numeric} representing the total count} +} +} +\description{ +The function reads pileup file and +returns a \code{data.frame} +containing the information about the read counts for the SNVs present in +the selected chromosome. +} +\examples{ + +## Demo pileup result data.frame +resDemo <- data.frame(seqnames=rep("chr14", 10), + pos=c(19069583, 19069584, 19069586, 19069588, 19069589, 19069590, + 19069591, 19069592, 19069609, 19069760), + strand=c(rep("+", 5), rep("-", 5)), + nucleotide=c("T", "G", "G", "C", "A", "A", "C", "T", "T", "G"), + count=c(5, 3, 2, 4, 1, 2, 1, 8, 7, 4)) +resDemo$seqnames <- factor(resDemo$seqnames) +resDemo$strand <- factor(resDemo$strand) +resDemo$nucleotide <- factor(resDemo$nucleotide) + +## Position to keep in a data.frame format +varInfo <- list("chr14"=data.frame(chr=c("chr14", "chr14"), + start=c(19069584, 19069609), REF=c("A", "G"), ALT=c("T", "A"))) + +## Extract information from pileup for selected positions +RAIDS:::processPileupChrBin(chr="chr14", resPileup=resDemo, varDf=varInfo, + verbose=FALSE) + +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} +\keyword{internal} diff --git a/man/profileAncestry.Rd b/man/profileAncestry.Rd new file mode 100644 index 000000000..3f2bfef23 --- /dev/null +++ b/man/profileAncestry.Rd @@ -0,0 +1,330 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/processStudy_internal.R +\encoding{UTF-8} +\name{profileAncestry} +\alias{profileAncestry} +\title{Run most steps leading to the ancestry inference call on a +specific profile (LD or geneAware)} +\usage{ +profileAncestry( + gdsReference, + gdsRefAnnot, + studyDF, + currentProfile, + pathProfileGDS, + chrInfo, + syntheticRefDF, + studyDFSyn, + listProfileRef, + studyType = c("LD", "GeneAware"), + np = 1L, + blockTypeID = NULL, + verbose = FALSE +) +} +\arguments{ +\item{gdsReference}{an object of class \code{\link[gdsfmt]{gds.class}} +(a GDS file), the opened Population Reference GDS file.} + +\item{gdsRefAnnot}{an object of class \code{\link[gdsfmt]{gds.class}} +(a GDS file), the opened Population Reference SNV Annotation GDS file. +This parameter is RNA specific.} + +\item{studyDF}{a \code{data.frame} containing the information about the +study associated to the analysed sample(s). The \code{data.frame} must have +those 3 columns: "study.id", "study.desc", "study.platform". All columns +must be in \code{character} strings (no factor).} + +\item{currentProfile}{a \code{character} string representing the profile +identifier.} + +\item{pathProfileGDS}{a \code{character} string representing the path to +the directory where the GDS Profile files will be created. +Default: \code{NULL}.} + +\item{chrInfo}{a \code{vector} of positive \code{integer} values +representing the length of the chromosomes. See 'details' section.} + +\item{syntheticRefDF}{a \code{data.frame} containing a subset of +reference profiles for each sub-population present in the Reference GDS +file. The \code{data.frame} must have those columns: +\describe{ +\item{sample.id}{ a \code{character} string representing the sample +identifier. } +\item{pop.group}{ a \code{character} string representing the +subcontinental population assigned to the sample. } +\item{superPop}{ a \code{character} string representing the +super-population assigned to the sample. } +}} + +\item{studyDFSyn}{a \code{data.frame} containing the information about the +synthetic data to the analysed sample(s). The \code{data.frame} must have +those 3 columns: "study.id", "study.desc", "study.platform". All columns +must be in \code{character} strings (no factor).} + +\item{listProfileRef}{a \code{vector} of \code{character} string +representing the +identifiers of the selected 1KG profiles that will be used as reference to +generate the synthetic profiles.} + +\item{studyType}{a \code{character} string representing the type of study. +The possible choices are: "LD" and "GeneAware". The type of study affects the +way the estimation of the allelic fraction is done. Default: \code{"LD"}.} + +\item{np}{a single positive \code{integer} specifying the number of +threads to be used. Default: \code{1L}.} + +\item{blockTypeID}{a \code{character} string corresponding to the block +type used to extract the block identifiers. The block type must be +present in the GDS Reference Annotation file.} + +\item{verbose}{a \code{logical} indicating if messages should be printed +to show how the different steps in the function. Default: \code{FALSE}.} +} +\value{ +a \code{list} containing 4 entries: +\describe{ +\item{\code{pcaSample}}{ a \code{list} containing the information related +to the eigenvectors. The \code{list} contains those 3 entries: +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current profile.} +\item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric} containing +the eigenvectors for the reference profiles.} +\item{\code{eigenvector}}{ a \code{matrix} of \code{numeric} containing the +eigenvectors for the current profile projected on the PCA from the +reference profiles.} +} +} +\item{\code{paraSample}}{ a \code{list} containing the results with +different \code{D} and \code{K} values that lead to optimal parameter +selection. The \code{list} contains those entries: +\describe{ +\item{\code{dfPCA}}{ a \code{data.frame} containing statistical results +on all combined synthetic results done with a fixed value of \code{D} (the +number of dimensions). The \code{data.frame} contains those columns: +\describe{ +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions).} +\item{\code{median}}{ a \code{numeric} representing the median of the +minimum AUROC obtained (within super populations) for all combination of +the fixed \code{D} value and all tested \code{K} values. } +\item{\code{mad}}{ a \code{numeric} representing the MAD of the minimum +AUROC obtained (within super populations) for all combination of the fixed +\code{D} value and all tested \code{K} values. } +\item{\code{upQuartile}}{ a \code{numeric} representing the upper quartile +of the minimum AUROC obtained (within super populations) for all +combination of the fixed \code{D} value and all tested \code{K} values. } +\item{\code{k}}{ a \code{numeric} representing the optimal \code{K} value +(the number of neighbors) for a fixed \code{D} value. } +} +} +\item{\code{dfPop}}{ a \code{data.frame} containing statistical results on +all combined synthetic results done with different values of \code{D} (the +number of dimensions) and \code{K} (the number of neighbors). +The \code{data.frame} contains those columns: +\describe{ +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions).} +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors).} +\item{\code{AUROC.min}}{ a \code{numeric} representing the minimum accuracy +obtained by grouping all the synthetic results by super-populations, for +the specified values of \code{D} and \code{K}.} +\item{\code{AUROC}}{ a \code{numeric} representing the accuracy obtained +by grouping all the synthetic results for the specified values of \code{D} +and \code{K}.} +\item{\code{Accu.CM}}{ a \code{numeric} representing the value of accuracy +of the confusion matrix obtained by grouping all the synthetic results for +the specified values of \code{D} and \code{K}.} +} +} +\item{\code{dfAUROC}}{ a \code{data.frame} the summary of the results by +super-population. The \code{data.frame} contains +those columns: +\describe{ +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions).} +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors).} +\item{\code{Call}}{ a \code{character} string representing the +super-population.} +\item{\code{L}}{ a \code{numeric} representing the lower value of the 95\% +confidence interval for the AUROC obtained for the fixed values of +super-population, \code{D} and \code{K}.} +\item{\code{AUROC}}{ a \code{numeric} representing the AUROC obtained for the +fixed values of super-population, \code{D} and \code{K}.} +\item{\code{H}}{ a \code{numeric} representing the higher value of the 95\% +confidence interval for the AUROC obtained for the fixed values of +super-population, \code{D} and \code{K}.} +} +} +\item{\code{D}}{ a \code{numeric} representing the optimal \code{D} value +(the number of dimensions) for the specific profile.} +\item{\code{K}}{ a \code{numeric} representing the optimal \code{K} value +(the number of neighbors) for the specific profile.} +\item{\code{listD}}{ a \code{numeric} representing the optimal \code{D} +values (the number of dimensions) for the specific profile. More than one +\code{D} is possible.} +} +} +\item{\code{KNNSample}}{ a \code{data.frame} containing the inferred ancestry +for different values of \code{K} and \code{D}. The \code{data.frame} +contains those columns: +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current profile.} +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions) used to infer the ancestry. } +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors) used to infer the ancestry. } +\item{\code{SuperPop}}{ a \code{character} string representing the inferred +ancestry for the specified \code{D} and \code{K} values.} +} +} +\item{\code{KNNSynthetic}}{ a \code{data.frame} containing the inferred ancestry +for each synthetic data for different values of \code{K} and \code{D}. +The \code{data.frame} +contains those columns: "sample.id", "D", "K", "infer.superPop", "ref.superPop" +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current synthetic data.} +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions) used to infer the ancestry. } +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors) used to infer the ancestry. } +\item{\code{infer.superPop}}{ a \code{character} string representing the inferred +ancestry for the specified \code{D} and \code{K} values.} +\item{\code{ref.superPop}}{ a \code{character} string representing the known +ancestry from the reference} +} +} +\item{\code{Ancestry}}{ a \code{data.frame} containing the inferred +ancestry for the current profile. The \code{data.frame} contains those +columns: +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current profile.} +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions) used to infer the ancestry.} +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors) used to infer the ancestry.} +\item{\code{SuperPop}}{ a \code{character} string representing the inferred +ancestry.} +} +} +} +} +\description{ +This function runs most steps leading to the ancestry inference +call on a specific profile. First, the function creates the Profile GDS file +for the specific profile using the information from a RDS Sample +description file and the Population reference GDS file. +} +\examples{ + +## Required library for GDS +library(SNPRelate) + +## Path to the demo 1KG GDS file is located in this package +dataDir <- system.file("extdata", package="RAIDS") + +################################################################# +## Load the information about the profile +################################################################# +data(demoPedigreeEx1) +head(demoPedigreeEx1) + +################################################################# +## The 1KG GDS file and the 1KG SNV Annotation GDS file +## need to be located in the same directory +## Note that the 1KG GDS file used for this example is a +## simplified version and CANNOT be used for any real analysis +################################################################# +path1KG <- file.path(dataDir, "tests") + +fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") +fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") + +################################################################# +## The Sample SNP pileup files (one per sample) need +## to be located in the same directory. +################################################################# +pathGeno <- file.path(dataDir, "example", "snpPileup") + +################################################################# +## The path where the Profile GDS Files (one per sample) +## will be created need to be specified. +################################################################# + +pathProfileGDS <- file.path(tempdir(), "outTest.tmp") + + +################################################################# +## A data frame containing general information about the study +## is also required. The data frame must have +## those 3 columns: "studyID", "study.desc", "study.platform" +################################################################# +studyDF <- data.frame(study.id="MYDATA", + study.desc="Description", + study.platform="PLATFORM", + stringsAsFactors=FALSE) + +#################################################################### +## Fix seed to ensure reproducible results +#################################################################### +set.seed(3043) + +dataRef <- select1KGPopForSynthetic(fileReferenceGDS, nbProfiles=2L) + +## Required library for this example to run correctly +if (requireNamespace("GenomeInfoDb", quietly=TRUE) && + requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { + + ## Chromosome length information + ## chr23 is chrX, chr24 is chrY and chrM is 25 + chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] + + studyDFSyn <- data.frame(study.id=paste0(studyDF$study.id, ".Synthetic"), + study.desc=paste0(studyDF$study.id, " synthetic data"), + study.platform=studyDF$study.platform, stringsAsFactors=FALSE) + + listProfileRef <- dataRef$sample.id + profileFile <- file.path(pathProfileGDS, "ex1.gds") + + \dontrun{ + + dir.create(pathProfileGDS) + file.copy(file.path(dataDir, "tests", "ex1_demo.gds"), profileFile) + + gdsReference <- snpgdsOpen(fileReferenceGDS) + gdsRefAnnot <- openfn.gds(fileAnnotGDS) + + res <- RAIDS:::profileAncestry(gdsReference=gdsReference, + gdsRefAnnot=gdsRefAnnot, + studyDF=studyDF, currentProfile=demoPedigreeEx1[1,"Name.ID"], + pathProfileGDS=pathProfileGDS, + chrInfo=chrInfo, + syntheticRefDF=dataRef, + studyDFSyn=studyDFSyn, + listProfileRef=listProfileRef, + studyType="LD") + + closefn.gds(gdsReference) + closefn.gds(gdsRefAnnot) + + + } +} + +} +\references{ +Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, +Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution +of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. +doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} +\keyword{internal} diff --git a/man/projectSample2PCA.Rd b/man/projectSample2PCA.Rd deleted file mode 100644 index 4d986924f..000000000 --- a/man/projectSample2PCA.Rd +++ /dev/null @@ -1,66 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/processStudy.R -\encoding{UTF-8} -\name{projectSample2PCA} -\alias{projectSample2PCA} -\title{Project profile onto existing principal component axes (PCA)} -\usage{ -projectSample2PCA( - gdsProfile, - listPCA, - currentProfile, - np = 1L, - verbose = FALSE -) -} -\arguments{ -\item{gdsProfile}{an object of class -\code{\link[SNPRelate:SNPGDSFileClass]{SNPRelate::SNPGDSFileClass}}, an -opened Profile GDS file.} - -\item{listPCA}{a \code{list} containing two entries: -\itemize{ -\item{pca.unrel} {\code{snpgdsPCAClass} object} -\item{snp.load} {\code{snpgdsPCASNPLoading} object} -}} - -\item{currentProfile}{a \code{character} string representing the -identifiant of the profile to be projected in the PCA.} - -\item{np}{a single positive \code{integer} representing the number of -threads. Default: \code{1L}.} - -\item{verbose}{a \code{logical} passed to the PCA function. -Default: \code{FALSE}.} -} -\value{ -a \code{snpgdsPCAClass} object, a \code{list} that contains: -\itemize{ - \item{sample.id} {the sample ids used in the analysis} - \item{snp.id} {the SNP ids used in the analysis} - \item{eigenvalues} {eigenvalues} - \item{eigenvect} {eigenvactors, “# of samples” x “eigen.cnt”} - \item{TraceXTX} {the trace of the genetic covariance matrix} - \item{Bayesian} {whether use bayerisan normalization} -} -} -\description{ -This function calculates the profile eigenvectors using -the specified SNP loadings. -} -\details{ -More information about the method used to calculate the patient eigenvectors -can be found at the Bioconductor SNPRelate website: -https://bioconductor.org/packages/SNPRelate/ -} -\examples{ - -## Path to the demo pedigree file is located in this package -dataDir <- system.file("extdata", "RAIDS") - -## TODO - -} -\author{ -Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -} diff --git a/man/pruning1KGbyChr.Rd b/man/pruning1KGbyChr.Rd index 5c7f873ff..c12df35e8 100644 --- a/man/pruning1KGbyChr.Rd +++ b/man/pruning1KGbyChr.Rd @@ -3,8 +3,8 @@ \encoding{UTF-8} \name{pruning1KGbyChr} \alias{pruning1KGbyChr} -\title{Extract the pruned SNVs in a reference data set (1KG) by chromosome -and/or allelic frequency} +\title{Extract the pruned SNVs in a population reference data set (ex:1KG) +by chromosome and/or allelic frequency} \usage{ pruning1KGbyChr( gdsReference, @@ -61,25 +61,47 @@ allelic frequency is not used as a filtering criterion. Default: \code{NULL}.} \item{outPrefix}{a \code{character} string that represents the prefix of the -RDS files that will be generated. Default: \code{"pruned_1KG"}.} +RDS file(s) that will be generated. Default: \code{"pruned_1KG"}.} \item{keepObj}{a \code{logical} specifying if the function must save the -the processed information into a RDS object. Default: \code{FALSE}.} +the processed information into a second RDS file. Default: \code{FALSE}.} } \value{ The function returns \code{0L} when successful. } \description{ -The function extracts the pruned SNVs in a reference data -set (1KG) by chromosome and/or allelic frequency. The pruning is done -through the linkage disequilibrium analysis. +The function extracts the pruned SNVs in a population +reference data set (ex: 1KG) by chromosome and/or allelic frequency. +The pruning is done through the linkage disequilibrium analysis. The +pruned SNVs are saved in a RDS file. } \examples{ +## Required libraries +library(SNPRelate) +library(gdsfmt) + ## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") -## TODO +## The 1KG Population Reference GDS demo file (opened) +gds1KG <- snpgdsOpen(file.path(dataDir, "PopulationReferenceDemo.gds")) + +## The prefix of the RDS file to be created and containing the pruned SNVs +outPrefix <- file.path(tempdir(), "Pruned_Demo_Reference") + +## Create a RDS file with the pruned SNVs +RAIDS:::pruning1KGbyChr(gdsReference=gds1KG, outPrefix=outPrefix) + +prunedSNVs <- readRDS(file.path(paste0(outPrefix, ".rds"))) +prunedSNVs + +## Close 1K GDS file +closefn.gds(gds1KG) + +## Delete temporary file +unlink(paste0(outPrefix, ".rds"), force=TRUE) + } \author{ diff --git a/man/pruningSample.Rd b/man/pruningSample.Rd index 15c459ec8..2b9dd6293 100644 --- a/man/pruningSample.Rd +++ b/man/pruningSample.Rd @@ -4,7 +4,8 @@ \name{pruningSample} \alias{pruningSample} \title{Compute the list of pruned SNVs for a specific profile using the -information from the 1KG GDS file and a linkage disequilibrium analysis} +information from the Reference GDS file and a linkage disequilibrium +analysis} \usage{ pruningSample( gdsReference, @@ -110,9 +111,9 @@ function can be specified by the user. ## Required library for GDS library(gdsfmt) -## Path to the demo 1KG GDS file is located in this package +## Path to the demo Reference GDS file is located in this package dataDir <- system.file("extdata/tests", package="RAIDS") -fileGDS <- file.path(dataDir, "ex1_good_small_1KG_GDS.gds") +fileGDS <- file.path(dataDir, "ex1_good_small_1KG.gds") ## The data.frame containing the information about the study ## The 3 mandatory columns: "study.id", "study.desc", "study.platform" @@ -131,14 +132,11 @@ samplePED <- data.frame(Name.ID = c("ex1", "ex2"), Source = rep("Databank B", 2), stringsAsFactors = FALSE) rownames(samplePED) <- samplePED$Name.ID +## Temporary Profile GDS file +profileFile <- file.path(tempdir(), "ex1.gds") + ## Copy the Profile GDS file demo that has not been pruned yet -## into a test directory (deleted after the example has been run) -dataDirPruning <- file.path(system.file("extdata", package="RAIDS"), - "demoPruning") -dir.create(dataDirPruning, showWarnings=FALSE, - recursive=FALSE, mode="0777") -file.copy(file.path(dataDir, "ex1_demo.gds"), - file.path(dataDirPruning, "ex1.gds")) +file.copy(file.path(dataDir, "ex1_demo.gds"), profileFile) ## Open 1KG file gds1KG <- snpgdsOpen(fileGDS) @@ -146,22 +144,22 @@ gds1KG <- snpgdsOpen(fileGDS) ## Compute the list of pruned SNVs for a specific profile 'ex1' ## and save it in the Profile GDS file 'ex1.gds' pruningSample(gdsReference=gds1KG, currentProfile=c("ex1"), - studyID = studyDF$study.id, pathProfileGDS=dataDirPruning) + studyID = studyDF$study.id, pathProfileGDS=tempdir()) -## Close the 1KG GDS file (it is important to always close the GDS files) +## Close the Reference GDS file (important) closefn.gds(gds1KG) ## Check content of Profile GDS file ## The 'pruned.study' entry should be present -content <- openfn.gds(file.path(dataDirPruning, "ex1.gds")) +content <- openfn.gds(profileFile) content -## Close the Profile GDS file (it is important to always close the GDS files) +## Close the Profile GDS file (important) closefn.gds(content) -## Unlink Profile GDS file (created for demo purpose) -unlink(file.path(dataDirPruning, "ex1.gds")) -unlink(dataDirPruning) +## Remove Profile GDS file (created for demo purpose) +unlink(profileFile, force=TRUE) + } \author{ diff --git a/man/readSNVBAM.Rd b/man/readSNVBAM.Rd new file mode 100644 index 000000000..7a2d7ed10 --- /dev/null +++ b/man/readSNVBAM.Rd @@ -0,0 +1,73 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tools_internal.R +\encoding{UTF-8} +\name{readSNVBAM} +\alias{readSNVBAM} +\title{Read a VCF file with the genotypes use for the ancestry call} +\usage{ +readSNVBAM( + fileName, + varSelected, + offset = 0L, + paramSNVBAM = list(ScanBamParam = NULL, PileupParam = NULL, yieldSize = 1e+07), + verbose = FALSE +) +} +\arguments{ +\item{fileName}{a \code{character} string representing the name, including +the path, of a BAM file with the index file in the same directory} + +\item{varSelected}{a \code{data.frame} representing the position to keep} + +\item{offset}{a \code{integer} representing the offset to be added to the +position of the SNVs. The value of offset +is added to the position present in the file. Default: \code{0L}.} + +\item{paramSNVBAM}{a \code{list} containing the parameters passed to the +BamFile() function. Default: \code{list(ScanBamParam=NULL, PileupParam=NULL, +yieldSize=10000000)}.} + +\item{verbose}{a \code{logical} indicating if messages should be printed +to show how the different steps in the function. Default: \code{FALSE}.} +} +\value{ +a \code{data.frame} containing at least: +\describe{ +\item{Chromosome}{ a \code{numeric} representing the name of +the chromosome} +\item{Position}{ a \code{numeric} representing the position on the +chromosome} +\item{Ref}{ a \code{character} string representing the reference nucleotide} +\item{Alt}{ a \code{character} string representing the alternative +nucleotide} +\item{File1R}{ a \code{numeric} representing the count for +the reference nucleotide} +\item{File1A}{ a \code{numeric} representing the count for the +alternative nucleotide} +\item{count}{ a \code{numeric} representing the total count} +} +} +\description{ +The function reads VCF file and +returns a data frame +containing the information about the read counts for the SNVs present in +the file. +} +\examples{ + + +## Required library for this example to run correctly +if (requireNamespace("Rsamtools", quietly=TRUE)) { + ## Demo bam + fl <- system.file("extdata", "no_which_buffered_pileup.bam", + package="Rsamtools", mustWork=TRUE) + + RAIDS:::readSNVBAM(fl, varSelected=data.frame(chr=c(1,1), + start=c(3,5), REF=c("A", "A"), ALT=c("C", "C"))) +} + +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} +\keyword{internal} diff --git a/man/readSNVFileGeneric.Rd b/man/readSNVFileGeneric.Rd index 39458b9d8..64aad2908 100644 --- a/man/readSNVFileGeneric.Rd +++ b/man/readSNVFileGeneric.Rd @@ -3,38 +3,55 @@ \encoding{UTF-8} \name{readSNVFileGeneric} \alias{readSNVFileGeneric} -\title{TODO} +\title{Read a generic SNP pileup file} \usage{ readSNVFileGeneric(fileName, offset = 0L) } \arguments{ -\item{fileName}{File name with the path to a -csv with at least the columns: -Chromosome,Position,Ref,Alt,Count,File1R,File1A -where Count is the deep at the position, -FileR is the deep of the reference allele, and -File1A is the deep of the specific alternative allele} +\item{fileName}{a \code{character} string representing the name, including +the path, of a text file containing the SNV read counts. The text file must +be comma separated. The text file must +contain those columns: Chromosome, Position, Ref, Alt, Count, +File1R and File1A.} -\item{offset}{TODO} +\item{offset}{a \code{integer} representing the offset to be added to the +position of the SNVs. The value of offset +is added to the position present in the file. Default: \code{0L}.} } \value{ a \code{data.frame} containing at least: -\itemize{ -\item{Chromosome} {TODO} -\item{Position} {TODO} -\item{Ref} -\item{Alt} -\item{File1R} {deep of the reference allele} -\item{File1A} {deep of the alternative allele} -\item{count} {Total deep at the position} +\describe{ +\item{Chromosome}{ a \code{numeric} representing the name of +the chromosome} +\item{Position}{ a \code{numeric} representing the position on the +chromosome} +\item{Ref}{ a \code{character} string representing the reference nucleotide} +\item{Alt}{ a \code{character} string representing the alternative +nucleotide} +\item{File1R}{ a \code{numeric} representing the count for +the reference nucleotide} +\item{File1A}{ a \code{numeric} representing the count for the +alternative nucleotide} +\item{count}{ a \code{numeric} representing the total count} } } \description{ -TODO +The function reads a generic SNP pileup file and +returns a data frame +containing the information about the read counts for the SNVs present in +the file. } \examples{ -# TODO + +## Directory where demo SNP-pileup file +dataDir <- system.file("extdata/example/snpPileup", package="RAIDS") + +## The SNP-pileup file +snpPileupFile <- file.path(dataDir, "ex1.generic.txt.gz") + +info <- RAIDS:::readSNVFileGeneric(fileName=snpPileupFile) +head(info) } \author{ diff --git a/man/readSNVPileupFile.Rd b/man/readSNVPileupFile.Rd index cfbd187e1..956ad047f 100644 --- a/man/readSNVPileupFile.Rd +++ b/man/readSNVPileupFile.Rd @@ -3,33 +3,58 @@ \encoding{UTF-8} \name{readSNVPileupFile} \alias{readSNVPileupFile} -\title{TODO} +\title{Read a SNP-pileup file} \usage{ readSNVPileupFile(fileName, offset = 0L) } \arguments{ -\item{fileName}{Output from snp-pileup -must csv with the columns: -Chromosome,Position,Ref,Alt,File1R,File1A,File1E,File1D} +\item{fileName}{a \code{character} string representing the name, including +the path, of a text file containing the SNV read counts as generated by +snp-pileup software. The text file must be comma separated. +The text file must contain those columns: Chromosome, Position, Ref, Alt, +File1R, File1A, File1E and File1D.} -\item{offset}{TODO} +\item{offset}{a \code{integer} representing the offset to be added to the +position of the SNVs. The value of offset +is added to the position present in the file. Default: \code{0L}.} } \value{ the a \code{data.frame} containing at least: -\itemize{ -\item{Chromosome} {TODO} -\item{Position} {TODO} -\item{File1R} {TODO} -\item{File1A} {TODO} -\item{count} {TODO} +\describe{ +\item{Chromosome}{ a \code{numeric} representing the name of +the chromosome} +\item{Position}{ a \code{numeric} representing the position on the +chromosome} +\item{Ref}{ a \code{character} string representing the reference nucleotide} +\item{Alt}{ a \code{character} string representing the alternative +nucleotide} +\item{File1R}{ a \code{numeric} representing the count for +the reference nucleotide} +\item{File1A}{ a \code{numeric} representing the count for the +alternative nucleotide} +\item{File1E}{a \code{numeric} representing the count for the +errors} +\item{File1D}{a \code{numeric} representing the count for the +deletions} +\item{count}{ a \code{numeric} representing the total count} } } \description{ -TODO +The function reads a generic SNP pileup file and +returns a data frame +containing the information about the read counts for the SNVs present in +the file. } \examples{ -# TODO +## Directory where demo SNP-pileup file +dataDir <- system.file("extdata/example/snpPileup", package="RAIDS") + +## The SNP-pileup file +snpPileupFile <- file.path(dataDir, "ex1.txt.gz") + +info <- RAIDS:::readSNVPileupFile(fileName=snpPileupFile) +head(info) } \author{ diff --git a/man/readSNVVCF.Rd b/man/readSNVVCF.Rd new file mode 100644 index 000000000..acc082117 --- /dev/null +++ b/man/readSNVVCF.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tools_internal.R +\encoding{UTF-8} +\name{readSNVVCF} +\alias{readSNVVCF} +\title{Read a VCF file with the genotypes use for the ancestry call} +\usage{ +readSNVVCF(fileName, profileName = NULL, offset = 0L) +} +\arguments{ +\item{fileName}{a \code{character} string representing the name, including +the path, of a VCF file containing the SNV read counts. +The VCF must contain those genotype fields: GT, AD, DP.} + +\item{profileName}{a \code{character} with Name.ID for the genotype name. +Default: \code{NULL}.} + +\item{offset}{a \code{integer} representing the offset to be added to the +position of the SNVs. The value of offset +is added to the position present in the file. Default: \code{0L}.} +} +\value{ +a \code{data.frame} containing at least: +\describe{ +\item{Chromosome}{ a \code{numeric} representing the name of +the chromosome} +\item{Position}{ a \code{numeric} representing the position on the +chromosome} +\item{Ref}{ a \code{character} string representing the reference nucleotide} +\item{Alt}{ a \code{character} string representing the alternative +nucleotide} +\item{File1R}{ a \code{numeric} representing the count for +the reference nucleotide} +\item{File1A}{ a \code{numeric} representing the count for the +alternative nucleotide} +\item{count}{ a \code{numeric} representing the total count} +} +} +\description{ +The function reads VCF file and returns a data frame +containing the information about the read counts for the SNVs present in +the file. +} +\examples{ + + +## Directory where demo SNP-pileup file +dataDir <- system.file("extdata/example/snpPileup", package="RAIDS") + +## The SNP-pileup file +snpPileupFile <- file.path(dataDir, "ex1.vcf.gz") + +info <- RAIDS:::readSNVVCF(fileName=snpPileupFile) +head(info) + +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} +\keyword{internal} diff --git a/man/runExomeAncestry.Rd b/man/runExomeAncestry.Rd index 9a08e1e34..b2e9a0141 100644 --- a/man/runExomeAncestry.Rd +++ b/man/runExomeAncestry.Rd @@ -4,7 +4,7 @@ \name{runExomeAncestry} \alias{runExomeAncestry} \title{Run most steps leading to the ancestry inference call on a specific -profile} +exome profile} \usage{ runExomeAncestry( pedStudy, @@ -16,7 +16,8 @@ runExomeAncestry( fileReferenceAnnotGDS, chrInfo, syntheticRefDF, - genoSource = c("snp-pileup", "generic"), + genoSource = c("snp-pileup", "generic", "VCF"), + np = 1L, verbose = FALSE ) } @@ -41,16 +42,20 @@ Default: \code{NULL}.} directory containing the VCF output of SNP-pileup for each sample. The SNP-pileup files must be compressed (gz files) and have the name identifiers of the samples. A sample with "Name.ID" identifier would have an -associated SNP-pileup file called "Name.ID.txt.gz".} +associated file called +if genoSource is "VCF", then "Name.ID.vcf.gz", +if genoSource is "generic", then "Name.ID.generic.txt.gz" +if genoSource is "snp-pileup", then "Name.ID.txt.gz".} \item{pathOut}{a \code{character} string representing the path to the directory where the output files are created.} \item{fileReferenceGDS}{a \code{character} string representing the file -name of the 1KG GDS file. The file must exist.} +name of the Reference GDS file. The file must exist.} \item{fileReferenceAnnotGDS}{a \code{character} string representing the -file name of the 1KG GDS annotation file. The file must exist.} +file name of the Population Reference GDS Annotation file. The file must +exist.} \item{chrInfo}{a \code{vector} of positive \code{integer} values representing the length of the chromosomes. See 'details' section.} @@ -58,23 +63,28 @@ representing the length of the chromosomes. See 'details' section.} \item{syntheticRefDF}{a \code{data.frame} containing a subset of reference profiles for each sub-population present in the Reference GDS file. The \code{data.frame} must have those columns: -\itemize{ -\item{sample.id} { a \code{character} string representing the sample +\describe{ +\item{sample.id}{ a \code{character} string representing the sample identifier. } -\item{pop.group} { a \code{character} string representing the +\item{pop.group}{ a \code{character} string representing the subcontinental population assigned to the sample. } -\item{superPop} { a \code{character} string representing the +\item{superPop}{ a \code{character} string representing the super-population assigned to the sample. } }} -\item{genoSource}{a \code{stirng} with two possible values: -snp-pileup and generic. It specify if the genotype files -are generate by snp-pileup(Facets) or generic format csv -with the column at least the columns: -Chromosome,Position,Ref,Alt,Count,File1R,File1A -where Count is the deep at the position, -FileR is the deep of the reference allele, and -File1A is the deep of the specific alternative allele} +\item{genoSource}{a \code{character} string with two possible values: +'snp-pileup', 'generic' or 'VCF'. It specifies if the genotype files +are generated by snp-pileup (Facets) or are a generic format CSV file +with at least those columns: +'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. +The 'Count' is the depth at the specified position; +'FileR' is the depth of the reference allele and +'File1A' is the depth of the specific alternative allele. +Finally the file can be a VCF file with at least those genotype +fields: GT, AD, DP.} + +\item{np}{a single positive \code{integer} specifying the number of +threads to be used. Default: \code{1L}.} \item{verbose}{a \code{logical} indicating if messages should be printed to show how the different steps in the function. Default: \code{FALSE}.} @@ -85,19 +95,19 @@ more information about the generated output files. } \description{ This function runs most steps leading to the ancestry inference -call on a specific profile. First, the function creates the Profile GDS file -for the specific profile using the information from a RDS Sample -description file and the 1KG reference GDS file. +call on a specific exome profile. First, the function creates the +Profile GDS file for the specific profile using the information from a +RDS Sample description file and the Population reference GDS file. } \details{ The runExomeAncestry() function generates 3 types of files in the OUTPUT directory. -\itemize{ -\item{Ancestry Inference}{The ancestry inference CSV file +\describe{ +\item{Ancestry Inference}{ The ancestry inference CSV file (".Ancestry.csv" file)} -\item{Inference Informaton}{The inference information RDS file +\item{Inference Informaton}{ The inference information RDS file (".infoCall.rds" file)} -\item{Synthetic Information}{The parameter information RDS files +\item{Synthetic Information}{ The parameter information RDS files from the synthetic inference ("KNN.synt.*.rds" files in a sub-directory)} } @@ -107,18 +117,16 @@ also created. \examples{ ## Required library for GDS -library(gdsfmt) +library(SNPRelate) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata", package="RAIDS") ################################################################# -## The path and file name for the PED RDS file -## will the information about the analyzed samples +## Load the information about the profile ################################################################# -filePED <- file.path(dataDir, "example", "pedEx.rds") -ped <- readRDS(filePED) -head(ped) +data(demoPedigreeEx1) +head(demoPedigreeEx1) ################################################################# ## The 1KG GDS file and the 1KG SNV Annotation GDS file @@ -126,10 +134,10 @@ head(ped) ## Note that the 1KG GDS file used for this example is a ## simplified version and CANNOT be used for any real analysis ################################################################# -path1KG <- file.path(dataDir, "example", "gdsRef") +path1KG <- file.path(dataDir, "tests") -fileReferenceGDS <- file.path(path1KG, "ex1kg.gds") -fileAnnotGDS <- file.path(path1KG, "exAnnot1kg.gds") +fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") +fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") ################################################################# ## The Sample SNP pileup files (one per sample) need @@ -141,9 +149,9 @@ pathGeno <- file.path(dataDir, "example", "snpPileup") ## The path where the Profile GDS Files (one per sample) ## will be created need to be specified. ################################################################# -pathProfileGDS <- file.path(dataDir, "example", "out.tmp") +pathProfileGDS <- file.path(tempdir(), "out.tmp") -pathOut <- file.path(dataDir, "example", "res.out") +pathOut <- file.path(tempdir(), "res.out") ################################################################# ## A data frame containing general information about the study @@ -164,36 +172,29 @@ gds1KG <- snpgdsOpen(fileReferenceGDS) dataRef <- select1KGPop(gds1KG, nbProfiles=2L) closefn.gds(gds1KG) -## Chromosome length information -## chr23 is chrX, chr24 is chrY and chrM is 25 -chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L, - 181538259L, 170805979L, 159345973L, 145138636L, 138394717L, 133797422L, - 135086622L, 133275309L, 114364328L, 107043718L, 101991189L, 90338345L, - 83257441L, 80373285L, 58617616L, 64444167L, 46709983L, 50818468L, - 156040895L, 57227415L, 16569L) - -## A formal way to get the chormosome length information -## library(BSgenome.Hsapiens.UCSC.hg38) -## chrInfo <- integer(25L) -## for(i in seq_len(22L)){ chrInfo[i] <- -## length(Hsapiens[[paste0("chr", i)]])} -## chrInfo[23] <- length(Hsapiens[["chrX"]]) -## chrInfo[24] <- length(Hsapiens[["chrY"]]) -## chrInfo[25] <- length(Hsapiens[["chrM"]]) - -\dontrun{ -runExomeAncestry(pedStudy=ped, studyDF=studyDF, - pathProfileGDS=pathProfileGDS, - pathGeno=pathGeno, - pathOut=pathOut, - fileReferenceGDS=fileReferenceGDS, - fileReferenceAnnotGDS=fileAnnotGDS, - chrInfo=chrInfo, - syntheticRefDF=dataRef, - genoSource="snp-pileup") - -unlink(pathProfileGDS, recursive=TRUE, force=TRUE) -unlink(pathOut, recursive=TRUE, force=TRUE) +## Required library for this example to run correctly +if (requireNamespace("GenomeInfoDb", quietly=TRUE) && + requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { + + ## Chromosome length information + ## chr23 is chrX, chr24 is chrY and chrM is 25 + chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] + + \donttest{ + + runExomeAncestry(pedStudy=demoPedigreeEx1, studyDF=studyDF, + pathProfileGDS=pathProfileGDS, + pathGeno=pathGeno, + pathOut=pathOut, + fileReferenceGDS=fileReferenceGDS, + fileReferenceAnnotGDS=fileAnnotGDS, + chrInfo=chrInfo, + syntheticRefDF=dataRef, + genoSource="snp-pileup") + + unlink(pathProfileGDS, recursive=TRUE, force=TRUE) + unlink(pathOut, recursive=TRUE, force=TRUE) + } } } diff --git a/man/runIBDKING.Rd b/man/runIBDKING.Rd index 4ed5f2610..8665451cb 100644 --- a/man/runIBDKING.Rd +++ b/man/runIBDKING.Rd @@ -29,18 +29,18 @@ the process in the \code{\link[SNPRelate]{snpgdsIBDKING}}() function.} } \value{ a \code{list} containing: -\itemize{ - \item{sample.id}{a \code{character} string representing the sample - ids used in the analysis} - \item{snp.id}{a \code{character} string representing the SNP ids - used in the analysis} - \item{k0}{a \code{numeric}, the IBD coefficient, the probability of - sharing zero IBD} - \item{k1}{a \code{numeric}, the IBD coefficient, the probability of - sharing one IBD} - \item{IBS0}{a \code{numeric}, the proportion of SNPs with zero IBS} - \item{kinship}{a \code{numeric}, the proportion of SNPs with zero IBS, - if the parameter kinship=TRUE} +\describe{ +\item{sample.id}{a \code{character} string representing the sample +ids used in the analysis} +\item{snp.id}{a \code{character} string representing the SNP ids +used in the analysis} +\item{k0}{a \code{numeric}, the IBD coefficient, the probability of +sharing zero IBD} +\item{k1}{a \code{numeric}, the IBD coefficient, the probability of +sharing one IBD} +\item{IBS0}{a \code{numeric}, the proportion of SNPs with zero IBS} +\item{kinship}{a \code{numeric}, the proportion of SNPs with zero IBS, +if the parameter kinship=TRUE} } } \description{ diff --git a/man/runLDPruning.Rd b/man/runLDPruning.Rd index e9c8d438c..e75f60ebd 100644 --- a/man/runLDPruning.Rd +++ b/man/runLDPruning.Rd @@ -66,6 +66,9 @@ SNPRelate package (https://bioconductor.org/packages/SNPRelate/). } \examples{ +## Required +library(SNPRelate) + ## Open an example dataset (HapMap) genoFile <- snpgdsOpen(snpgdsExampleFileName()) diff --git a/man/runProfileAncestry.Rd b/man/runProfileAncestry.Rd new file mode 100644 index 000000000..f578560e9 --- /dev/null +++ b/man/runProfileAncestry.Rd @@ -0,0 +1,226 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/processStudy_internal.R +\encoding{UTF-8} +\name{runProfileAncestry} +\alias{runProfileAncestry} +\title{Run most steps leading to the ancestry inference call on a +specific profile (RNA or DNA)} +\usage{ +runProfileAncestry( + gdsReference, + gdsRefAnnot, + studyDF, + currentProfile, + pathProfileGDS, + pathOut, + chrInfo, + syntheticRefDF, + studyDFSyn, + listProfileRef, + studyType = c("DNA", "RNA"), + np = 1L, + blockTypeID = NULL, + verbose = FALSE +) +} +\arguments{ +\item{gdsReference}{an object of class \code{\link[gdsfmt]{gds.class}} +(a GDS file), the opened Population Reference GDS file.} + +\item{gdsRefAnnot}{an object of class \code{\link[gdsfmt]{gds.class}} +(a GDS file), the opened Population Reference SNV Annotation GDS file. +This parameter is RNA specific.} + +\item{studyDF}{a \code{data.frame} containing the information about the +study associated to the analysed sample(s). The \code{data.frame} must have +those 3 columns: "study.id", "study.desc", "study.platform". All columns +must be in \code{character} strings (no factor).} + +\item{currentProfile}{a \code{character} string representing the profile +identifier.} + +\item{pathProfileGDS}{a \code{character} string representing the path to +the directory where the GDS Profile files will be created. +Default: \code{NULL}.} + +\item{pathOut}{a \code{character} string representing the path to +the directory where the output files are created.} + +\item{chrInfo}{a \code{vector} of positive \code{integer} values +representing the length of the chromosomes. See 'details' section.} + +\item{syntheticRefDF}{a \code{data.frame} containing a subset of +reference profiles for each sub-population present in the Reference GDS +file. The \code{data.frame} must have those columns: +\describe{ +\item{sample.id}{ a \code{character} string representing the sample +identifier. } +\item{pop.group}{ a \code{character} string representing the +subcontinental population assigned to the sample. } +\item{superPop}{ a \code{character} string representing the +super-population assigned to the sample. } +}} + +\item{studyDFSyn}{a \code{data.frame} containing the information about the +synthetic data to the analysed sample(s). The \code{data.frame} must have +those 3 columns: "study.id", "study.desc", "study.platform". All columns +must be in \code{character} strings (no factor).} + +\item{listProfileRef}{a \code{vector} of \code{character} string +representing the +identifiers of the selected 1KG profiles that will be used as reference to +generate the synthetic profiles.} + +\item{studyType}{a \code{character} string representing the type of study. +The possible choices are: "DNA" and "RNA". The type of study affects the +way the estimation of the allelic fraction is done. Default: \code{"DNA"}.} + +\item{np}{a single positive \code{integer} specifying the number of +threads to be used. Default: \code{1L}.} + +\item{blockTypeID}{a \code{character} string corresponding to the block +type used to extract the block identifiers. The block type must be +present in the GDS Reference Annotation file.} + +\item{verbose}{a \code{logical} indicating if messages should be printed +to show how the different steps in the function. Default: \code{FALSE}.} +} +\value{ +The integer \code{0L} when successful. See details section for +more information about the generated output files. +} +\description{ +This function runs most steps leading to the ancestry inference +call on a specific profile. First, the function creates the Profile GDS file +for the specific profile using the information from a RDS Sample +description file and the Population reference GDS file. +} +\details{ +The runWrapperAncestry() function generates 3 types of files +in the \code{pathOut} directory: +\describe{ +\item{Ancestry Inference}{ The ancestry inference CSV file +(".Ancestry.csv" file)} +\item{Inference Informaton}{ The inference information RDS file +(".infoCall.rds" file)} +\item{Synthetic Information}{ The parameter information RDS files +from the synthetic inference ("KNN.synt.*.rds" files in a sub-directory)} +} + +In addition, a sub-directory (named using the profile ID) is +also created. +} +\examples{ + +## Required library for GDS +library(SNPRelate) + +## Path to the demo 1KG GDS file is located in this package +dataDir <- system.file("extdata", package="RAIDS") + +################################################################# +## Load the information about the profile +################################################################# +data(demoPedigreeEx1) +head(demoPedigreeEx1) + +################################################################# +## The 1KG GDS file and the 1KG SNV Annotation GDS file +## need to be located in the same directory +## Note that the 1KG GDS file used for this example is a +## simplified version and CANNOT be used for any real analysis +################################################################# +path1KG <- file.path(dataDir, "tests") + +fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") +fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") + +################################################################# +## The Sample SNP pileup files (one per sample) need +## to be located in the same directory. +################################################################# +pathGeno <- file.path(dataDir, "example", "snpPileup") + +################################################################# +## The path where the Profile GDS Files (one per sample) +## will be created need to be specified. +################################################################# + +pathProfileGDS <- file.path(tempdir(), "outTest.tmp") + +pathOut <- file.path(tempdir(), "resTest.out") + +################################################################# +## A data frame containing general information about the study +## is also required. The data frame must have +## those 3 columns: "studyID", "study.desc", "study.platform" +################################################################# +studyDF <- data.frame(study.id="MYDATA", + study.desc="Description", + study.platform="PLATFORM", + stringsAsFactors=FALSE) + +#################################################################### +## Fix seed to ensure reproducible results +#################################################################### +set.seed(3043) + +gdsReference <- snpgdsOpen(fileReferenceGDS) +dataRef <- select1KGPop(gdsReference, nbProfiles=2L) +closefn.gds(gdsReference) + +## Required library for this example to run correctly +if (requireNamespace("GenomeInfoDb", quietly=TRUE) && + requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { + + ## Chromosome length information + ## chr23 is chrX, chr24 is chrY and chrM is 25 + chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] + + studyDFSyn <- data.frame(study.id=paste0(studyDF$study.id, ".Synthetic"), + study.desc=paste0(studyDF$study.id, " synthetic data"), + study.platform=studyDF$study.platform, stringsAsFactors=FALSE) + + listProfileRef <- dataRef$sample.id + profileFile <- file.path(pathProfileGDS, "ex1.gds") + + \dontrun{ + + dir.create(pathProfileGDS) + dir.create(pathOut) + file.copy(file.path(dataDir, "tests", "ex1_demo.gds"), profileFile) + + gdsReference <- snpgdsOpen(fileReferenceGDS) + gdsRefAnnot <- openfn.gds(fileAnnotGDS) + + RAIDS:::runProfileAncestry(gdsReference=gdsReference, + gdsRefAnnot=gdsRefAnnot, + studyDF=studyDF, currentProfile=ped[1,"Name.ID"], + pathProfileGDS=pathProfileGDS, + pathOut=pathOut, + chrInfo=chrInfo, + syntheticRefDF=dataRef, + studyDFSyn=studyDFSyn, + listProfileRef=listProfileRef, + studyType="DNA") + + closefn.gds(gdsReference) + closefn.gds(gdsRefAnnot) + + unlink(pathProfileGDS, recursive=TRUE, force=TRUE) + unlink(pathOut, recursive=TRUE, force=TRUE) + + } +} + +} +\references{ +Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, +Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution +of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. +doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} +\keyword{internal} diff --git a/man/runRNAAncestry.Rd b/man/runRNAAncestry.Rd new file mode 100644 index 000000000..712409b70 --- /dev/null +++ b/man/runRNAAncestry.Rd @@ -0,0 +1,216 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/processStudy.R +\encoding{UTF-8} +\name{runRNAAncestry} +\alias{runRNAAncestry} +\title{Run most steps leading to the ancestry inference call on a specific +RNA profile} +\usage{ +runRNAAncestry( + pedStudy, + studyDF, + pathProfileGDS, + pathGeno, + pathOut, + fileReferenceGDS, + fileReferenceAnnotGDS, + chrInfo, + syntheticRefDF, + genoSource = c("snp-pileup", "generic", "VCF"), + np = 1L, + blockTypeID, + verbose = FALSE +) +} +\arguments{ +\item{pedStudy}{a \code{data.frame} with those mandatory columns: "Name.ID", +"Case.ID", "Sample.Type", "Diagnosis", "Source". All columns must be in +\code{character} strings (no factor). The \code{data.frame} +must contain the information for all the samples passed in the +\code{listSamples} parameter. Only \code{filePedRDS} or \code{pedStudy} +can be defined.} + +\item{studyDF}{a \code{data.frame} containing the information about the +study associated to the analysed sample(s). The \code{data.frame} must have +those 3 columns: "study.id", "study.desc", "study.platform". All columns +must be in \code{character} strings (no factor).} + +\item{pathProfileGDS}{a \code{character} string representing the path to +the directory where the GDS Profile files will be created. +Default: \code{NULL}.} + +\item{pathGeno}{a \code{character} string representing the path to the +directory containing the VCF output of SNP-pileup for each sample. The +SNP-pileup files must be compressed (gz files) and have the name identifiers +of the samples. A sample with "Name.ID" identifier would have an +associated file called +if genoSource is "VCF", then "Name.ID.vcf.gz", +if genoSource is "generic", then "Name.ID.generic.txt.gz" +if genoSource is "snp-pileup", then "Name.ID.txt.gz".} + +\item{pathOut}{a \code{character} string representing the path to +the directory where the output files are created.} + +\item{fileReferenceGDS}{a \code{character} string representing the file +name of the Population Reference GDS file. The file must exist.} + +\item{fileReferenceAnnotGDS}{a \code{character} string representing the +file name of the Population Reference GDS Annotation file. The file +must exist.} + +\item{chrInfo}{a \code{vector} of positive \code{integer} values +representing the length of the chromosomes. See 'details' section.} + +\item{syntheticRefDF}{a \code{data.frame} containing a subset of +reference profiles for each sub-population present in the Reference GDS +file. The \code{data.frame} must have those columns: +\describe{ +\item{sample.id}{ a \code{character} string representing the sample +identifier. } +\item{pop.group}{ a \code{character} string representing the +subcontinental population assigned to the sample. } +\item{superPop}{ a \code{character} string representing the +super-population assigned to the sample. } +}} + +\item{genoSource}{a \code{character} string with two possible values: +'snp-pileup', 'generic' or 'VCF'. It specifies if the genotype files +are generated by snp-pileup (Facets) or are a generic format CSV file +with at least those columns: +'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. +The 'Count' is the depth at the specified position; +'FileR' is the depth of the reference allele and +'File1A' is the depth of the specific alternative allele. +Finally the file can be a VCF file with at least those genotype +fields: GT, AD, DP.} + +\item{np}{a single positive \code{integer} specifying the number of +threads to be used. Default: \code{1L}.} + +\item{blockTypeID}{a \code{character} string corresponding to the block +type used to extract the block identifiers. The block type must be +present in the GDS Reference Annotation file.} + +\item{verbose}{a \code{logical} indicating if messages should be printed +to show how the different steps in the function. Default: \code{FALSE}.} +} +\value{ +The integer \code{0L} when successful. See details section for +more information about the generated output files. +} +\description{ +This function runs most steps leading to the ancestry inference +call on a specific RNA profile. First, the function creates the +Profile GDS file for the specific profile using the information from a +RDS Sample description file and the Population Reference GDS file. +} +\details{ +The runExomeAncestry() function generates 3 types of files +in the OUTPUT directory. +\describe{ +\item{Ancestry Inference}{ The ancestry inference CSV file +(".Ancestry.csv" file)} +\item{Inference Informaton}{ The inference information RDS file +(".infoCall.rds" file)} +\item{Synthetic Information}{ The parameter information RDS files +from the synthetic inference ("KNN.synt.*.rds" files in a sub-directory)} +} + +In addition, a sub-directory (named using the profile ID) is +also created. +} +\examples{ + +## Required library for GDS +library(SNPRelate) + +## Path to the demo 1KG GDS file is located in this package +dataDir <- system.file("extdata", package="RAIDS") + +################################################################# +## Load the information about the profile +################################################################# +data(demoPedigreeEx1) +head(demoPedigreeEx1) + +################################################################# +## The 1KG GDS file and the 1KG SNV Annotation GDS file +## need to be located in the same directory +## Note that the 1KG GDS file used for this example is a +## simplified version and CANNOT be used for any real analysis +################################################################# +path1KG <- file.path(dataDir, "tests") + +fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") +fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") + +################################################################# +## The Sample SNP pileup files (one per sample) need +## to be located in the same directory. +################################################################# +pathGeno <- file.path(dataDir, "example", "snpPileup") + +################################################################# +## The path where the Profile GDS Files (one per sample) +## will be created need to be specified. +################################################################# +pathProfileGDS <- file.path(tempdir(), "out.tmp") + +pathOut <- file.path(tempdir(), "res.out") + +################################################################# +## A data frame containing general information about the study +## is also required. The data frame must have +## those 3 columns: "studyID", "study.desc", "study.platform" +################################################################# +studyDF <- data.frame(study.id="MYDATA", + study.desc="Description", + study.platform="PLATFORM", + stringsAsFactors=FALSE) + +#################################################################### +## Fix seed to ensure reproducible results +#################################################################### +set.seed(3043) + +gds1KG <- snpgdsOpen(fileReferenceGDS) +dataRef <- select1KGPop(gds1KG, nbProfiles=2L) +closefn.gds(gds1KG) + +## Required library for this example to run correctly +if (requireNamespace("GenomeInfoDb", quietly=TRUE) && + requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { + + ## Chromosome length information + ## chr23 is chrX, chr24 is chrY and chrM is 25 + chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] + + \donttest{ + + runRNAAncestry(pedStudy=demoPedigreeEx1, studyDF=studyDF, + pathProfileGDS=pathProfileGDS, + pathGeno=pathGeno, + pathOut=pathOut, + fileReferenceGDS=fileReferenceGDS, + fileReferenceAnnotGDS=fileAnnotGDS, + chrInfo=chrInfo, + syntheticRefDF=dataRef, + blockTypeID="GeneS.Ensembl.Hsapiens.v86", + genoSource="snp-pileup") + + unlink(pathProfileGDS, recursive=TRUE, force=TRUE) + unlink(pathOut, recursive=TRUE, force=TRUE) + + } +} + +} +\references{ +Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, +Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution +of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. +doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} diff --git a/man/runWrapperAncestry.Rd b/man/runWrapperAncestry.Rd new file mode 100644 index 000000000..7f27cc776 --- /dev/null +++ b/man/runWrapperAncestry.Rd @@ -0,0 +1,218 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/processStudy_internal.R +\encoding{UTF-8} +\name{runWrapperAncestry} +\alias{runWrapperAncestry} +\title{Run most steps leading to the ancestry inference call +on a specific profile (RNA or DNA)} +\usage{ +runWrapperAncestry( + pedStudy, + studyDF, + pathProfileGDS, + pathGeno, + pathOut, + fileReferenceGDS, + fileReferenceAnnotGDS, + chrInfo, + syntheticRefDF, + genoSource = c("snp-pileup", "generic", "VCF"), + studyType = c("DNA", "RNA"), + np = 1L, + blockTypeID = NULL, + verbose = FALSE +) +} +\arguments{ +\item{pedStudy}{a \code{data.frame} with those mandatory columns: "Name.ID", +"Case.ID", "Sample.Type", "Diagnosis", "Source". All columns must be in +\code{character} strings (no factor). The \code{data.frame} +must contain the information for all the samples passed in the +\code{listSamples} parameter. Only \code{filePedRDS} or \code{pedStudy} +can be defined.} + +\item{studyDF}{a \code{data.frame} containing the information about the +study associated to the analysed sample(s). The \code{data.frame} must have +those 3 columns: "study.id", "study.desc", "study.platform". All columns +must be in \code{character} strings (no factor).} + +\item{pathProfileGDS}{a \code{character} string representing the path to +the directory where the GDS Profile files will be created. +Default: \code{NULL}.} + +\item{pathGeno}{a \code{character} string representing the path to the +directory containing the VCF output of SNP-pileup for each sample. The +SNP-pileup files must be compressed (gz files) and have the name identifiers +of the samples. A sample with "Name.ID" identifier would have an +associated file called +if genoSource is "VCF", then "Name.ID.vcf.gz", +if genoSource is "generic", then "Name.ID.generic.txt.gz" +if genoSource is "snp-pileup", then "Name.ID.txt.gz".} + +\item{pathOut}{a \code{character} string representing the path to +the directory where the output files are created.} + +\item{fileReferenceGDS}{a \code{character} string representing the file +name of the Reference GDS file. The file must exist.} + +\item{fileReferenceAnnotGDS}{a \code{character} string representing the +file name of the Reference GDS Annotation file. The file must exist.} + +\item{chrInfo}{a \code{vector} of positive \code{integer} values +representing the length of the chromosomes. See 'details' section.} + +\item{syntheticRefDF}{a \code{data.frame} containing a subset of +reference profiles for each sub-population present in the Reference GDS +file. The \code{data.frame} must have those columns: +\describe{ +\item{sample.id}{ a \code{character} string representing the sample +identifier. } +\item{pop.group}{ a \code{character} string representing the +subcontinental population assigned to the sample. } +\item{superPop}{ a \code{character} string representing the +super-population assigned to the sample. } +}} + +\item{genoSource}{a \code{character} string with two possible values: +'snp-pileup', 'generic' or 'VCF'. It specifies if the genotype files +are generated by snp-pileup (Facets) or are a generic format CSV file +with at least those columns: +'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. +The 'Count' is the depth at the specified position; +'FileR' is the depth of the reference allele and +'File1A' is the depth of the specific alternative allele. +Finally the file can be a VCF file with at least those genotype +fields: GT, AD, DP.} + +\item{studyType}{a \code{character} string representing the type of study. +The possible choices are: "DNA" and "RNA". The type of study affects the +way the estimation of the allelic fraction is done. Default: \code{"DNA"}.} + +\item{np}{a single positive \code{integer} specifying the number of +threads to be used. Default: \code{1L}.} + +\item{blockTypeID}{a \code{character} string corresponding to the block +type used to extract the block identifiers. The block type must be +present in the GDS Reference Annotation file.} + +\item{verbose}{a \code{logical} indicating if messages should be printed +to show how the different steps in the function. Default: \code{FALSE}.} +} +\value{ +The integer \code{0L} when successful. See details section for +more information about the generated output files. +} +\description{ +This function runs most steps leading to the ancestry inference +call on a specific profile. First, the function creates the Profile GDS file +for the specific profile using the information from a RDS Sample +description file and the Population reference GDS file. +} +\details{ +The runWrapperAncestry() function generates 3 types of files +in the \code{pathOut} directory. +\describe{ +\item{Ancestry Inference}{ The ancestry inference CSV file +(".Ancestry.csv" file)} +\item{Inference Informaton}{ The inference information RDS file +(".infoCall.rds" file)} +\item{Synthetic Information}{ The parameter information RDS files +from the synthetic inference ("KNN.synt.*.rds" files in a sub-directory)} +} + +In addition, a sub-directory (named using the profile ID) is +also created. +} +\examples{ + +## Required library for GDS +library(SNPRelate) + +## Path to the demo 1KG GDS file is located in this package +dataDir <- system.file("extdata", package="RAIDS") + +################################################################# +## Load the information about the profile +################################################################# +data(demoPedigreeEx1) +head(demoPedigreeEx1) + +################################################################# +## The 1KG GDS file and the 1KG SNV Annotation GDS file +## need to be located in the same directory +## Note that the 1KG GDS file used for this example is a +## simplified version and CANNOT be used for any real analysis +################################################################# +path1KG <- file.path(dataDir, "tests") + +fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") +fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") + +################################################################# +## The Sample SNP pileup files (one per sample) need +## to be located in the same directory. +################################################################# +pathGeno <- file.path(dataDir, "example", "snpPileup") + +################################################################# +## The path where the Profile GDS Files (one per sample) +## will be created need to be specified. +################################################################# +pathProfileGDS <- file.path(tempdir(), "out.tmp") + +pathOut <- file.path(tempdir(), "res.out") + +################################################################# +## A data frame containing general information about the study +## is also required. The data frame must have +## those 3 columns: "studyID", "study.desc", "study.platform" +################################################################# +studyDF <- data.frame(study.id="MYDATA", + study.desc="Description", + study.platform="PLATFORM", + stringsAsFactors=FALSE) + +#################################################################### +## Fix seed to ensure reproducible results +#################################################################### +set.seed(3043) + +gds1KG <- snpgdsOpen(fileReferenceGDS) +dataRef <- select1KGPop(gds1KG, nbProfiles=2L) +closefn.gds(gds1KG) + +## Required library for this example to run correctly +if (requireNamespace("GenomeInfoDb", quietly=TRUE) && + requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { + + ## Chromosome length information + ## chr23 is chrX, chr24 is chrY and chrM is 25 + chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] + + \dontrun{ + + RAIDS:::runWrapperAncestry(pedStudy=demoPedigreeEx1, studyDF=studyDF, + pathProfileGDS=pathProfileGDS, + pathGeno=pathGeno, pathOut=pathOut, + fileReferenceGDS=fileReferenceGDS, + fileReferenceAnnotGDS=fileAnnotGDS, + chrInfo=chrInfo, syntheticRefDF=dataRef, + studyType="DNA", genoSource="snp-pileup") + + unlink(pathProfileGDS, recursive=TRUE, force=TRUE) + unlink(pathOut, recursive=TRUE, force=TRUE) + + } +} + +} +\references{ +Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, +Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution +of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. +doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} +\keyword{internal} diff --git a/man/selParaPCAUpQuartile.Rd b/man/selParaPCAUpQuartile.Rd index db96ee1c8..57effa389 100644 --- a/man/selParaPCAUpQuartile.Rd +++ b/man/selParaPCAUpQuartile.Rd @@ -19,7 +19,7 @@ selParaPCAUpQuartile( } \arguments{ \item{matKNN}{a \code{data.frame} containing the inferred ancestry for the -synthetic profiles for different _K_ and _D_ values. The \code{data.frame} +synthetic profiles for different \emph{K} and \emph{D} values. The \code{data.frame} must contained those columns: "sample.id", "D", "K" and the fourth column name must correspond to the \code{predCall} argument.} @@ -41,22 +41,22 @@ argument.} the list of possible ancestry assignations.} \item{kList}{a \code{vector} of \code{integer} representing the list of -values tested for the _K_ parameter. The _K_ parameter represents the +values tested for the \emph{K} parameter. The \emph{K} parameter represents the number of neighbors used in the K-nearest neighbor analysis. Default: \code{seq(3,15,1)}.} \item{pcaList}{a \code{vector} of \code{integer} representing the list of -values tested for the _D_ parameter. The _D_ parameter represents the +values tested for the \emph{D} parameter. The \emph{D} parameter represents the number of dimensions used in the PCA analysis. Default: \code{seq(2,15,1)}.} } \value{ a \code{list} containing 5 entries: -\itemize{ -\item{\code{dfPCA}} { a \code{data.frame} containing statistical results +\describe{ +\item{\code{dfPCA}}{ a \code{data.frame} containing statistical results on all combined synthetic results done with a fixed value of \code{D} (the number of dimensions). The \code{data.frame} contains those columns: -\itemize{ +\describe{ \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the number of dimensions).} \item{\code{median}}{ a \code{numeric} representing the median of the @@ -72,11 +72,11 @@ combination of the fixed \code{D} value and all tested \code{K} values. } (the number of neighbors) for a fixed \code{D} value. } } } -\item{\code{dfPop}} { a \code{data.frame} containing statistical results on +\item{\code{dfPop}}{ a \code{data.frame} containing statistical results on all combined synthetic results done with different values of \code{D} (the number of dimensions) and \code{K} (the number of neighbors). The \code{data.frame} contains those columns: -\itemize{ +\describe{ \item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the number of dimensions).} \item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the @@ -92,11 +92,11 @@ of the confusion matrix obtained by grouping all the synthetic results for the specified values of \code{D} and \code{K}.} } } -\item{\code{D}} { a \code{numeric} representing the optimal \code{D} value +\item{\code{D}}{ a \code{numeric} representing the optimal \code{D} value (the number of dimensions) for the specific profile.} -\item{\code{K}} { a \code{numeric} representing the optimal \code{K} value +\item{\code{K}}{ a \code{numeric} representing the optimal \code{K} value (the number of neighbors) for the specific profile.} -\item{\code{listD}} { a \code{numeric} representing the optimal \code{D} +\item{\code{listD}}{ a \code{numeric} representing the optimal \code{D} values (the number of dimensions) for the specific profile. More than one \code{D} is possible.} } @@ -111,21 +111,20 @@ accuracy. } \examples{ -dataDirRes <- system.file("extdata/demoAncestryCall", package="RAIDS") - -## The inferred ancestry results for the synthetic data using different -## values of D and K -matKNN <- readRDS(file.path(dataDirRes, "matKNN.RDS")) - -## The known ancestry from the reference profiles used to generate the +## Loading demo dataset containing pedigree information for synthetic +## profiles and known ancestry of the profiles used to generate the ## synthetic profiles -syntheticInfo <- readRDS(file.path(dataDirRes, "pedSyn.RDS")) +data(pedSynthetic) + +## Loading demo dataset containing the inferred ancestry results +## for the synthetic data +data(matKNNSynthetic) ## Compile all the results for ancestry inference done on the ## synthetic profiles for different D and K values ## Select the optimal D and K values -results <- RAIDS:::selParaPCAUpQuartile(matKNN=matKNN, - pedCall=syntheticInfo, refCall="superPop", predCall="SuperPop", +results <- RAIDS:::selParaPCAUpQuartile(matKNN=matKNNSynthetic, + pedCall=pedSynthetic, refCall="superPop", predCall="SuperPop", listCall=c("EAS", "EUR", "AFR", "AMR", "SAS"), kList=seq(3,15,1), pcaList=seq(2,15,1)) results$D diff --git a/man/select1KGPop.Rd b/man/select1KGPop.Rd index 7369badd6..80a60bc1d 100644 --- a/man/select1KGPop.Rd +++ b/man/select1KGPop.Rd @@ -21,12 +21,12 @@ subcontinental population will correspond to the size of this population.} } \value{ a \code{data.frame} containing those columns: -\itemize{ -\item{sample.id} { a \code{character} string representing the sample +\describe{ +\item{sample.id}{ a \code{character} string representing the sample identifier. } -\item{pop.group} { a \code{character} string representing the +\item{pop.group}{ a \code{character} string representing the subcontinental population assigned to the sample. } -\item{superPop} { a \code{character} string representing the +\item{superPop}{ a \code{character} string representing the super-population assigned to the sample. } } } @@ -48,8 +48,8 @@ nbProfiles <- 5L ## Open 1KG GDS Demo file ## This file only one superpopulation (for demonstration purpose) dataDir <- system.file("extdata", package="RAIDS") -fileGDS <- file.path(dataDir, "gds1KG.gds") -gdsFileOpen <- openfn.gds(fileGDS) +fileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") +gdsFileOpen <- openfn.gds(fileGDS, readonly=TRUE) ## Extract a selected number of random samples ## for each subcontinental population diff --git a/man/select1KGPopForSynthetic.Rd b/man/select1KGPopForSynthetic.Rd new file mode 100644 index 000000000..d6c519c98 --- /dev/null +++ b/man/select1KGPopForSynthetic.Rd @@ -0,0 +1,64 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/synthetic.R +\encoding{UTF-8} +\name{select1KGPopForSynthetic} +\alias{select1KGPopForSynthetic} +\title{Random selection of a specific number of reference profiles in each +subcontinental population present in the 1KG GDS file ( same as select1KGPop +but the function doesn't need gds object as parameters but the file name +of the referenceGDS )} +\usage{ +select1KGPopForSynthetic(fileReferenceGDS, nbProfiles) +} +\arguments{ +\item{fileReferenceGDS}{a \code{character} string representing the file +name of the Reference GDS file. The file must exist.} + +\item{nbProfiles}{a single positive \code{integer} representing the number +of samples that will be selected for each subcontinental population present +in the 1KG GDS file. If the number of samples in a specific subcontinental +population is smaller than the \code{nbProfiles}, the number of samples +selected in this +subcontinental population will correspond to the size of this population.} +} +\value{ +a \code{data.frame} containing those columns: +\describe{ +\item{sample.id}{ a \code{character} string representing the sample +identifier. } +\item{pop.group}{ a \code{character} string representing the +subcontinental population assigned to the sample. } +\item{superPop}{ a \code{character} string representing the +super-population assigned to the sample. } +} +} +\description{ +The function randomly selects a fixed number of reference +for each subcontinental population present in the 1KG GDS file. When a +subcontinental population has less samples than the fixed number, all +samples from the subcontinental population are selected. +} +\examples{ + +## Required library +library(gdsfmt) + +## The number of samples needed by subcontinental population +## The number is small for demonstration purpose +nbProfiles <- 5L + +## 1KG GDS Demo file +## This file only one superpopulation (for demonstration purpose) +dataDir <- system.file("extdata", package="RAIDS") +fileGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") + +## Extract a selected number of random samples +## for each subcontinental population +## In the 1KG GDS Demo file, there is one subcontinental population +dataR <- select1KGPopForSynthetic(fileReferenceGDS=fileGDS, nbProfiles=nbProfiles) + + +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} diff --git a/man/snpPositionDemo.Rd b/man/snpPositionDemo.Rd new file mode 100644 index 000000000..a4774fba5 --- /dev/null +++ b/man/snpPositionDemo.Rd @@ -0,0 +1,112 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RAIDS.R +\docType{data} +\name{snpPositionDemo} +\alias{snpPositionDemo} +\title{A small \code{data.frame} containing the +SNV information.} +\format{ +The \code{data.frame} containing the information about the +synthetic profiles. The \code{data.frame} contains 4 columns: +\describe{ +\item{\code{cnt.tot}}{ a \code{integer} representing the number of reads at +the SNV position.} +\item{\code{cnt.ref}}{ a \code{integer} representing the number of reads +corresponding to the reference at the SNV position.} +\item{\code{cnt.alt}}{ a \code{integer} representing the number of reads +different than the reference at the SNV position.} +\item{\code{snp.pos}}{ a \code{integer} representing the position of the +SNV on the chromosome.} +\item{\code{snp.chr}}{ a \code{integer} representing the chromosome on which +the SNV is located.} +\item{\code{normal.geno}}{ a \code{integer} representing the genotype +(0=wild-type reference; 1=heterozygote; 2=homozygote alternative; +3=unkown).} +\item{\code{pruned}}{ a \code{logical} indicated if the SNV is pruned.} +\item{\code{snp.index}}{ a \code{integer} representing the index of the +SNV in the reference SNV GDS file.} +\item{\code{keep}}{ a \code{logical} indicated if the genotype +exists for the SNV.} +\item{\code{hetero}}{ a \code{logical} indicated if the SNV is +heterozygote.} +\item{\code{homo}}{ a \code{logical} indicated if the SNV is homozygote.} +\item{\code{block.id}}{ a \code{integer} representing the block identifier +associated to the current SNV.} +\item{\code{phase}}{ a \code{integer} representing the block identifier +associated to the current SNV.} +\item{\code{lap}}{ a \code{numeric} representing the lower allelic +fraction.} +\item{\code{LOH}}{ a \code{integer} indicating if the SNV is in an LOH +region (0=not LOH, 1=in LOH).} +\item{\code{imbAR}}{ a \code{integer} indicating if the SNV is in an +imbalanced region (-1=not classified as imbalanced or LOH, 0=in LOH; +1=tested positive for imbalance in at least 1 window).} +\item{\code{freq}}{ a \code{numeric} representing the frequency of the +variant in the the reference.} +} +} +\usage{ +data(snpPositionDemo) +} +\value{ +The \code{data.frame} containing the information about the +synthetic profiles. The \code{data.frame} contains 4 columns: +\describe{ +\item{\code{cnt.tot}}{ a \code{integer} representing the number of reads at +the SNV position.} +\item{\code{cnt.ref}}{ a \code{integer} representing the number of reads +corresponding to the reference at the SNV position.} +\item{\code{cnt.alt}}{ a \code{integer} representing the number of reads +different than the reference at the SNV position.} +\item{\code{snp.pos}}{ a \code{integer} representing the position of the +SNV on the chromosome.} +\item{\code{snp.chr}}{ a \code{integer} representing the chromosome on which +the SNV is located.} +\item{\code{normal.geno}}{ a \code{integer} representing the genotype +(0=wild-type reference; 1=heterozygote; 2=homozygote alternative; 3=unkown).} +\item{\code{pruned}}{ a \code{logical} indicated if the SNV is pruned.} +\item{\code{snp.index}}{ a \code{integer} representing the index of the +SNV in the reference SNV GDS file.} +\item{\code{keep}}{ a \code{logical} indicated if the genotype +exists for the SNV.} +\item{\code{hetero}}{ a \code{logical} indicated if the SNV is heterozygote.} +\item{\code{homo}}{ a \code{logical} indicated if the SNV is homozygote.} +\item{\code{block.id}}{ a \code{integer} representing the block identifier +associated to the current SNV.} +\item{\code{phase}}{ a \code{integer} representing the block identifier +associated to the current SNV.} +\item{\code{lap}}{ a \code{numeric} representing the lower allelic fraction.} +\item{\code{LOH}}{ a \code{integer} indicating if the SNV is in an LOH region +(0=not LOH, 1=in LOH).} +\item{\code{imbAR}}{ a \code{integer} indicating if the SNV is in an +imbalanced region (-1=not classified as imbalanced or LOH, 0=in LOH; +1=tested positive for imbalance in at least 1 window).} +\item{\code{freq}}{ a \code{numeric} representing the frequency of the +variant in the the reference.} +} +} +\description{ +The object is a \code{data.frame} with 17 columns. +} +\details{ +This dataset can be +used to test the \code{\link{calcAFMLRNA}} and \code{\link{tableBlockAF}} +internal functions. +} +\examples{ + +## Loading demo dataset containing SNV information +data(snpPositionDemo) + +## Only use a subset of heterozygote SNVs related to one block +subset <- snpPositionDemo[which(snpPositionDemo$block.id == 2750 & + snpPositionDemo$hetero), c("cnt.ref", "cnt.alt", "phase")] + +## Compute the log likelihood ratio based on the coverage of +## each allele in a specific block +result <- RAIDS:::calcAFMLRNA(subset) +head(result) + + +} +\keyword{datasets} diff --git a/man/snvListVCF.Rd b/man/snvListVCF.Rd index b0891e545..13e69e1da 100644 --- a/man/snvListVCF.Rd +++ b/man/snvListVCF.Rd @@ -6,14 +6,15 @@ \title{Generate a VCF with the information from the SNPs that pass a cut-off threshold} \usage{ -snvListVCF(gdsReference, fileOUT, offset = 0L, freqCutoff = NULL) +snvListVCF(gdsReference, fileOut, offset = 0L, freqCutoff = NULL) } \arguments{ \item{gdsReference}{an object of class \code{\link[gdsfmt]{gds.class}} (a GDS file), the 1KG GDS file.} -\item{fileOUT}{a \code{character} string representing the path and file -name of the VCF file that will be created wit the retained SNP information.} +\item{fileOut}{a \code{character} string representing the path and file +name of the VCF file that will be created wit the retained SNP information. +The file should have the ".vcf" extension.} \item{offset}{a single \code{integer} that is added to the SNP position to switch from 0-based to 1-based coordinate when needed (or reverse). @@ -23,7 +24,7 @@ Default: \code{0L}.} keep a SNP. If \code{NULL}, all SNPs are retained. Default: \code{NULL}.} } \value{ -The integer \code{0} when successful. +The integer \code{0L} when successful. } \description{ This function extract the SNPs that pass a frequency cut-off @@ -33,18 +34,23 @@ a VCF file. } \examples{ +## Required library +library(gdsfmt) + ## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") -## Demo 1KG GDS file -fileGDS <- openfn.gds(file.path(dataDir, "1KG_Demo.gds")) +## Demo 1KG Reference GDS file +fileGDS <- openfn.gds(file.path(dataDir, + "PopulationReferenceDemo.gds")) -## Output VCF file that will be created -vcfFile <- file.path(dataDir, "Demo_TMP_01.vcf") +## Output VCF file that will be created (temporary) +vcfFile <- file.path(tempdir(), "Demo_TMP_01.vcf") ## Create a VCF file with the SNV dataset present in the GDS file ## No cutoff on frequency, so all SNVs are saved -snvListVCF(gdsReference=fileGDS, fileOUT=vcfFile, offset=0L, freqCutoff=NULL) +snvListVCF(gdsReference=fileGDS, fileOut=vcfFile, offset=0L, + freqCutoff=NULL) ## Close GDS file (IMPORTANT) closefn.gds(fileGDS) diff --git a/man/splitSelectByPop.Rd b/man/splitSelectByPop.Rd index fdeb556d3..586bbd65c 100644 --- a/man/splitSelectByPop.Rd +++ b/man/splitSelectByPop.Rd @@ -9,12 +9,12 @@ splitSelectByPop(dataRef) } \arguments{ \item{dataRef}{a \code{data.frame} containing those columns: -\itemize{ -\item{sample.id} { a \code{character} string representing the sample +\describe{ +\item{sample.id}{ a \code{character} string representing the sample identifier. } -\item{pop.group} { a \code{character} string representing the +\item{pop.group}{ a \code{character} string representing the subcontinental population assigned to the sample. } -\item{superPop} { a \code{character} string representing the +\item{superPop}{ a \code{character} string representing the super-population assigned to the sample. } }} } diff --git a/man/syntheticGeno.Rd b/man/syntheticGeno.Rd index c2d9e1af8..dd64689c9 100644 --- a/man/syntheticGeno.Rd +++ b/man/syntheticGeno.Rd @@ -49,7 +49,7 @@ represents the frequency of phase switching in the synthetic profiles, Default: \code{0.01}.} \item{minProb}{a single positive \code{numeric} between 0 and 1 that -represents the probability that the genotype is correct. TODO. +represents the probability that the genotype is correct. Default: \code{0.999}.} \item{seqError}{a single positive \code{numeric} between 0 and 1 @@ -79,42 +79,36 @@ library(gdsfmt) ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata/tests", package="RAIDS") -## Copy the Profile GDS file demo that has been pruned and annotated -## into a test directory (deleted after the example has been run) -dataDirAllelicFraction <- file.path(system.file("extdata", package="RAIDS"), - "demoAllelicFraction") -dir.create(dataDirAllelicFraction, showWarnings=FALSE, - recursive=FALSE, mode="0777") - -## Profile GDS file -fileNameGDS <- file.path(dataDirAllelicFraction, "ex1.gds") +## Profile GDS file (temporary) +fileNameGDS <- file.path(tempdir(), "ex1.gds") +## Copy the Profile GDS file demo that has been pruned and annotated file.copy(file.path(dataDir, "ex1_demo_with_pruning_and_1KG_annot.gds"), fileNameGDS) ## Information about the synthetic data set syntheticStudyDF <- data.frame(study.id="MYDATA.Synthetic", - study.desc="MYDATA synthetic data", study.platform="PLATFORM", - stringsAsFactors=FALSE) + study.desc="MYDATA synthetic data", study.platform="PLATFORM", + stringsAsFactors=FALSE) ## Add information related to the synthetic profiles into the Profile GDS prepSynthetic(fileProfileGDS=fileNameGDS, - listSampleRef=c("HG00243", "HG00150"), profileID="ex1", - studyDF=syntheticStudyDF, nbSim=1L, prefix="synthTest", - verbose=FALSE) + listSampleRef=c("HG00243", "HG00150"), profileID="ex1", + studyDF=syntheticStudyDF, nbSim=1L, prefix="synthTest", + verbose=FALSE) ## The 1KG files gds1KG <- snpgdsOpen(file.path(dataDir, - "ex1_good_small_1KG_GDS.gds")) + "ex1_good_small_1KG.gds")) gds1KGAnnot <- openfn.gds(file.path(dataDir, - "ex1_good_small_1KG_Annot_GDS.gds")) + "ex1_good_small_1KG_Annot.gds")) ## Generate the synthetic profiles and add them into the Profile GDS syntheticGeno(gdsReference=gds1KG, gdsRefAnnot=gds1KGAnnot, - fileProfileGDS=fileNameGDS, profileID="ex1", - listSampleRef=c("HG00243", "HG00150"), nbSim=1, - prefix="synthTest", - pRecomb=0.01, minProb=0.999, seqError=0.001) + fileProfileGDS=fileNameGDS, profileID="ex1", + listSampleRef=c("HG00243", "HG00150"), nbSim=1, + prefix="synthTest", + pRecomb=0.01, minProb=0.999, seqError=0.001) ## Open Profile GDS file profileGDS <- openfn.gds(fileNameGDS) @@ -126,9 +120,9 @@ closefn.gds(profileGDS) closefn.gds(gds1KG) closefn.gds(gds1KGAnnot) -## Unlink Profile GDS file (created for demo purpose) -unlink(file.path(dataDirAllelicFraction, "ex1.gds")) -unlink(dataDirAllelicFraction) +## Remove Profile GDS file (created for demo purpose) +unlink(fileNameGDS, force=TRUE) + } \author{ diff --git a/man/tableBlockAF.Rd b/man/tableBlockAF.Rd index b2a321c73..9b7e2747d 100644 --- a/man/tableBlockAF.Rd +++ b/man/tableBlockAF.Rd @@ -3,26 +3,66 @@ \encoding{UTF-8} \name{tableBlockAF} \alias{tableBlockAF} -\title{TODO} +\title{Compile the information about the SNVs +for each block} \usage{ -tableBlockAF(snp.pos) +tableBlockAF(snpPos) } \arguments{ -\item{snp.pos}{For a specific chromosome a \code{data.frame} with lap for -the SNV dataset with -coverage > \code{minCov}.} +\item{snpPos}{a \code{data.frame} with lower allelic fraction (lap) for +the SNVs with coverage > \code{minCov}, for a specific chromosome.} } \value{ -TODO a \code{data.frame} with the information related to allelic -fraction for each block gene +a \code{data.frame} containing only heterozygote +SNV information. The +\code{data.frame} contain those columns: +\describe{ +\item{block}{ a single \code{integer} representing the unique identifier +of the block.} +\item{aRF}{ a single \code{numeric} representing the final allelic +fraction; not computed yet, \code{-1} value assigned to all entries.} +\item{aFraction}{a single \code{integer} representing the possible allelic +fraction in absence of loss of heterozygosity (LOH).} +\item{lR}{ a single \code{integer} representing the coverage for +the alternative allele.} +\item{nPhase}{ a single \code{integer} representing the number of SNV +phases.} +\item{sumAlleleLow}{ a single \code{integer} representing the sum of the +alleles with the less coverage.} +\item{sumAlleleHigh}{ a single \code{integer} representing the sum of +the alleles with more coverage.} +\item{lH}{ a single \code{numeric} for the homozygotes log10 of the product +frequencies of the allele not found in the profile (not a probability).} +\item{lM}{ a single \code{numeric} log10 product frequency allele +in population.} +\item{lRhomo}{a single \code{numeric} representing the score +\code{lH} - \code{lM}.} +\item{nbHomo}{ a single \code{integer} representing the number of +homozygote SNVs per block.} +\item{nbKeep}{ a single \code{integer} representing the number of +SNVs retained per block.} +\item{nbHetero}{ a single \code{integer} representing the number of +heterozygote SNVs per block.} +} } \description{ -TODO +The function evaluates a score +about loss of heterozygosity and allelic fraction for each block. It +generates specific information about the variants in the block, like the +number of homozygotes or heterozygotes. +In the case of RNA-seq, the blocks are genes. } \examples{ -# TODO -gds <- "Demo GDS TODO" +## Loading demo dataset containing SNV information +data(snpPositionDemo) + +## Retain SNVs on chromosome 1 +subset <- snpPositionDemo[which(snpPositionDemo$snp.chr == 1),] + +##Compile the information about the SNVs for each block +result <- RAIDS:::tableBlockAF(subset) +head(result) } \author{ diff --git a/man/testAlleleFractionChange.Rd b/man/testAlleleFractionChange.Rd index d200209b3..6d5412e80 100644 --- a/man/testAlleleFractionChange.Rd +++ b/man/testAlleleFractionChange.Rd @@ -11,10 +11,10 @@ testAlleleFractionChange(matCov, pCutOff = -3, vMean) \arguments{ \item{matCov}{a \code{data.frame} containing only heterozygote SNVs. The \code{data.frame} must contain those columns: -\itemize{ -\item{cnt.ref} {a single \code{integer} representing the coverage for +\describe{ +\item{cnt.ref}{ a single \code{integer} representing the coverage for the reference allele.} -\item{cnt.alt} {a single \code{integer} representing the coverage for +\item{cnt.alt}{ a single \code{integer} representing the coverage for the alternative allele.} }} @@ -29,16 +29,16 @@ reference to see if there is a allelic fraction change.} } \value{ a \code{list} containing 4 entries: -\itemize{ +\describe{ \item{pWin}{ a \code{vector} of \code{numeric} representing the probability (x2) of obtaining the current alternative/(alternative+reference) ratio from a reference distribution specified by user.} -\item{p}{a \code{integer} indicating if all SNVs tested +\item{p}{ a \code{integer} indicating if all SNVs tested positive (1=TRUE, 0=FALSE). The cut-off is 0.5. } -\item{pCut}{a \code{integer} indicating if all SNVs tested +\item{pCut}{ a \code{integer} indicating if all SNVs tested positive (1=TRUE, 0-FALSE). } -\item{pCut1}{a \code{integer} indicating if the region tested +\item{pCut1}{ a \code{integer} indicating if the region tested positive (1=TRUE, 0=FALSE) for allelic ratio change.} } } diff --git a/man/testEmptyBox.Rd b/man/testEmptyBox.Rd index d7c838bad..e5520967f 100644 --- a/man/testEmptyBox.Rd +++ b/man/testEmptyBox.Rd @@ -11,10 +11,10 @@ testEmptyBox(matCov, pCutOff = -3) \arguments{ \item{matCov}{a \code{data.frame} containing only heterozygote SNVs. The \code{data.frame} must contain those columns: -\itemize{ -\item{cnt.ref} {a single \code{integer} representing the coverage for +\describe{ +\item{cnt.ref}{ a single \code{integer} representing the coverage for the reference allele.} -\item{cnt.alt} {a single \code{integer} representing the coverage for +\item{cnt.alt}{ a single \code{integer} representing the coverage for the alternative allele.} }} @@ -24,7 +24,7 @@ likelihood not to be imbalanced. Default: \code{-3}.} } \value{ a \code{list} containing 4 entries: -\itemize{ +\describe{ \item{pWin}{ a \code{vector} of \code{numeric} representing the probability (x2) of obtaining the current alternative/(alternative+reference) ratio from a 0.5 distribution.} diff --git a/man/validateAccuracyGraphInternal.Rd b/man/validateAccuracyGraphInternal.Rd new file mode 100644 index 000000000..85a614df1 --- /dev/null +++ b/man/validateAccuracyGraphInternal.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/visualization_internal.R +\encoding{UTF-8} +\name{validateAccuracyGraphInternal} +\alias{validateAccuracyGraphInternal} +\title{Validate input parameters for createAccuracyGraph and +createAUROCGraph functions} +\usage{ +validateAccuracyGraphInternal(title, selectD, selectColor) +} +\arguments{ +\item{title}{a \code{character} string representing the title of the graph.} + +\item{selectD}{a \code{array} of \code{integer} representing the selected +PCA dimensions to plot. The length of the \code{array} cannot be more than +5 entries. The dimensions must tested by RAIDS (i.e. be present in the +RDS file).} + +\item{selectColor}{a \code{array} of \code{character} strings representing +the selected colors for the associated PCA dimensions to plot. The length +of the \code{array} must correspond to the length of the \code{selectD} +parameter. In addition, the length of the \code{array} cannot be more than +5 entries.} +} +\value{ +The function returns \code{0L} when successful. +} +\description{ +This function validates the parameters for the +\code{\link{createAccuracyGraph}} and \code{\link{createAUROCGraph}} +functions. +} +\examples{ + +## Validate parameters +RAIDS:::validateAccuracyGraphInternal(title="Accuracy Graph", + selectD=c(5, 10), selectColor=c("blue","darkblue")) + +} +\author{ +Astrid Deschênes and Pascal Belleau +} +\keyword{internal} diff --git a/man/validateAdd1KG2SampleGDS.Rd b/man/validateAdd1KG2SampleGDS.Rd index 2c9328572..175abaaaa 100644 --- a/man/validateAdd1KG2SampleGDS.Rd +++ b/man/validateAdd1KG2SampleGDS.Rd @@ -9,7 +9,8 @@ validateAdd1KG2SampleGDS(gdsReference, gdsProfileFile, currentProfile, studyID) } \arguments{ \item{gdsReference}{an object of class -\link[gdsfmt]{gds.class} (a GDS file), the opened 1KG GDS file.} +\link[gdsfmt]{gds.class} (a GDS file), the opened Population Reference +GDS file.} \item{gdsProfileFile}{a \code{character} string representing the path and file name of the Profile GDS file. The Profile GDS file must exist.} @@ -29,11 +30,15 @@ This function validates the input parameters for the } \examples{ +## Required library +library(gdsfmt) + ## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") -## The 1KG GDS file (opened) -gds1KG <- openfn.gds(file.path(dataDir, "gds1KG.gds"), readonly=TRUE) +## The 1KG Population Reference GDS demo file (opened) +gds1KG <- openfn.gds(file.path(dataDir, + "PopulationReferenceDemo.gds"), readonly=TRUE) ## The validatiion should be successful RAIDS:::validateAdd1KG2SampleGDS(gdsReference=gds1KG, diff --git a/man/validateAddStudy1Kg.Rd b/man/validateAddStudy1Kg.Rd index 7c1bf04df..c9056751f 100644 --- a/man/validateAddStudy1Kg.Rd +++ b/man/validateAddStudy1Kg.Rd @@ -30,7 +30,7 @@ expected, an error message is generated. ## Path to the demo 1KG GDS file is located in this package dataDir <- system.file("extdata", package="RAIDS") -fileReferenceGDS <- file.path(dataDir, "1KG_Demo.gds") +fileReferenceGDS <- file.path(dataDir, "PopulationReferenceDemo.gds") gds1KG <- snpgdsOpen(fileReferenceGDS) ## Path to demo Profile GDS file diff --git a/man/validateAppendStudy2GDS1KG.Rd b/man/validateAppendStudy2GDS1KG.Rd deleted file mode 100644 index 66cb4c3dd..000000000 --- a/man/validateAppendStudy2GDS1KG.Rd +++ /dev/null @@ -1,89 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/processStudy_internal.R -\encoding{UTF-8} -\name{validateAppendStudy2GDS1KG} -\alias{validateAppendStudy2GDS1KG} -\title{Validate input parameters for appendStudy2GDS1KG() function} -\usage{ -validateAppendStudy2GDS1KG( - pathGeno, - filePedRDS, - fileNameGDS, - batch, - studyDF, - listSamples, - pathProfileGDS, - genoSource, - verbose -) -} -\arguments{ -\item{pathGeno}{a \code{character} string representing the path to the -directory containing the output of SNP-pileup, a VCF Sample file, for -each sample.} - -\item{filePedRDS}{a \code{character} string representing the path to the -RDS file that contains the information about the sample to analyse.} - -\item{fileNameGDS}{a \code{character} string representing the file name of -the 1KG GDS file. The file must exist.} - -\item{batch}{a single positive \code{integer} representing the current -identifier for the batch. Beware, this field is not stored anymore.} - -\item{studyDF}{a \code{data.frame} containing the information about the -study associated to the analysed sample(s). The \code{data.frame} must have -those 3 columns: "studyID", "study.desc", "study.platform". All columns -must be in \code{character} strings.} - -\item{listSamples}{a \code{vector} of \code{character} string corresponding -to the sample identifiers that will have a GDS Sample file created. The -sample identifiers must be present in the "Name.ID" column of the RDS file -passed to the \code{filePedRDS} parameter. -If \code{NULL}, all samples in the \code{filePedRDS} are selected.} - -\item{pathProfileGDS}{a \code{character} string representing the path to -the directory where the GDS Sample files will be created.} - -\item{genoSource}{a \code{character} string with two possible values: -'snp-pileup' or 'generic'. It specifies if the genotype files -are generate by snp-pileup (Facets) or generic format csv.} - -\item{verbose}{a \code{logical} indicating if message information should be -printed.} -} -\value{ -The function returns \code{0L} when successful. -} -\description{ -This function validates the input parameters for the -\code{\link{appendStudy2GDS1KG}} function. -} -\examples{ - -## Path to the demo pedigree file is located in this package -dataDir <- system.file("extdata", package="RAIDS") - -gds1KG <- file.path(dataDir, "1KG_Demo.gds") -ped <- file.path(dataDir, "unrelatedPatientsInfo_Demo.rds") - -## The data.frame containing the information about the study -## The 3 mandatory columns: "studyID", "study.desc", "study.platform" -## The entries should be strings, not factors (stringsAsFactors=FALSE) -studyInfo <- data.frame(study.id="Pancreatic.WES", - study.desc="Pancreatic study", - study.platform="WES", - stringsAsFactors=FALSE) - -## The validatiion should be successful -RAIDS:::validateAppendStudy2GDS1KG(pathGeno=dataDir, - filePedRDS=ped, fileNameGDS=gds1KG, - batch=1L, studyDF=studyInfo, listSamples=c("HC01", "HC02"), - pathProfileGDS=dataDir, genoSource="snp-pileup", verbose=TRUE) - - -} -\author{ -Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -} -\keyword{internal} diff --git a/man/validateComputeAncestryFromSyntheticFile.Rd b/man/validateComputeAncestryFromSyntheticFile.Rd index 2efe1c63a..7082429e1 100644 --- a/man/validateComputeAncestryFromSyntheticFile.Rd +++ b/man/validateComputeAncestryFromSyntheticFile.Rd @@ -27,21 +27,22 @@ validateComputeAncestryFromSyntheticFile( } \arguments{ \item{gdsReference}{an object of class \link[gdsfmt]{gds.class} (a GDS -file), the opened 1KG GDS file.} +file), the opened Population Reference GDS file.} \item{gdsProfile}{an object of class \code{\link[gdsfmt]{gds.class}} (a GDS file), the opened Profile GDS file.} \item{listFiles}{a \code{vector} of \code{character} strings representing the name of files that contain the results of ancestry inference done on -the synthetic profiles for multiple values of _D_ and _K_. The files must +the synthetic profiles for multiple values of \emph{D} and \emph{K}. The files must exist.} \item{currentProfile}{a \code{character} string representing the profile identifier of the current profile on which ancestry will be inferred.} \item{spRef}{a \code{vector} of \code{character} strings representing the -known super population ancestry for the 1KG profiles. The 1KG profile +known super population ancestry for the 1KG profiles. The Population +Reference profile identifiers are used as names for the \code{vector}.} \item{studyIDSyn}{a \code{character} string corresponding to the study @@ -62,11 +63,11 @@ the column that will contain the inferred ancestry for the specified dataset. Default: \code{"SuperPop"}.} \item{kList}{a \code{vector} of \code{integer} representing the list of -values tested for the _K_ parameter. The _K_ parameter represents the +values tested for the \emph{K} parameter. The \emph{K} parameter represents the number of neighbors used in the K-nearest neighbor analysis.} \item{pcaList}{a \code{vector} of \code{integer} representing the list of -values tested for the _D_ parameter. The _D_ parameter represents the +values tested for the \emph{D} parameter. The \emph{D} parameter represents the number of dimensions used in the PCA analysis.} \item{algorithm}{a \code{character} string representing the algorithm used @@ -93,11 +94,15 @@ This function validates the input parameters for the } \examples{ +## Required library +library(gdsfmt) + ## Directory where demo GDS files are located dataDir <- system.file("extdata", package="RAIDS") -## The 1KG GDS file (opened) -gds1KG <- openfn.gds(file.path(dataDir, "gds1KG.gds"), readonly=TRUE) +## The 1KG Population Reference GDS demo file (opened) +gds1KG <- openfn.gds(file.path(dataDir, + "PopulationReferenceDemo.gds"), readonly=TRUE) ## The Profile GDS (opened) gdsSample <- openfn.gds(file.path(dataDir, diff --git a/man/validateComputeKNNRefSample.Rd b/man/validateComputeKNNRefSample.Rd index d591d3d78..a710e9aa4 100644 --- a/man/validateComputeKNNRefSample.Rd +++ b/man/validateComputeKNNRefSample.Rd @@ -51,20 +51,20 @@ When a parameter is not as expected, an error message is generated. } \examples{ +## Load the demo PCA on the synthetic profiles projected on the +## demo 1KG reference PCA +data(demoPCASyntheticProfiles) -## Path to the demo GDS file is located in this package -dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") +## Load the known ancestry for the demo 1KG reference profiles +data(demoKnownSuperPop1KG) -pcaSynthetic <- readRDS(file.path(dataDir, "pcaSynthetic.RDS")) +pcaSynthetic <- demoPCASyntheticProfiles pcaSynthetic$sample.id <- pcaSynthetic$sample.id[1] -## The known ancestry for the 1KG reference profiles -refKnownSuperPop <- readRDS(file.path(dataDir, "knownSuperPop1KG.RDS")) - ## The function returns 0L when all parameters are valid RAIDS:::validateComputeKNNRefSample(listEigenvector=pcaSynthetic, listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), - spRef=refKnownSuperPop, fieldPopInfAnc="Superpop", + spRef=demoKnownSuperPop1KG, fieldPopInfAnc="Superpop", kList=c(10, 11, 12), pcaList=c(13, 14, 15)) diff --git a/man/validateComputeKNNRefSynthetic.Rd b/man/validateComputeKNNRefSynthetic.Rd index af7529074..958db651f 100644 --- a/man/validateComputeKNNRefSynthetic.Rd +++ b/man/validateComputeKNNRefSynthetic.Rd @@ -60,24 +60,25 @@ When a parameter is not as expected, an error message is generated. } \examples{ +## Load the demo PCA on the synthetic profiles projected on the +## demo 1KG reference PCA +data(demoPCASyntheticProfiles) + +## Load the known ancestry for the demo 1KG reference profiles +data(demoKnownSuperPop1KG) ## Path to the demo GDS file is located in this package dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") fileProfileGDS <- file.path(dataDir, "ex1.gds") -pcaSynthetic <- readRDS(file.path(dataDir, "pcaSynthetic.RDS")) - -## The known ancestry for the 1KG reference profiles -refKnownSuperPop <- readRDS(file.path(dataDir, "knownSuperPop1KG.RDS")) - ## Open GDS files gdsProfile <- openfn.gds(fileProfileGDS) ## The function returns 0L when all parameters are valid RAIDS:::validateComputeKNNRefSynthetic(gdsProfile=gdsProfile, - listEigenvector=pcaSynthetic, + listEigenvector=demoPCASyntheticProfiles, listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), - studyIDSyn="MyStudy", spRef=refKnownSuperPop, + studyIDSyn="MyStudy", spRef=demoKnownSuperPop1KG, fieldPopInfAnc="Superpop", kList=c(10, 11, 12), pcaList=c(13, 14, 15)) diff --git a/man/validateComputePCAMultiSynthetic.Rd b/man/validateComputePCAMultiSynthetic.Rd index 6b04ccd6a..cf963d95b 100644 --- a/man/validateComputePCAMultiSynthetic.Rd +++ b/man/validateComputePCAMultiSynthetic.Rd @@ -42,6 +42,9 @@ When a parameter is not as expected, an error message is generated. } \examples{ +## Loading demo PCA on subset of 1KG reference dataset +data(demoPCA1KG) + ## Path to the demo GDS file is located in this package dataDir <- system.file("extdata/demoKNNSynthetic", package="RAIDS") fileProfileGDS <- file.path(dataDir, "ex1.gds") @@ -49,11 +52,9 @@ fileProfileGDS <- file.path(dataDir, "ex1.gds") ## Open GDS files gdsProfile <- openfn.gds(fileProfileGDS) -pca <- readRDS(file.path(dataDir, "pca1KG.RDS")) - ## The function returns 0L when all parameters are valid RAIDS:::validateComputePCAMultiSynthetic(gdsProfile=gdsProfile, - listPCA=pca, sampleRef=c("HG00246", "HG00325"), + listPCA=demoPCA1KG, sampleRef=c("HG00246", "HG00325"), studyIDSyn="MyStudy", verbose=FALSE) ## Close GDS file (it is important to always close the GDS files) diff --git a/man/validateComputePoolSyntheticAncestry.Rd b/man/validateComputePoolSyntheticAncestry.Rd deleted file mode 100644 index 9aba6e559..000000000 --- a/man/validateComputePoolSyntheticAncestry.Rd +++ /dev/null @@ -1,121 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/processStudy_internal.R -\encoding{UTF-8} -\name{validateComputePoolSyntheticAncestry} -\alias{validateComputePoolSyntheticAncestry} -\title{Validate the input parameters for computePoolSyntheticAncestry() -function} -\usage{ -validateComputePoolSyntheticAncestry( - gdsReference, - profileGDS, - profileID, - dataRef, - spRef, - studyIDSyn, - np, - listCatPop, - fieldPopIn1KG, - fieldPopInfAnc, - kList, - pcaList, - algorithm, - eigenCount, - missingRate -) -} -\arguments{ -\item{gdsReference}{an object of class \link[gdsfmt]{gds.class} -(a GDS file), the opened 1KG GDS file.} - -\item{profileGDS}{an object of class \link[gdsfmt]{gds.class} (a GDS file), -an opened Profile GDS file.} - -\item{profileID}{a single \code{character} string representing the -profile identifier.} - -\item{dataRef}{a \code{data.frame} TODO} - -\item{spRef}{TODO} - -\item{studyIDSyn}{a \code{character} string corresponding to the study -identifier. The study identifier must be present in the GDS Sample file.} - -\item{np}{a single positive \code{integer} representing the number of -threads. Default: \code{1L}.} - -\item{listCatPop}{a \code{vector} of \code{character} string -representing the list of possible ancestry assignations. Default: -\code{("EAS", "EUR", "AFR", "AMR", "SAS")}.} - -\item{fieldPopIn1KG}{a \code{character} string representing the TODO} - -\item{fieldPopInfAnc}{a \code{character} string representing the name of -the column that will contain the inferred ancestry for the specified -dataset. Default: \code{"SuperPop"}.} - -\item{kList}{a \code{vector} of \code{integer} representing the list of -values tested for the _K_ parameter. The _K_ parameter represents the -number of neighbors used in the K-nearest neighbors analysis. If -\code{NULL}, the value \code{seq(2,15,1)} is assigned. -Default: \code{seq(2,15,1)}.} - -\item{pcaList}{a \code{vector} of \code{integer} representing the list of -values tested for the _D_ parameter. The _D_ parameter represents the -number of dimensions used in the PCA analysis. If \code{NULL}, -the value \code{seq(2,15,1)} is assigned. -Default: \code{seq(2,15,1)}.} - -\item{algorithm}{a \code{character} string representing the algorithm used -to calculate the PCA. The 2 choices are "exact" (traditional exact -calculation) and "randomized" (fast PCA with randomized algorithm -introduced in Galinsky et al. 2016). Default: \code{"exact"}.} - -\item{eigen.cnt}{a single \code{integer} indicating the number of -eigenvectors that will be in the output of the \link[SNPRelate]{snpgdsPCA} -function; if 'eigen.cnt' <= 0, then all eigenvectors are returned.} - -\item{missing.rate}{a \code{numeric} value representing the threshold -missing rate at with the SNVs are discarded; the SNVs are retained in the -\link[SNPRelate]{snpgdsPCA} -with "<= missing.rate" only; if \code{NaN}, no missing threshold.} -} -\value{ -The integer \code{0L} when successful. -} -\description{ -The function validates the input parameters for the -\code{\link{computePoolSyntheticAncestry}} function. -When a parameter is not as expected, an error message is generated. -} -\examples{ - -## Path to the demo 1KG GDS file is located in this package -dataDir <- system.file("extdata/tests", package="RAIDS") -fileGDS <- file.path(dataDir, "ex1_good_small_1KG_GDS.gds") -fileProfileGDS <- file.path(dataDir, "ex1_demo.gds") - -## Open GDS files -gds1KG <- snpgdsOpen(fileGDS) -gdsProfile <- openfn.gds(fileProfileGDS) - -dataRef <- data.frame(test=c(1,2), stringAsFactro=FALSE) - -## The function returns 0L when all parameters are valid -RAIDS:::validateComputePoolSyntheticAncestry(gdsReference=gds1KG, - profileGDS=gdsProfile, profileID="SampleID", - dataRef=dataRef, spRef=NULL, studyIDSyn="MyStudy", - np=1L, listCatPop=c("EAS", "EUR", "AFR", "AMR", "SAS"), - fieldPopIn1KG="SuperPop", fieldPopInfAnc="SuperPop", - kList=seq(2,15,1), pcaList=seq(2,15,1), - algorithm="exact", eigenCount=32L, missingRate=0.025) - -## Close GDS files (it is important to always close the GDS files) -closefn.gds(gds1KG) -closefn.gds(gdsProfile) - -} -\author{ -Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -} -\keyword{internal} diff --git a/man/validateComputePoolSyntheticAncestryGr.Rd b/man/validateComputePoolSyntheticAncestryGr.Rd index 838226d42..ac2fe4724 100644 --- a/man/validateComputePoolSyntheticAncestryGr.Rd +++ b/man/validateComputePoolSyntheticAncestryGr.Rd @@ -28,11 +28,12 @@ validateComputePoolSyntheticAncestryGr( opened Profile GDS file.} \item{sampleRM}{a \code{vector} of \code{character} strings representing -the identifiers of the 1KG reference samples that should not be used to -create the reference PCA.} +the identifiers of the population reference samples that should not +be used to create the reference PCA.} \item{spRef}{a \code{vector} of \code{character} strings representing the -known super population ancestry for the 1KG profiles. The 1KG profile +known super population ancestry for the population reference profiles. +The population reference profile identifiers are used as names for the \code{vector}.} \item{studyIDSyn}{a \code{character} string corresponding to the study @@ -50,11 +51,11 @@ the column that will contain the inferred ancestry for the specified dataset.} \item{kList}{a \code{vector} of \code{integer} representing the list of -values tested for the _K_ parameter. The _K_ parameter represents the +values tested for the \emph{K} parameter. The \emph{K} parameter represents the number of neighbors used in the K-nearest neighbor analysis.} \item{pcaList}{a \code{vector} of \code{integer} representing the list of -values tested for the _D_ parameter. The _D_ parameter represents the +values tested for the \emph{D} parameter. The \emph{D} parameter represents the number of dimensions used in the PCA analysis.} \item{algorithm}{a \code{character} string representing the algorithm used @@ -90,7 +91,7 @@ dataDir <- system.file("extdata", package="RAIDS") gdsSample <- openfn.gds(file.path(dataDir, "GDS_Sample_with_study_demo.gds"), readonly=TRUE) -## The known super population ancestry for the 1KG profiles +## The known super population ancestry for the population reference profiles spRef <- c("EUR", "SAS", "EAS", "EUR", "AFR") names(spRef) <- c("HG00100", "HG00101", "HG00102", "HG00103", "HG00104") diff --git a/man/validateComputeSyntheticRoc.Rd b/man/validateComputeSyntheticRoc.Rd index 2a0ca55b3..a997cb7db 100644 --- a/man/validateComputeSyntheticRoc.Rd +++ b/man/validateComputeSyntheticRoc.Rd @@ -15,7 +15,7 @@ validateComputeSyntheticRoc( } \arguments{ \item{matKNN}{a \code{data.frame} containing the inferred ancestry results -for fixed values of _D_ and _K_. On of the column names of the +for fixed values of \emph{D} and \emph{K}. On of the column names of the \code{data.frame} must correspond to the \code{matKNNAncestryColumn} argument.} @@ -43,26 +43,27 @@ the list of all possible ancestry assignations.} } \description{ This function validates the input parameters for the -\code{\link{computeSyntheticROC}} function. +\code{\link[=computeSyntheticROC]{computeSyntheticROC()}} function. } \examples{ -## Directory where demo GDS files are located -dataDir <- system.file("extdata/demoAncestryCall", package="RAIDS") +## Loading demo dataset containing pedigree information for synthetic +## profiles and known ancestry of the profiles used to generate the +## synthetic profiles +data(pedSynthetic) + +## Loading demo dataset containing the inferred ancestry results +## for the synthetic data +data(matKNNSynthetic) ## The inferred ancestry results for the synthetic data using ## values of D=6 and K=5 -matKNN <- readRDS(file.path(dataDir, "matKNN.RDS")) -matKNN <- matKNN[matKNN$K == 6 & matKNN$D == 5, ] - -## The known ancestry from the reference profiles used to generate the -## synthetic profiles -syntheticData <- readRDS(file.path(dataDir, "pedSyn.RDS")) +matKNN <- matKNNSynthetic[matKNNSynthetic$K == 6 & matKNNSynthetic$D == 5, ] ## The validation should be successful RAIDS:::validateComputeSyntheticRoc(matKNN=matKNN, matKNNAncestryColumn="SuperPop", - pedCall=syntheticData, pedCallAncestryColumn="superPop", + pedCall=pedSynthetic, pedCallAncestryColumn="superPop", listCall=c("EAS", "EUR", "AFR", "AMR", "SAS")) diff --git a/man/validateCreateAccuracyGraph.Rd b/man/validateCreateAccuracyGraph.Rd new file mode 100644 index 000000000..81ddcf818 --- /dev/null +++ b/man/validateCreateAccuracyGraph.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/visualization_internal.R +\encoding{UTF-8} +\name{validateCreateAccuracyGraph} +\alias{validateCreateAccuracyGraph} +\title{Validate input parameters for createAccuracyGraph +function} +\usage{ +validateCreateAccuracyGraph(fileRDS, title, selectD, selectColor) +} +\arguments{ +\item{fileRDS}{a \code{character} string representing the path and file +name of the RDS file containing the ancestry information as generated by +RAIDS.} + +\item{title}{a \code{character} string representing the title of the graph.} + +\item{selectD}{a \code{array} of \code{integer} representing the selected +PCA dimensions to plot. The length of the \code{array} cannot be more than +5 entries. The dimensions must tested by RAIDS (i.e. be present in the +RDS file).} + +\item{selectColor}{a \code{array} of \code{character} strings representing +the selected colors for the associated PCA dimensions to plot. The length +of the \code{array} must correspond to the length of the \code{selectD} +parameter. In addition, the length of the \code{array} cannot be more than +5 entries.} +} +\value{ +The function returns \code{0L} when successful. +} +\description{ +This function validates the parameters for the +\code{\link{createAccuracyGraph}} function. +} +\examples{ + +## Path to RDS file with ancestry information generated by RAIDS (demo file) +dataDir <- system.file("extdata", package="RAIDS") +fileRDS <- file.path(dataDir, "TEST_01.infoCall.RDS") + +## Validate parameters +RAIDS:::validateCreateAccuracyGraph(fileRDS=fileRDS, title="Accuracy Graph", + selectD=c(5, 10), selectColor=c("blue","darkblue")) + +} +\author{ +Astrid Deschênes and Pascal Belleau +} +\keyword{internal} diff --git a/man/validateCreateStudy2GDS1KG.Rd b/man/validateCreateStudy2GDS1KG.Rd index ed66c2d05..cd4caa18b 100644 --- a/man/validateCreateStudy2GDS1KG.Rd +++ b/man/validateCreateStudy2GDS1KG.Rd @@ -33,7 +33,7 @@ must contain the information for all the samples passed in the can be defined.} \item{fileNameGDS}{a \code{character} string representing the file name of -the 1KG GDS file. The file must exist.} +the Population Reference GDS file. The file must exist.} \item{batch}{a single positive \code{integer} representing the current identifier for the batch. Beware, this field is not stored anymore.} @@ -67,7 +67,8 @@ This function validates the input parameters for the ## Path to the demo pedigree file is located in this package dataDir <- system.file("extdata", package="RAIDS") -gds1KG <- file.path(dataDir, "gds1KG.gds") +## Demo 1KG Population Reference GDS file +gds1KG <- file.path(dataDir, "PopulationReferenceDemo.gds") ## The data.frame containing the information about the study ## The 3 mandatory columns: "study.id", "study.desc", "study.platform" diff --git a/man/validateEstimateAllelicFraction.Rd b/man/validateEstimateAllelicFraction.Rd index 535de6b5a..3c5eb7a51 100644 --- a/man/validateEstimateAllelicFraction.Rd +++ b/man/validateEstimateAllelicFraction.Rd @@ -26,7 +26,7 @@ validateEstimateAllelicFraction( } \arguments{ \item{gdsReference}{an object of class \code{\link[gdsfmt]{gds.class}} -(a GDS file), the 1KG GDS file.} +(a GDS file), the Population Reference GDS file.} \item{gdsProfile}{an object of class \code{\link[gdsfmt]{gds.class}} (a GDS file), the Profile GDS file.} @@ -48,7 +48,8 @@ way the estimation of the allelic fraction is done. Default: \code{"DNA"}.} \item{minCov}{a single positive \code{integer} representing the minimum required coverage.} -\item{minProb}{a single \code{numeric} between 0 and 1 representing TODO.} +\item{minProb}{a single positive \code{numeric} between 0 and 1 that +represents the probability that the genotype is correct.} \item{eProb}{a single \code{numeric} between 0 and 1 representing the probability of sequencing error.} @@ -66,7 +67,7 @@ the window used to compute an empty box.} log score, that the SNVs in a gene are allelic fraction different from 0.5.} \item{gdsRefAnnot}{an object of class \code{\link[gdsfmt]{gds.class}} -(a GDS file), the1 1KG SNV Annotation GDS file. +(a GDS file), the1 Population Reference SNV Annotation GDS file. This parameter is RNA specific.} \item{blockID}{a \code{character} string corresponding to the block @@ -84,37 +85,42 @@ This function validates the input parameters for the } \examples{ + +## Required library +library(gdsfmt) + ## Directory where demo GDS files are located dataDir <- system.file("extdata", package="RAIDS") -## The 1KG GDS file (opened) -gds1KG <- openfn.gds(file.path(dataDir, "gds1KG.gds"), readonly=TRUE) +## The 1KG Population Reference GDS Demo file (opened) +gds1KG <- openfn.gds(file.path(dataDir, + "PopulationReferenceDemo.gds"), readonly=TRUE) ## The GDS Sample (opened) gdsSample <- openfn.gds(file.path(dataDir, "GDS_Sample_with_study_demo.gds"), readonly=TRUE) -## Get chromosome length information -## Information from BSgenome.Hsapiens.UCSC.hg38 package version 1.4.4 -## Order by chromosomes 1 to 25 -## chr23 is chrX, chr24 is chrY and chrM is 25 -chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L, - 181538259L, 170805979L, 159345973L, 145138636L, 138394717L, 133797422L, - 135086622L, 133275309L, 114364328L, 107043718L, 101991189L, 90338345L, - 83257441L, 80373285L, 58617616L, 64444167L, 46709983L, 50818468L, - 156040895L, 57227415L, 16569L) - -## The validation should be successful -RAIDS:::validateEstimateAllelicFraction(gdsReference=gds1KG, - gdsProfile=gdsSample, - currentProfile="Sample01", studyID="Synthetic", chrInfo=chrInfo, - studyType="DNA", minCov=10L, minProb=0.03, eProb=0.002, cutOffLOH=10, - cutOffHomoScore=11, wAR=2, cutOffAR=10, gdsRefAnnot=gds1KG, - blockID="1", verbose=FALSE) - -## All GDS file must be closed -closefn.gds(gdsfile=gds1KG) -closefn.gds(gdsfile=gdsSample) +## Required library for this example to run correctly +if (requireNamespace("GenomeInfoDb", quietly=TRUE) && + requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { + + ## Chromosome length information + ## chr23 is chrX, chr24 is chrY and chrM is 25 + chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] + + ## The validation should be successful + RAIDS:::validateEstimateAllelicFraction(gdsReference=gds1KG, + gdsProfile=gdsSample, + currentProfile="Sample01", studyID="Synthetic", chrInfo=chrInfo, + studyType="DNA", minCov=10L, minProb=0.03, eProb=0.002, cutOffLOH=10, + cutOffHomoScore=11, wAR=2, cutOffAR=10, gdsRefAnnot=gds1KG, + blockID="1", verbose=FALSE) + + ## All GDS file must be closed + closefn.gds(gdsfile=gds1KG) + closefn.gds(gdsfile=gdsSample) + +} } \author{ diff --git a/man/validateGDSClass.Rd b/man/validateGDSClass.Rd index efbc4173d..9dfb95cf6 100644 --- a/man/validateGDSClass.Rd +++ b/man/validateGDSClass.Rd @@ -27,7 +27,8 @@ the \link[gdsfmt]{gds.class} class. dataDir <- system.file("extdata", package="RAIDS") ## The 1KG GDS file (opened) -gds1KG <- openfn.gds(file.path(dataDir, "gds1KG.gds"), readonly=TRUE) +gds1KG <- openfn.gds(file.path(dataDir, + "PopulationReferenceDemo.gds"), readonly=TRUE) ## The validation should be successful RAIDS:::validateGDSClass(gds=gds1KG, name="gds") diff --git a/man/validateGenerateGDS1KG.Rd b/man/validateGenerateGDS1KG.Rd index c0c25e29f..8311ed760 100644 --- a/man/validateGenerateGDS1KG.Rd +++ b/man/validateGenerateGDS1KG.Rd @@ -59,7 +59,7 @@ This function validates the input parameters for the dataDir <- system.file("extdata", package="RAIDS") ## The RDS file containing the pedigree information -pedigreeFile <- file.path(dataDir, "PedigreeDemo.rds") +pedigreeFile <- file.path(dataDir, "PedigreeReferenceDemo.rds") ## The RDS file containing the indexes of the retained SNPs snpIndexFile <- file.path(dataDir, "listSNPIndexes_Demo.rds") diff --git a/man/validatePepSynthetic.Rd b/man/validatePepSynthetic.Rd index 7549cce7d..9a98d95c5 100644 --- a/man/validatePepSynthetic.Rd +++ b/man/validatePepSynthetic.Rd @@ -51,7 +51,7 @@ to show how the different steps in the function.} } \description{ This function validates the input parameters for the -\code{\link{prepSynthetic}} function. +\code{\link[=prepSynthetic]{prepSynthetic()}} function. } \examples{ diff --git a/man/validatePrepPed1KG.Rd b/man/validatePrepPed1KG.Rd index e07dd8e27..142ffd7e8 100644 --- a/man/validatePrepPed1KG.Rd +++ b/man/validatePrepPed1KG.Rd @@ -14,14 +14,14 @@ related to the profiles present in the 1KG GDS file. The PED file must exist.} \item{pathGeno}{a \code{character} string representing the path where -the 1KG genotyping files for each profile are located. Only the profiles -with associated genotyping files are retained in the creation of the final -\code{data.frame}. The name of the genotyping files must correspond to -the individual identification (Individual.ID) in the pedigree file -(PED file).} +the Reference genotyping files for each profile are located. Only the +profiles with associated genotyping files are retained in the creation of +the final \code{data.frame}. The name of the genotyping files must +correspond to the individual identification (Individual.ID) in the +pedigree file (PED file).} \item{batch}{a\code{integer} that uniquely identifies the source of the -pedigree information. The 1KG is usually \code{0L}.} +pedigree information. The Reference is usually \code{0L}.} } \value{ The function returns \code{0L} when successful. diff --git a/man/validateProfileGDSExist.Rd b/man/validateProfileGDSExist.Rd index 9adcb1d72..e507e518b 100644 --- a/man/validateProfileGDSExist.Rd +++ b/man/validateProfileGDSExist.Rd @@ -11,7 +11,7 @@ validateProfileGDSExist(pathProfile, profile) \item{pathProfile}{a \code{character} string representing the directory where the Profile GDS files will be created. The directory must exist.} -\item{currentProfile}{a \code{character} string +\item{profile}{a \code{character} string corresponding to the profile identifier. A Profile GDS file corresponding to the profile identifier must exist and be located in the \code{pathProfile} directory.} diff --git a/man/validatePruningSample.Rd b/man/validatePruningSample.Rd index 65f9c042e..2aee1b394 100644 --- a/man/validatePruningSample.Rd +++ b/man/validatePruningSample.Rd @@ -25,8 +25,8 @@ validatePruningSample( ) } \arguments{ -\item{gdsReference}{an object of class \link[gdsfmt]{gds.class} (a GDS file), the -1 KG GDS file.} +\item{gdsReference}{an object of class \link[gdsfmt]{gds.class} +(a GDS file), the Population Reference GDS file.} \item{method}{a \code{character} string that represents the method that will be used to calculate the linkage disequilibrium in the @@ -95,11 +95,15 @@ This function validates the input parameters for the } \examples{ +## Required library +library(gdsfmt) + ## Directory where demo GDS files are located dataDir <- system.file("extdata", package="RAIDS") ## The 1KG GDS file (opened) -gds1KG <- openfn.gds(file.path(dataDir, "gds1KG.gds"), readonly=TRUE) +gds1KG <- openfn.gds(file.path(dataDir, + "PopulationReferenceDemo.gds"), readonly=TRUE) ## The validation should be successful RAIDS:::validatePruningSample(gdsReference=gds1KG, method="corr", diff --git a/man/validateRunExomeAncestry.Rd b/man/validateRunExomeOrRNAAncestry.Rd similarity index 67% rename from man/validateRunExomeAncestry.Rd rename to man/validateRunExomeOrRNAAncestry.Rd index fc07eb073..52029c1bc 100644 --- a/man/validateRunExomeAncestry.Rd +++ b/man/validateRunExomeOrRNAAncestry.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/processStudy_internal.R \encoding{UTF-8} -\name{validateRunExomeAncestry} -\alias{validateRunExomeAncestry} +\name{validateRunExomeOrRNAAncestry} +\alias{validateRunExomeOrRNAAncestry} \title{Validate the parameters of the runExomeAncestry() function} \usage{ -validateRunExomeAncestry( +validateRunExomeOrRNAAncestry( pedStudy, studyDF, pathProfileGDS, @@ -47,21 +47,22 @@ exist.} the directory where the output files are created.} \item{fileReferenceGDS}{a \code{character} string representing the file -name of the 1KG GDS file. The file must exist.} +name of the Population Reference GDS file. The file must exist.} \item{fileReferenceAnnotGDS}{a \code{character} string representing the -file name of the 1KG GDS annotation file. The file must exist.} +file name of the Population Reference GDS annotation file. +The file must exist.} \item{chrInfo}{a \code{vector} of positive \code{integer} values representing the length of the chromosomes. See 'details' section.} \item{syntheticRefDF}{a \code{data.frame} containing those columns: -\itemize{ -\item{sample.id} { a \code{character} string representing the sample +\describe{ +\item{sample.id}{ a \code{character} string representing the sample identifier. } -\item{pop.group} { a \code{character} string representing the +\item{pop.group}{ a \code{character} string representing the subcontinental population assigned to the sample. } -\item{superPop} { a \code{character} string representing the +\item{superPop}{ a \code{character} string representing the super-population assigned to the sample. } }} @@ -90,9 +91,10 @@ study <- data.frame(study.id = "MYDATA", study.platform = "PLATFORM", stringsAsFactors = FALSE) -gds1KG <- file.path(dataDir, "gds1KG.gds") +## Population Reference GDS demo file +gdsRef <- file.path(dataDir, "PopulationReferenceDemo.gds") -gdsAnnot1KG <- file.path(dataDir, "gdsAnnot1KG.gds") +gdsAnnotRef <- file.path(dataDir, "PopulationReferenceSNVAnnotationDemo.gds") ## Pedigree Study data frame ped <- data.frame(Name.ID=c("Sample_01", "Sample_02"), @@ -100,25 +102,27 @@ ped <- data.frame(Name.ID=c("Sample_01", "Sample_02"), Sample.Type=c("DNA", "DNA"), Diagnosis=c("Cancer", "Cancer"), Source=c("TCGA", "TCGA")) -## Chromosome length information -## chr23 is chrX, chr24 is chrY and chrM is 25 -chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L, - 181538259L, 170805979L, 159345973L, 145138636L, 138394717L, 133797422L, - 135086622L, 133275309L, 114364328L, 107043718L, 101991189L, 90338345L, - 83257441L, 80373285L, 58617616L, 64444167L, 46709983L, 50818468L, - 156040895L, 57227415L, 16569L) - -## Profiles used for synthetic data set -syntheticRefDF <- data.frame(sample.id=c("HG00150", "HG00138", "HG00330", - "HG00275"), pop.group=c("GBR", "GBR","FIN", "FIN"), - superPop=c("EUR", "EUR", "EUR", "EUR"), stringsAsFactors=FALSE) - -## Returns OL when all parameters are valid -RAIDS:::validateRunExomeAncestry(pedStudy=ped, studyDF=study, - pathProfileGDS=dataDir, pathGeno=dataDir, pathOut=pathOut, - fileReferenceGDS=gds1KG, fileReferenceAnnotGDS=gdsAnnot1KG, - chrInfo=chrInfo, syntheticRefDF=syntheticRefDF, genoSource="snp-pileup", - verbose=FALSE) +## Required library for this example to run correctly +if (requireNamespace("GenomeInfoDb", quietly=TRUE) && + requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { + + ## Chromosome length information + ## chr23 is chrX, chr24 is chrY and chrM is 25 + chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] + + ## Profiles used for synthetic data set + syntheticRefDF <- data.frame(sample.id=c("HG00150", "HG00138", "HG00330", + "HG00275"), pop.group=c("GBR", "GBR","FIN", "FIN"), + superPop=c("EUR", "EUR", "EUR", "EUR"), stringsAsFactors=FALSE) + + ## Returns OL when all parameters are valid + RAIDS:::validateRunExomeOrRNAAncestry(pedStudy=ped, studyDF=study, + pathProfileGDS=dataDir, pathGeno=dataDir, pathOut=pathOut, + fileReferenceGDS=gdsRef, fileReferenceAnnotGDS=gdsAnnotRef, + chrInfo=chrInfo, syntheticRefDF=syntheticRefDF, + genoSource="snp-pileup", verbose=FALSE) + +} } \author{ diff --git a/man/validateSyntheticGeno.Rd b/man/validateSyntheticGeno.Rd index d7cb1163c..f59009200 100644 --- a/man/validateSyntheticGeno.Rd +++ b/man/validateSyntheticGeno.Rd @@ -55,7 +55,7 @@ The integer \code{0L} when the function is successful. } \description{ This function validates the input parameters for the -\code{\link{syntheticGeno}} function. +\code{\link[=syntheticGeno]{syntheticGeno()}} function. } \examples{ @@ -63,24 +63,25 @@ This function validates the input parameters for the dataDir <- system.file("extdata", package="RAIDS") ## The 1KG GDS file (opened) -gds1KG <- openfn.gds(file.path(dataDir, "gds1KG.gds"), readonly=TRUE) +gdsRef <- openfn.gds(file.path(dataDir, + "PopulationReferenceDemo.gds"), readonly=TRUE) ## The 1KG GDS Annotation file (opened) -gds1KGAnnot <- openfn.gds(file.path(dataDir, "gdsAnnot1KG.gds"), - readonly=TRUE) +gdsRefAnnot <- openfn.gds(file.path(dataDir, + "PopulationReferenceSNVAnnotationDemo.gds"), readonly=TRUE) ## The GDS Sample file gdsSample <- file.path(dataDir, "GDS_Sample_with_study_demo.gds") ## The validation should be successful -RAIDS:::validateSyntheticGeno(gdsReference=gds1KG, gdsRefAnnot=gds1KGAnnot, +RAIDS:::validateSyntheticGeno(gdsReference=gdsRef, gdsRefAnnot=gdsRefAnnot, fileProfileGDS=gdsSample, profileID="A101TCGA", listSampleRef="A101TCGA", nbSim=1L, prefix="TCGA", pRecomb=0.02, minProb=0.999, seqError=0.002) ## All GDS file must be closed -closefn.gds(gdsfile=gds1KG) -closefn.gds(gdsfile=gds1KGAnnot) +closefn.gds(gdsfile=gdsRef) +closefn.gds(gdsfile=gdsRefAnnot) } \author{ diff --git a/man/validatecreateAUROCGraph.Rd b/man/validatecreateAUROCGraph.Rd new file mode 100644 index 000000000..e8c4f0f1d --- /dev/null +++ b/man/validatecreateAUROCGraph.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/visualization_internal.R +\encoding{UTF-8} +\name{validatecreateAUROCGraph} +\alias{validatecreateAUROCGraph} +\title{Validate input parameters for createAccuracyGraph +function} +\usage{ +validatecreateAUROCGraph(dfAUROC, title, selectD, selectColor) +} +\arguments{ +\item{dfAUROC}{a \code{data.frame} corresponding to res$paraSample$dfAUROC +where res is the result of inferAncestry() or inferAncestryGeneAware() +functions.} + +\item{title}{a \code{character} string representing the title of the graph.} + +\item{selectD}{a \code{array} of \code{integer} representing the selected +PCA dimensions to plot. The length of the \code{array} cannot be more than +5 entries. The dimensions must tested by RAIDS (i.e. be present in the +RDS file).} + +\item{selectColor}{a \code{array} of \code{character} strings representing +the selected colors for the associated PCA dimensions to plot. The length +of the \code{array} must correspond to the length of the \code{selectD} +parameter. In addition, the length of the \code{array} cannot be more than +5 entries.} +} +\value{ +The function returns \code{0L} when successful. +} +\description{ +This function validates the parameters for the +\code{\link{createAccuracyGraph}} function. +} +\examples{ + +## Path to RDS file with ancestry information generated by RAIDS (demo file) +dataDir <- system.file("extdata", package="RAIDS") +fileRDS <- file.path(dataDir, "TEST_01.infoCall.RDS") +info <- readRDS(fileRDS) +dfAUROC <- info$paraSample$dfAUROC + +## Some of the column names must be updated to fit new standards +colnames(dfAUROC) <- c("D", "K", "Call", "L", "AUROC", "H") + +## Validate parameters +RAIDS:::validatecreateAUROCGraph(dfAUROC=dfAUROC, title="Accuracy Graph", + selectD=c(6, 12), selectColor=c("blue","darkblue")) + +} +\author{ +Astrid Deschênes and Pascal Belleau +} +\keyword{internal} diff --git a/man/validatecreateProfile.Rd b/man/validatecreateProfile.Rd new file mode 100644 index 000000000..81142bd9e --- /dev/null +++ b/man/validatecreateProfile.Rd @@ -0,0 +1,90 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/processStudy_internal.R +\encoding{UTF-8} +\name{validatecreateProfile} +\alias{validatecreateProfile} +\title{Validate input parameters for createProfile() function} +\usage{ +validatecreateProfile( + pedStudy, + fileNameGDS, + batch, + studyDF, + listProfiles, + pathProfileGDS, + genoSource, + verbose +) +} +\arguments{ +\item{pedStudy}{a \code{data.frame} with those mandatory columns: "Name.ID", +"Case.ID", "Sample.Type", "Diagnosis", "Source". All columns must be in +\code{character} strings (no factor). The \code{data.frame} +must contain the information for all the samples passed in the +\code{listSamples} parameter. Only \code{filePedRDS} or \code{pedStudy} +can be defined.} + +\item{fileNameGDS}{a \code{character} string representing the file name of +the Population Reference GDS file. The file must exist.} + +\item{batch}{a single positive \code{integer} representing the current +identifier for the batch. Beware, this field is not stored anymore.} + +\item{studyDF}{a \code{data.frame} containing the information about the +study associated to the analysed sample(s). The \code{data.frame} must have +those 3 columns: "study.id", "study.desc", "study.platform". All columns +must be in \code{character} strings (no factor).} + +\item{listProfiles}{a \code{vector} of \code{character} string corresponding +to the profile identifiers that will have a GDS Sample file created. The +profile identifiers must be present in the "Name.ID" column of the RDS file +passed to the \code{filePedRDS} parameter. +If \code{NULL}, all profiles in the \code{filePedRDS} are selected.} + +\item{pathProfileGDS}{a \code{character} string representing the path to +the directory where the Profile GDS files will be created.} + +\item{verbose}{a \code{logical} indicating if message information should be +printed.} +} +\value{ +The function returns \code{0L} when successful. +} +\description{ +This function validates the input parameters for the +\code{\link{createStudy2GDS1KG}} function. +} +\examples{ + +## Path to the demo pedigree file is located in this package +dataDir <- system.file("extdata", package="RAIDS") + +## Demo 1KG Population Reference GDS file +gds1KG <- file.path(dataDir, "PopulationReferenceDemo.gds") + +## The data.frame containing the information about the study +## The 3 mandatory columns: "study.id", "study.desc", "study.platform" +## The entries should be strings, not factors (stringsAsFactors=FALSE) +studyInfo <- data.frame(study.id="Pancreatic.WES", + study.desc="Pancreatic study", + study.platform="WES", + stringsAsFactors=FALSE) + +## PED Study +ped <- data.frame(Name.ID=c("Sample_01", "Sample_02"), + Case.ID=c("TCGA-H01", "TCGA-H02"), + Sample.Type=c("DNA", "DNA"), + Diagnosis=c("Cancer", "Cancer"), Source=c("TCGA", "TCGA")) + +## The validation should be successful +RAIDS:::validateCreateStudy2GDS1KG(pathGeno=dataDir, pedStudy=ped, + fileNameGDS=gds1KG, batch=1, studyDF=studyInfo, + listProfiles=c("Sample_01", "Sample_02"), + pathProfileGDS=dataDir, + genoSource="snp-pileup", verbose=TRUE) + +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} +\keyword{internal} diff --git a/man/wrapperAncestry.Rd b/man/wrapperAncestry.Rd new file mode 100644 index 000000000..cda691b48 --- /dev/null +++ b/man/wrapperAncestry.Rd @@ -0,0 +1,330 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/processStudy_internal.R +\encoding{UTF-8} +\name{wrapperAncestry} +\alias{wrapperAncestry} +\title{Run most steps leading to the ancestry inference call +on a specific profile (RNA or DNA)} +\usage{ +wrapperAncestry( + pedStudy, + studyDF, + pathProfileGDS, + profileFile, + fileReferenceGDS, + fileReferenceAnnotGDS, + chrInfo, + syntheticRefDF, + genoSource = c("snp-pileup", "generic", "VCF", "bam"), + studyType = c("LD", "GeneAware"), + np = 1L, + blockTypeID = NULL, + paramAncestry = list(ScanBamParam = NULL, PileupParam = NULL, yieldSize = 1e+07), + verbose = FALSE +) +} +\arguments{ +\item{pedStudy}{a \code{data.frame} with those mandatory columns: "Name.ID", +"Case.ID", "Sample.Type", "Diagnosis", "Source". All columns must be in +\code{character} strings (no factor). The \code{data.frame} +must contain the information for all the samples passed in the +\code{listSamples} parameter. Only \code{filePedRDS} or \code{pedStudy} +can be defined.} + +\item{studyDF}{a \code{data.frame} containing the information about the +study associated to the analysed sample(s). The \code{data.frame} must have +those 3 columns: "study.id", "study.desc", "study.platform". All columns +must be in \code{character} strings (no factor).} + +\item{pathProfileGDS}{a \code{character} string representing the path to +the directory where the GDS Profile files will be created. +Default: \code{NULL}.} + +\item{fileReferenceGDS}{a \code{character} string representing the file +name of the Reference GDS file. The file must exist.} + +\item{fileReferenceAnnotGDS}{a \code{character} string representing the +file name of the Reference GDS Annotation file. The file must exist.} + +\item{chrInfo}{a \code{vector} of positive \code{integer} values +representing the length of the chromosomes. See 'details' section.} + +\item{syntheticRefDF}{a \code{data.frame} containing a subset of +reference profiles for each sub-population present in the Reference GDS +file. The \code{data.frame} must have those columns: +\describe{ +\item{sample.id}{ a \code{character} string representing the sample +identifier. } +\item{pop.group}{ a \code{character} string representing the +subcontinental population assigned to the sample. } +\item{superPop}{ a \code{character} string representing the +super-population assigned to the sample. } +}} + +\item{genoSource}{a \code{character} string with two possible values: +'snp-pileup', 'generic' or 'VCF', "bam". It specifies if the genotype files +are generated by snp-pileup (Facets) or are a generic format CSV file +with at least those columns: +'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. +The 'Count' is the depth at the specified position; +'FileR' is the depth of the reference allele and +'File1A' is the depth of the specific alternative allele. +Finally the file can be a VCF file with at least those genotype +fields: GT, AD, DP.} + +\item{studyType}{a \code{character} string representing the type of study. +The possible choices are: "DNA" and "RNA". The type of study affects the +way the estimation of the allelic fraction is done. Default: \code{"DNA"}.} + +\item{np}{a single positive \code{integer} specifying the number of +threads to be used. Default: \code{1L}.} + +\item{blockTypeID}{a \code{character} string corresponding to the block +type used to extract the block identifiers. The block type must be +present in the GDS Reference Annotation file.} + +\item{paramAncestry}{a \code{list} parameters ...} + +\item{verbose}{a \code{logical} indicating if messages should be printed +to show how the different steps in the function. Default: \code{FALSE}.} + +\item{pathGeno}{a \code{character} string representing the path to the +directory containing the VCF output of SNP-pileup for each sample. The +SNP-pileup files must be compressed (gz files) and have the name identifiers +of the samples. A sample with "Name.ID" identifier would have an +associated file called +if genoSource is "VCF", then "Name.ID.vcf.gz", +if genoSource is "generic", then "Name.ID.generic.txt.gz" +if genoSource is "snp-pileup", then "Name.ID.txt.gz".} +} +\value{ +a \code{list} containing 4 entries: +\describe{ +\item{\code{pcaSample}}{ a \code{list} containing the information related +to the eigenvectors. The \code{list} contains those 3 entries: +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current profile.} +\item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric} containing +the eigenvectors for the reference profiles.} +\item{\code{eigenvector}}{ a \code{matrix} of \code{numeric} containing the +eigenvectors for the current profile projected on the PCA from the +reference profiles.} +} +} +\item{\code{paraSample}}{ a \code{list} containing the results with +different \code{D} and \code{K} values that lead to optimal parameter +selection. The \code{list} contains those entries: +\describe{ +\item{\code{dfPCA}}{ a \code{data.frame} containing statistical results +on all combined synthetic results done with a fixed value of \code{D} (the +number of dimensions). The \code{data.frame} contains those columns: +\describe{ +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions).} +\item{\code{median}}{ a \code{numeric} representing the median of the +minimum AUROC obtained (within super populations) for all combination of +the fixed \code{D} value and all tested \code{K} values. } +\item{\code{mad}}{ a \code{numeric} representing the MAD of the minimum +AUROC obtained (within super populations) for all combination of the fixed +\code{D} value and all tested \code{K} values. } +\item{\code{upQuartile}}{ a \code{numeric} representing the upper quartile +of the minimum AUROC obtained (within super populations) for all +combination of the fixed \code{D} value and all tested \code{K} values. } +\item{\code{k}}{ a \code{numeric} representing the optimal \code{K} value +(the number of neighbors) for a fixed \code{D} value. } +} +} +\item{\code{dfPop}}{ a \code{data.frame} containing statistical results on +all combined synthetic results done with different values of \code{D} (the +number of dimensions) and \code{K} (the number of neighbors). +The \code{data.frame} contains those columns: +\describe{ +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions).} +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors).} +\item{\code{AUROC.min}}{ a \code{numeric} representing the minimum accuracy +obtained by grouping all the synthetic results by super-populations, for +the specified values of \code{D} and \code{K}.} +\item{\code{AUROC}}{ a \code{numeric} representing the accuracy obtained +by grouping all the synthetic results for the specified values of \code{D} +and \code{K}.} +\item{\code{Accu.CM}}{ a \code{numeric} representing the value of accuracy +of the confusion matrix obtained by grouping all the synthetic results for +the specified values of \code{D} and \code{K}.} +} +} +\item{\code{dfAUROC}}{ a \code{data.frame} the summary of the results by +super-population. The \code{data.frame} contains +those columns: +\describe{ +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions).} +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors).} +\item{\code{Call}}{ a \code{character} string representing the +super-population.} +\item{\code{L}}{ a \code{numeric} representing the lower value of the 95\% +confidence interval for the AUROC obtained for the fixed values of +super-population, \code{D} and \code{K}.} +\item{\code{AUROC}}{ a \code{numeric} representing the AUROC obtained for the +fixed values of super-population, \code{D} and \code{K}.} +\item{\code{H}}{ a \code{numeric} representing the higher value of the 95\% +confidence interval for the AUROC obtained for the fixed values of +super-population, \code{D} and \code{K}.} +} +} +\item{\code{D}}{ a \code{numeric} representing the optimal \code{D} value +(the number of dimensions) for the specific profile.} +\item{\code{K}}{ a \code{numeric} representing the optimal \code{K} value +(the number of neighbors) for the specific profile.} +\item{\code{listD}}{ a \code{numeric} representing the optimal \code{D} +values (the number of dimensions) for the specific profile. More than one +\code{D} is possible.} +} +} +\item{\code{KNNSample}}{ a \code{data.frame} containing the inferred ancestry +for different values of \code{K} and \code{D}. The \code{data.frame} +contains those columns: +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current profile.} +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions) used to infer the ancestry. } +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors) used to infer the ancestry. } +\item{\code{SuperPop}}{ a \code{character} string representing the inferred +ancestry for the specified \code{D} and \code{K} values.} +} +} +\item{\code{KNNSynthetic}}{ a \code{data.frame} containing the inferred ancestry +for each synthetic data for different values of \code{K} and \code{D}. +The \code{data.frame} +contains those columns: "sample.id", "D", "K", "infer.superPop", "ref.superPop" +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current synthetic data.} +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions) used to infer the ancestry. } +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors) used to infer the ancestry. } +\item{\code{infer.superPop}}{ a \code{character} string representing the inferred +ancestry for the specified \code{D} and \code{K} values.} +\item{\code{ref.superPop}}{ a \code{character} string representing the known +ancestry from the reference} +} +} +\item{\code{Ancestry}}{ a \code{data.frame} containing the inferred +ancestry for the current profile. The \code{data.frame} contains those +columns: +\describe{ +\item{\code{sample.id}}{ a \code{character} string representing the unique +identifier of the current profile.} +\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the +number of dimensions) used to infer the ancestry.} +\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the +number of neighbors) used to infer the ancestry.} +\item{\code{SuperPop}}{ a \code{character} string representing the inferred +ancestry.} +} +} +} +} +\description{ +This function runs most steps leading to the ancestry inference +call on a specific profile. First, the function creates the Profile GDS file +for the specific profile using the information from a RDS Sample +description file and the Population reference GDS file. +} +\examples{ + +## Required library for GDS +library(SNPRelate) + +## Path to the demo 1KG GDS file is located in this package +dataDir <- system.file("extdata", package="RAIDS") + +################################################################# +## Load the information about the profile +################################################################# +data(demoPedigreeEx1) +head(demoPedigreeEx1) + +################################################################# +## The 1KG GDS file and the 1KG SNV Annotation GDS file +## need to be located in the same directory +## Note that the 1KG GDS file used for this example is a +## simplified version and CANNOT be used for any real analysis +################################################################# +path1KG <- file.path(dataDir, "tests") + +fileReferenceGDS <- file.path(path1KG, "ex1_good_small_1KG.gds") +fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds") + +################################################################# +## The Sample SNP pileup files (one per sample) need +## to be located in the same directory. +################################################################# +pathGeno <- file.path(dataDir, "example", "snpPileup") + +################################################################# +## The path where the Profile GDS Files (one per sample) +## will be created need to be specified. +################################################################# +pathProfileGDS <- file.path(tempdir(), "out.tmp") + + +################################################################# +## A data frame containing general information about the study +## is also required. The data frame must have +## those 3 columns: "studyID", "study.desc", "study.platform" +################################################################# +studyDF <- data.frame(study.id="MYDATA", + study.desc="Description", + study.platform="PLATFORM", + stringsAsFactors=FALSE) + +#################################################################### +## Fix seed to ensure reproducible results +#################################################################### +set.seed(3043) + +gds1KG <- snpgdsOpen(fileReferenceGDS) +dataRef <- select1KGPop(gds1KG, nbProfiles=2L) +closefn.gds(gds1KG) + +## Required library for this example to run correctly +if (requireNamespace("GenomeInfoDb", quietly=TRUE) && + requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { + + ## Chromosome length information + ## chr23 is chrX, chr24 is chrY and chrM is 25 + chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] + + \dontrun{ + + res <- RAIDS:::wrapperAncestry(pedStudy=demoPedigreeEx1, studyDF=studyDF, + pathProfileGDS=pathProfileGDS, + pathGeno=pathGeno, + fileReferenceGDS=fileReferenceGDS, + fileReferenceAnnotGDS=fileAnnotGDS, + chrInfo=chrInfo, syntheticRefDF=dataRef, + studyType="LD", genoSource="snp-pileup") + + unlink(pathProfileGDS, recursive=TRUE, force=TRUE) + + } +} + +} +\references{ +Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ, +Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution +of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72. +doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25. +} +\author{ +Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +} +\keyword{internal} diff --git a/scriptsPy/extract1000gFreqAll.py b/scriptsPy/extract1000gFreqAll.py deleted file mode 100644 index e750bc616..000000000 --- a/scriptsPy/extract1000gFreqAll.py +++ /dev/null @@ -1,72 +0,0 @@ -import vcf -from datetime import date -import os -import sys -import numpy as np - -def main(): - fileVCF = sys.argv[1] - fileBASE = sys.argv[2] - freqCutOff = -1 - today = date.today() - displayLOG(fileBASE, today, fileVCF, freqCutOff) - extractFreq(fileBASE, fileVCF, freqCutOff, today) - -def displayLOG(fileBASE, today, fileVCF, freqCutOff): - try: - fileLOG = "log/" + fileBASE + "_" + str(today) + ".log" - FLOG = open(fileLOG, "w") - FLOG.write("fileVCF " + fileVCF + "\n") - FLOG.write("fileBASE " + fileBASE + "\n") - FLOG.write("FreqCutOff " + str(freqCutOff) + "\n") - FLOG.close() - except (OSError, IOError) as e: - sys.stderr.write( 'Problem with the file: '+ fileLOG + '\n') - sys.stderr.write(e + "\n") - sys.exit(1) - -def extractFreq(fileBASE, fileVCF, freqCutOff, today): - try: - fileName = fileBASE + "_" + str(today) + "_f_All" + ".txt" - - fileOUT = os.path.join("mat1000gFAll", fileName) - sep = ";" - vcf_reader = vcf.Reader(filename=fileVCF) - OUT = open(fileOUT, "w") - - for record in vcf_reader: - # At least the frequency in one super population - # higher or equal to freqCutOff - if (record.INFO['EAS_AF'][0] >= freqCutOff or - record.INFO['EUR_AF'][0] >= freqCutOff or - record.INFO['AFR_AF'][0] >= freqCutOff or - record.INFO['AMR_AF'][0] >= freqCutOff or - record.INFO['SAS_AF'][0] >= freqCutOff): - - OUT.write( record.CHROM +"\t" + str( record.start ) + "\t" + record.REF) - - flag = 0 - for g in record.ALT: - if flag == 1: - OUT.write(sep) - OUT.write("\t" + str(g)) - flag = 1 - OUT.write("\t" + str(record.INFO['AF'][0])) - OUT.write("\t" + str(record.INFO['EAS_AF'][0])) - OUT.write("\t" + str(record.INFO['EUR_AF'][0])) - OUT.write("\t" + str(record.INFO['AFR_AF'][0])) - OUT.write("\t" + str(record.INFO['AMR_AF'][0])) - OUT.write("\t" + str(record.INFO['SAS_AF'][0])) - OUT.write("\n") - - OUT.close - - - except (OSError, IOError) as e: - sys.stderr.write( 'Problem with the file: '+ fileBASE + '_' + fileVCF + '\n') - sys.stderr.write(e + "\n") - sys.exit(1) - -if __name__ == "__main__": - main() - diff --git a/tests/testthat/fixtures/1KG_Test.gds b/tests/testthat/fixtures/1KG_Test.gds index 5338fcaee..21c188e30 100644 Binary files a/tests/testthat/fixtures/1KG_Test.gds and b/tests/testthat/fixtures/1KG_Test.gds differ diff --git a/tests/testthat/fixtures/TEST_01.infoCall.RDS b/tests/testthat/fixtures/TEST_01.infoCall.RDS new file mode 100644 index 000000000..10e2d7170 Binary files /dev/null and b/tests/testthat/fixtures/TEST_01.infoCall.RDS differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr1/NA12003.chr1.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr1/NA12003.chr1.vcf.bz2 new file mode 100644 index 000000000..5a9e11381 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr1/NA12003.chr1.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr1/NA12004.chr1.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr1/NA12004.chr1.vcf.bz2 new file mode 100644 index 000000000..ee8f423d4 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr1/NA12004.chr1.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr1/NA12005.chr1.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr1/NA12005.chr1.vcf.bz2 new file mode 100644 index 000000000..09d7ec753 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr1/NA12005.chr1.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr1/NA12006.chr1.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr1/NA12006.chr1.vcf.bz2 new file mode 100644 index 000000000..d099218ab Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr1/NA12006.chr1.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr10/NA12003.chr10.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr10/NA12003.chr10.vcf.bz2 new file mode 100644 index 000000000..9559377f6 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr10/NA12003.chr10.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr10/NA12004.chr10.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr10/NA12004.chr10.vcf.bz2 new file mode 100644 index 000000000..9d57ad307 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr10/NA12004.chr10.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr10/NA12005.chr10.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr10/NA12005.chr10.vcf.bz2 new file mode 100644 index 000000000..6d41e6737 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr10/NA12005.chr10.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr10/NA12006.chr10.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr10/NA12006.chr10.vcf.bz2 new file mode 100644 index 000000000..499050dac Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr10/NA12006.chr10.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr11/NA12003.chr11.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr11/NA12003.chr11.vcf.bz2 new file mode 100644 index 000000000..9b3a4b93f Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr11/NA12003.chr11.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr11/NA12004.chr11.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr11/NA12004.chr11.vcf.bz2 new file mode 100644 index 000000000..5a88d8095 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr11/NA12004.chr11.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr11/NA12005.chr11.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr11/NA12005.chr11.vcf.bz2 new file mode 100644 index 000000000..6055efe7d Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr11/NA12005.chr11.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr11/NA12006.chr11.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr11/NA12006.chr11.vcf.bz2 new file mode 100644 index 000000000..e1364900b Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr11/NA12006.chr11.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr12/NA12003.chr12.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr12/NA12003.chr12.vcf.bz2 new file mode 100644 index 000000000..57cf24de4 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr12/NA12003.chr12.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr12/NA12004.chr12.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr12/NA12004.chr12.vcf.bz2 new file mode 100644 index 000000000..6fdeed51d Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr12/NA12004.chr12.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr12/NA12005.chr12.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr12/NA12005.chr12.vcf.bz2 new file mode 100644 index 000000000..418994f9c Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr12/NA12005.chr12.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr12/NA12006.chr12.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr12/NA12006.chr12.vcf.bz2 new file mode 100644 index 000000000..978907920 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr12/NA12006.chr12.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr13/NA12003.chr13.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr13/NA12003.chr13.vcf.bz2 new file mode 100644 index 000000000..f0ff41dd3 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr13/NA12003.chr13.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr13/NA12004.chr13.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr13/NA12004.chr13.vcf.bz2 new file mode 100644 index 000000000..0d5fc433f Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr13/NA12004.chr13.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr13/NA12005.chr13.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr13/NA12005.chr13.vcf.bz2 new file mode 100644 index 000000000..474e86cb5 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr13/NA12005.chr13.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr13/NA12006.chr13.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr13/NA12006.chr13.vcf.bz2 new file mode 100644 index 000000000..fefade0d7 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr13/NA12006.chr13.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr14/NA12003.chr14.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr14/NA12003.chr14.vcf.bz2 new file mode 100644 index 000000000..a2b975c4d Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr14/NA12003.chr14.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr14/NA12004.chr14.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr14/NA12004.chr14.vcf.bz2 new file mode 100644 index 000000000..c19eee196 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr14/NA12004.chr14.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr14/NA12005.chr14.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr14/NA12005.chr14.vcf.bz2 new file mode 100644 index 000000000..509cb2d28 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr14/NA12005.chr14.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr14/NA12006.chr14.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr14/NA12006.chr14.vcf.bz2 new file mode 100644 index 000000000..cd1a035af Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr14/NA12006.chr14.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr15/NA12003.chr15.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr15/NA12003.chr15.vcf.bz2 new file mode 100644 index 000000000..b08b96566 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr15/NA12003.chr15.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr15/NA12004.chr15.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr15/NA12004.chr15.vcf.bz2 new file mode 100644 index 000000000..8a68edc98 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr15/NA12004.chr15.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr15/NA12005.chr15.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr15/NA12005.chr15.vcf.bz2 new file mode 100644 index 000000000..f48cd2547 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr15/NA12005.chr15.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr15/NA12006.chr15.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr15/NA12006.chr15.vcf.bz2 new file mode 100644 index 000000000..dcc06d2d4 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr15/NA12006.chr15.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr16/NA12003.chr16.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr16/NA12003.chr16.vcf.bz2 new file mode 100644 index 000000000..aca1b7a87 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr16/NA12003.chr16.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr16/NA12004.chr16.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr16/NA12004.chr16.vcf.bz2 new file mode 100644 index 000000000..85b694a21 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr16/NA12004.chr16.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr16/NA12005.chr16.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr16/NA12005.chr16.vcf.bz2 new file mode 100644 index 000000000..9a9435747 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr16/NA12005.chr16.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr16/NA12006.chr16.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr16/NA12006.chr16.vcf.bz2 new file mode 100644 index 000000000..fe8357bf0 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr16/NA12006.chr16.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr17/NA12003.chr17.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr17/NA12003.chr17.vcf.bz2 new file mode 100644 index 000000000..549158743 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr17/NA12003.chr17.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr17/NA12004.chr17.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr17/NA12004.chr17.vcf.bz2 new file mode 100644 index 000000000..7040e9d70 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr17/NA12004.chr17.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr17/NA12005.chr17.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr17/NA12005.chr17.vcf.bz2 new file mode 100644 index 000000000..acdf0d3a6 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr17/NA12005.chr17.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr17/NA12006.chr17.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr17/NA12006.chr17.vcf.bz2 new file mode 100644 index 000000000..936e42bb6 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr17/NA12006.chr17.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr18/NA12003.chr18.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr18/NA12003.chr18.vcf.bz2 new file mode 100644 index 000000000..7069651cc Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr18/NA12003.chr18.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr18/NA12004.chr18.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr18/NA12004.chr18.vcf.bz2 new file mode 100644 index 000000000..719c31374 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr18/NA12004.chr18.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr18/NA12005.chr18.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr18/NA12005.chr18.vcf.bz2 new file mode 100644 index 000000000..2928cf343 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr18/NA12005.chr18.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr18/NA12006.chr18.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr18/NA12006.chr18.vcf.bz2 new file mode 100644 index 000000000..d24b326db Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr18/NA12006.chr18.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr19/NA12003.chr19.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr19/NA12003.chr19.vcf.bz2 new file mode 100644 index 000000000..9733612b4 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr19/NA12003.chr19.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr19/NA12004.chr19.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr19/NA12004.chr19.vcf.bz2 new file mode 100644 index 000000000..b7c7bd7e2 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr19/NA12004.chr19.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr19/NA12005.chr19.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr19/NA12005.chr19.vcf.bz2 new file mode 100644 index 000000000..eaf2103ed Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr19/NA12005.chr19.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr19/NA12006.chr19.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr19/NA12006.chr19.vcf.bz2 new file mode 100644 index 000000000..0519557fb Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr19/NA12006.chr19.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr2/NA12003.chr2.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr2/NA12003.chr2.vcf.bz2 new file mode 100644 index 000000000..68b8d8cb5 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr2/NA12003.chr2.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr2/NA12004.chr2.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr2/NA12004.chr2.vcf.bz2 new file mode 100644 index 000000000..29710f3b9 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr2/NA12004.chr2.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr2/NA12005.chr2.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr2/NA12005.chr2.vcf.bz2 new file mode 100644 index 000000000..e7207c85e Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr2/NA12005.chr2.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr2/NA12006.chr2.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr2/NA12006.chr2.vcf.bz2 new file mode 100644 index 000000000..1e82c8200 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr2/NA12006.chr2.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr20/NA12003.chr20.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr20/NA12003.chr20.vcf.bz2 new file mode 100644 index 000000000..e46f83752 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr20/NA12003.chr20.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr20/NA12004.chr20.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr20/NA12004.chr20.vcf.bz2 new file mode 100644 index 000000000..901c941be Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr20/NA12004.chr20.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr20/NA12005.chr20.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr20/NA12005.chr20.vcf.bz2 new file mode 100644 index 000000000..0286e12e8 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr20/NA12005.chr20.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr20/NA12006.chr20.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr20/NA12006.chr20.vcf.bz2 new file mode 100644 index 000000000..0519557fb Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr20/NA12006.chr20.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr21/NA12003.chr21.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr21/NA12003.chr21.vcf.bz2 new file mode 100644 index 000000000..1b8e8a4b8 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr21/NA12003.chr21.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr21/NA12004.chr21.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr21/NA12004.chr21.vcf.bz2 new file mode 100644 index 000000000..d6b1128a0 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr21/NA12004.chr21.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr21/NA12005.chr21.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr21/NA12005.chr21.vcf.bz2 new file mode 100644 index 000000000..175df8b9f Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr21/NA12005.chr21.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr21/NA12006.chr21.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr21/NA12006.chr21.vcf.bz2 new file mode 100644 index 000000000..d000d6bf1 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr21/NA12006.chr21.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr22/NA12003.chr22.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr22/NA12003.chr22.vcf.bz2 new file mode 100644 index 000000000..df8ed4e01 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr22/NA12003.chr22.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr22/NA12004.chr22.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr22/NA12004.chr22.vcf.bz2 new file mode 100644 index 000000000..4a026f88b Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr22/NA12004.chr22.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr22/NA12005.chr22.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr22/NA12005.chr22.vcf.bz2 new file mode 100644 index 000000000..45979aa6b Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr22/NA12005.chr22.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr22/NA12006.chr22.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr22/NA12006.chr22.vcf.bz2 new file mode 100644 index 000000000..5e9a8cf72 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr22/NA12006.chr22.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr3/NA12003.chr3.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr3/NA12003.chr3.vcf.bz2 new file mode 100644 index 000000000..e1d4d9c32 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr3/NA12003.chr3.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr3/NA12004.chr3.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr3/NA12004.chr3.vcf.bz2 new file mode 100644 index 000000000..b14a26a07 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr3/NA12004.chr3.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr3/NA12005.chr3.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr3/NA12005.chr3.vcf.bz2 new file mode 100644 index 000000000..ae8558317 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr3/NA12005.chr3.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr3/NA12006.chr3.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr3/NA12006.chr3.vcf.bz2 new file mode 100644 index 000000000..8ca1d4feb Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr3/NA12006.chr3.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr4/NA12003.chr4.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr4/NA12003.chr4.vcf.bz2 new file mode 100644 index 000000000..c01c61b90 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr4/NA12003.chr4.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr4/NA12004.chr4.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr4/NA12004.chr4.vcf.bz2 new file mode 100644 index 000000000..f9fd1206d Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr4/NA12004.chr4.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr4/NA12005.chr4.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr4/NA12005.chr4.vcf.bz2 new file mode 100644 index 000000000..bfa295fde Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr4/NA12005.chr4.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr4/NA12006.chr4.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr4/NA12006.chr4.vcf.bz2 new file mode 100644 index 000000000..4f169969b Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr4/NA12006.chr4.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr5/NA12003.chr5.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr5/NA12003.chr5.vcf.bz2 new file mode 100644 index 000000000..e812d8fbd Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr5/NA12003.chr5.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr5/NA12004.chr5.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr5/NA12004.chr5.vcf.bz2 new file mode 100644 index 000000000..b978498b0 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr5/NA12004.chr5.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr5/NA12005.chr5.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr5/NA12005.chr5.vcf.bz2 new file mode 100644 index 000000000..f60e08d41 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr5/NA12005.chr5.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr5/NA12006.chr5.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr5/NA12006.chr5.vcf.bz2 new file mode 100644 index 000000000..f18447b4a Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr5/NA12006.chr5.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr6/NA12003.chr6.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr6/NA12003.chr6.vcf.bz2 new file mode 100644 index 000000000..f6c4bce28 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr6/NA12003.chr6.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr6/NA12004.chr6.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr6/NA12004.chr6.vcf.bz2 new file mode 100644 index 000000000..1c94aff64 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr6/NA12004.chr6.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr6/NA12005.chr6.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr6/NA12005.chr6.vcf.bz2 new file mode 100644 index 000000000..6ab58a256 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr6/NA12005.chr6.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr6/NA12006.chr6.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr6/NA12006.chr6.vcf.bz2 new file mode 100644 index 000000000..606e5cac5 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr6/NA12006.chr6.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr7/NA12003.chr7.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr7/NA12003.chr7.vcf.bz2 new file mode 100644 index 000000000..8d0eb4604 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr7/NA12003.chr7.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr7/NA12004.chr7.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr7/NA12004.chr7.vcf.bz2 new file mode 100644 index 000000000..de80fe24b Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr7/NA12004.chr7.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr7/NA12005.chr7.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr7/NA12005.chr7.vcf.bz2 new file mode 100644 index 000000000..6c2db5350 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr7/NA12005.chr7.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr7/NA12006.chr7.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr7/NA12006.chr7.vcf.bz2 new file mode 100644 index 000000000..dcc289f34 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr7/NA12006.chr7.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr8/NA12003.chr8.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr8/NA12003.chr8.vcf.bz2 new file mode 100644 index 000000000..1e1a1581b Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr8/NA12003.chr8.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr8/NA12004.chr8.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr8/NA12004.chr8.vcf.bz2 new file mode 100644 index 000000000..9fe74e90b Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr8/NA12004.chr8.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr8/NA12005.chr8.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr8/NA12005.chr8.vcf.bz2 new file mode 100644 index 000000000..2831fedcc Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr8/NA12005.chr8.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr8/NA12006.chr8.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr8/NA12006.chr8.vcf.bz2 new file mode 100644 index 000000000..c4d18da00 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr8/NA12006.chr8.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr9/NA12003.chr9.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr9/NA12003.chr9.vcf.bz2 new file mode 100644 index 000000000..9cf34b914 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr9/NA12003.chr9.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr9/NA12004.chr9.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr9/NA12004.chr9.vcf.bz2 new file mode 100644 index 000000000..f251524c5 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr9/NA12004.chr9.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr9/NA12005.chr9.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr9/NA12005.chr9.vcf.bz2 new file mode 100644 index 000000000..64b333410 Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr9/NA12005.chr9.vcf.bz2 differ diff --git a/tests/testthat/fixtures/demoGenoChr/chr9/NA12006.chr9.vcf.bz2 b/tests/testthat/fixtures/demoGenoChr/chr9/NA12006.chr9.vcf.bz2 new file mode 100644 index 000000000..47d31ea3c Binary files /dev/null and b/tests/testthat/fixtures/demoGenoChr/chr9/NA12006.chr9.vcf.bz2 differ diff --git a/tests/testthat/fixtures/ex1.generic.txt.gz b/tests/testthat/fixtures/ex1.generic.txt.gz new file mode 100644 index 000000000..48192f094 Binary files /dev/null and b/tests/testthat/fixtures/ex1.generic.txt.gz differ diff --git a/tests/testthat/fixtures/ex1_small.vcf.gz b/tests/testthat/fixtures/ex1_small.vcf.gz new file mode 100644 index 000000000..d8e97ee62 Binary files /dev/null and b/tests/testthat/fixtures/ex1_small.vcf.gz differ diff --git a/tests/testthat/fixtures/matFreqSNV_Demo.txt.bz2 b/tests/testthat/fixtures/matFreqSNV_Demo.txt.bz2 index c30dd4f8c..11ed7d002 100644 Binary files a/tests/testthat/fixtures/matFreqSNV_Demo.txt.bz2 and b/tests/testthat/fixtures/matFreqSNV_Demo.txt.bz2 differ diff --git a/tests/testthat/helper_initGDS.R b/tests/testthat/helper_initGDS.R index 7ada8b90e..cc77d58e6 100644 --- a/tests/testthat/helper_initGDS.R +++ b/tests/testthat/helper_initGDS.R @@ -8,7 +8,7 @@ library(gdsfmt) ## The file will be removed automatically local_GDS_Sample_file <- function(path, env = parent.frame()) { GDS_file_tmp <- createfn.gds(filename=path) - defer(unlink(x=path, force=TRUE), envir = env) + withr::defer(unlink(x=path, force=TRUE), envir = env) add.gdsn(GDS_file_tmp, "Ref.count", rep(10L, 12)) add.gdsn(GDS_file_tmp, "Alt.count", rep(12L, 12)) @@ -24,7 +24,7 @@ local_GDS_Sample_file <- function(path, env = parent.frame()) { ## The file will be removed automatically local_GDS_1KG_file <- function(path, env = parent.frame()) { GDS_file_tmp <- createfn.gds(filename=path) - defer(unlink(x=path, force=TRUE), envir = env) + withr::defer(unlink(x=path, force=TRUE), envir = env) ## Create sample information initial add.gdsn(GDS_file_tmp, "sample.id", c("HTT101", "HTT102", "HTT103")) diff --git a/tests/testthat/test-allelicFraction.R b/tests/testthat/test-allelicFraction.R index 77e9ec32a..6679efc6a 100644 --- a/tests/testthat/test-allelicFraction.R +++ b/tests/testthat/test-allelicFraction.R @@ -30,7 +30,7 @@ test_that("computeAlleleFraction() must return expected results when not imbalan lap=rep(-1, 8), LOH=rep(0, 8), imbAR=rep(-1, 8), stringAsFactor=FALSE) - result <- RAIDS:::computeAlleleFraction(snp.pos=snpInfo, w=10, cutOff=-3) + result <- RAIDS:::computeAlleleFraction(snpPos=snpInfo, w=10, cutOff=-3) expect_equal(result, NULL) }) @@ -55,7 +55,7 @@ test_that("computeAlleleFraction() must return expected results when imbalanced lap=rep(-1, 8), LOH=rep(0, 8), imbAR=c(1, 1, 1, 1, 1, 0, 1, 1), stringAsFactor=FALSE) - result <- RAIDS:::computeAlleleFraction(snp.pos=snpInfo, w=10, cutOff=-3) + result <- RAIDS:::computeAlleleFraction(snpPos=snpInfo, w=10, cutOff=-3) expect_equal(result, matrix(c(1, 5, 0, 7, 8, NA), byrow=TRUE, nrow=2)) }) @@ -82,7 +82,7 @@ test_that("computeAlleleFraction() must return expected results with small w par lap=rep(-1, 12), LOH=rep(0, 12), imbAR=rep(1, 12), stringAsFactor=FALSE) - result <- RAIDS:::computeAlleleFraction(snp.pos=snpInfo, w=4, cutOff=-3) + result <- RAIDS:::computeAlleleFraction(snpPos=snpInfo, w=4, cutOff=-3) expect_equal(result, matrix(c(1, 12, 0.148714810281518), byrow=TRUE, nrow=1)) }) @@ -494,15 +494,11 @@ test_that("estimateAllelicFraction() must return valid results", { readonly=FALSE) withr::defer((gdsfmt::closefn.gds(gdsProfile)), envir=parent.frame()) - chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L, - 181538259L, 170805979L, 159345973L, 145138636L, 138394717L, 133797422L, - 135086622L, 133275309L, 114364328L, 107043718L, 101991189L, 90338345L, - 83257441L, 80373285L, 58617616L, 64444167L, 46709983L, 50818468L, - 156040895L, 57227415L, 16569L) + chromosome <- c(95642L, 93529L, 95559L) result <- estimateAllelicFraction(gdsReference=gds1KG, gdsProfile=gdsProfile, currentProfile="ex1", studyID="MYDATA", - chrInfo=chrInfo, studyType="DNA", minCov=10L, minProb=0.999, + chrInfo=chromosome, studyType="DNA", minCov=10L, minProb=0.999, eProb=0.001, cutOffLOH=-5, cutOffHomoScore=-3, wAR=9, cutOffAR=3, gdsRefAnnot=NULL, blockID=NULL, verbose=FALSE) diff --git a/tests/testthat/test-allelicFraction_internal.R b/tests/testthat/test-allelicFraction_internal.R index 2b10b6009..17ea49e65 100644 --- a/tests/testthat/test-allelicFraction_internal.R +++ b/tests/testthat/test-allelicFraction_internal.R @@ -3,7 +3,8 @@ library(RAIDS) library(withr) library(gdsfmt) - +library(GenomeInfoDb) +library(BSgenome.Hsapiens.UCSC.hg38) ############################################################################# ### Tests testEmptyBox() results @@ -50,7 +51,7 @@ test_that("computeLOHBlocksDNAChr() must return expected results", { gds1KG <- snpgdsOpen(fileGDS) withr::defer((gdsfmt::closefn.gds(gds1KG)), envir = parent.frame()) - chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L, + chromosome <- c(248956422L, 242193529L, 198295559L, 190214555L, 181538259L, 170805979L, 159345973L, 145138636L, 138394717L, 133797422L, 135086622L, 133275309L, 114364328L, 107043718L, 101991189L, 90338345L, 83257441L, 80373285L, 58617616L, 64444167L, 46709983L, 50818468L, @@ -73,7 +74,7 @@ test_that("computeLOHBlocksDNAChr() must return expected results", { stringAsFactor=FALSE) result <- RAIDS:::computeLOHBlocksDNAChr(gdsReference=gds1KG, - chrInfo=chrInfo, snp.pos=snpInfo, chr=1L, genoN=0.0001) + chrInfo=chromosome, snpPos=snpInfo, chr=1L, genoN=0.0001) expected <- data.frame(chr=rep(1, 12), start=c(1, snpInfo$snp.pos[seq_len(8)]+1, 6313146, @@ -91,3 +92,72 @@ test_that("computeLOHBlocksDNAChr() must return expected results", { expect_equal(result, expected) }) + + +############################################################################# +### Tests computeAllelicFractionRNA() results +############################################################################# + +context("computeAllelicFractionRNA() results") + + +test_that("computeAllelicFractionRNA() must return expected results", { + + dataDir <- testthat::test_path("fixtures") + fileGDS <- file.path(dataDir, "ex1_good_small_1KG_GDS.gds") + fileAnnotGDS <- file.path(dataDir, "ex1_good_small_1KG_Annot_GDS.gds") + + ## Open the reference GDS file + gds1KG <- snpgdsOpen(fileGDS) + withr::defer((gdsfmt::closefn.gds(gds1KG)), envir = parent.frame()) + + ## Open the reference GDS file + gds1_Annot_KG <- openfn.gds(fileAnnotGDS) + withr::defer((gdsfmt::closefn.gds(gds1_Annot_KG)), envir = parent.frame()) + + ## Open Profile GDS file for one profile + dataDir <- system.file("extdata/tests", package="RAIDS") + fileProfile <- file.path(tempdir(), "ex1.gds") + file.copy(file.path(dataDir, "ex1_demo_with_pruning_and_1KG_annot.gds"), + fileProfile) + profileGDS <- openfn.gds(fileProfile) + withr::defer((gdsfmt::closefn.gds(profileGDS)), envir = parent.frame()) + + chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] + + result <- RAIDS:::computeAllelicFractionRNA(gdsReference=gds1KG, + gdsSample=profileGDS, gdsRefAnnot=gds1_Annot_KG, currentProfile="ex1", + studyID="MYDATA", blockID="GeneS.Ensembl.Hsapiens.v86", + chrInfo=chrInfo, minCov=35L, minProb=0.999, eProb=0.001, + cutOffLOH=-5, cutOffAR=3, verbose=FALSE) + + expect_equal(nrow(result), 49) + expect_equal(ncol(result), 17) + expect_equal(colnames(result), c("cnt.tot", "cnt.ref", "cnt.alt", + "snp.pos", "snp.chr", "normal.geno", "pruned", "snp.index", "keep", + "hetero", "homo", "block.id", "phase", "lap", "LOH", "imbAR", "freq")) +}) + + +############################################################################# +### Tests calcAFMLRNA() results +############################################################################# + +context("calcAFMLRNA() results") + + +test_that("calcAFMLRNA() must return expected results", { + + subset <- data.frame(cnt.ref=c(31, 31, 28, 17), cnt.alt=c(16, 16, 11, 27), + phase=c(3, 3, 3, 3)) + + result <- RAIDS:::calcAFMLRNA(subset) + + expect_equal(length(result), 5) + expect_equal(result$lR, 4.057862, tolerance=1e-7) + expect_equal(result$aFraction, 0.3389831, tolerance=1e-7) + expect_equal(result$nPhase, 0) + expect_equal(result$sumAlleleLow, 60) + expect_equal(result$sumAlleleHigh, 117) + +}) \ No newline at end of file diff --git a/tests/testthat/test-gdsWrapper.R b/tests/testthat/test-gdsWrapper.R index 358fc69ec..a33e49ef7 100644 --- a/tests/testthat/test-gdsWrapper.R +++ b/tests/testthat/test-gdsWrapper.R @@ -24,243 +24,309 @@ local_GDS_file <- function(path) { ############################################################################# -### Tests addUpdateSegment() results +### Tests generateGDSRefSample() results ############################################################################# -context("addUpdateSegment() results") +context("generateGDSRefSample() results") -test_that("addUpdateSegment() must copy the expected entry in \"segment\" node of the GDS file", { +test_that("generateGDSRefSample() must copy the expected entry in \"sample.annot\" node of the GDS file", { ## Create a temporary GDS file in an test directory dataDir <- system.file("extdata/tests", package="RAIDS") - fileGDS <- file.path(dataDir, "GDS_TEMP_01.gds") + fileGDS <- file.path(dataDir, "GDS_TEMP_06.gds") ## Create and open a temporary GDS file GDS_file_tmp <- local_GDS_file(fileGDS) - ## Vector of segment identifiers - segments <- c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L) + ## Create PEF information + pedInformation <- data.frame(sample.id=c("sample_01", "sample_02", "sample_03"), + Name.ID=c("sample_01", "sample_02", "sample_03"), + sex=c(1, 1, 2), # 1:Male 2: Female + pop.group=c("ACB", "ACB", "ACB"), + superPop=c("AFR", "AFR", "AFR"), + batch=c(1, 1, 1), stringsAsFactors=FALSE) + rownames(pedInformation) <- pedInformation$Name.ID - ## Add segments to the GDS file - RAIDS:::addUpdateSegment(gdsProfile=GDS_file_tmp, snp.seg=segments) + ## Add samples to the GDS file + results3 <- RAIDS:::generateGDSRefSample(gdsReference=GDS_file_tmp, dfPedReference=pedInformation, + listSamples=c("sample_01", "sample_02")) - ## Read segments information from GDS file - results <- read.gdsn(index.gdsn(node=GDS_file_tmp, path="segment")) + ## Read sample names from GDS file + results1 <- read.gdsn(index.gdsn(node=GDS_file_tmp, path="sample.id")) + + results2 <- read.gdsn(index.gdsn(node=GDS_file_tmp, path="sample.annot")) ## Close GDS file ## The file will automatically be deleted closefn.gds(gdsfile=GDS_file_tmp) - expect_equal(results, segments) -}) - - -test_that("addUpdateSegment() must copy the expected entry in \"segment\" node of the GDS file", { + expected1 <- c("sample_01", "sample_02") - ## Create a temporary GDS file in an test directory - dataDir <- system.file("extdata/tests", package="RAIDS") - fileGDS <- file.path(dataDir, "GDS_TEMP_02.gds") + expected2 <- pedInformation[c("sample_01", "sample_02"),] + expected2$Name.ID <- NULL + expected2$sample.id <- NULL + rownames(expected2) <- NULL - ## Create and open a temporary GDS file - GDS_file_tmp <- local_GDS_file(fileGDS) + expect_equal(results1, expected1) + expect_equal(results2, expected2) + expect_equal(results3, expected1) +}) - ## Vector of segment identifiers - segments <- c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L) - ## Vector of segment identifiers - segments2 <- c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 5L, 5L) - ## Add segments to the GDS file - RAIDS:::addUpdateSegment(gdsProfile=GDS_file_tmp, snp.seg=segments) - ## Update segments to the GDS file - RAIDS:::addUpdateSegment(gdsProfile=GDS_file_tmp, snp.seg=segments2) +############################################################################# +### Tests addBlockInGDSAnnot() results +############################################################################# - ## Read segments information from GDS file - results <- read.gdsn(index.gdsn(node=GDS_file_tmp, path="segment")) +context("addBlockInGDSAnnot() results") - ## Close GDS file - ## The file will automatically be deleted - closefn.gds(gdsfile=GDS_file_tmp) - expect_equal(results, segments2) -}) +test_that("addBlockInGDSAnnot() must return expected result", { + ## Create and open a temporary GDS Annotation file + GDS_path <- test_path("fixtures", "GDS_addBlockInGDSAnnot_Temp_01.gds") + GDS_file_tmp <- createfn.gds(filename=GDS_path) + defer(unlink(x=GDS_path, force=TRUE), envir=parent.frame()) -############################################################################# -### Tests appendGDSSampleOnly() results -############################################################################# + blockType <- "EAS.0.05.500k" + blockDescription <- "EAS population blocks based on 500k windows" -context("appendGDSSampleOnly() results") + entries <- c(1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5) + ## Add block to the GDS file + results1 <- RAIDS:::addBlockInGDSAnnot(gds=GDS_file_tmp, listBlock=entries, + blockName=blockType, blockDesc=blockDescription) -test_that("appendGDSSampleOnly() must copy the expected entry in \"sample.id\" node of the GDS file", { - ## Create a temporary GDS file in an test directory - dataDir <- system.file("extdata/tests", package="RAIDS") - fileGDS <- file.path(dataDir, "GDS_TEMP_04.gds") + blockType2 <- "AFR.0.05.500k" + blockDescription2 <- "AFR population blocks based on 500k windows" - ## Create and open a temporary GDS file - GDS_file_tmp <- local_GDS_file(fileGDS) + entries2 <- c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5) - ## Create sample.id field - add.gdsn(node=GDS_file_tmp, name="sample.id", val=c("sample_01", - "sample_02")) - sync.gds(gdsfile=GDS_file_tmp) + ## Add block to the GDS file + results2 <- RAIDS:::addBlockInGDSAnnot(gds=GDS_file_tmp, listBlock=entries2, + blockName=blockType2, blockDesc=blockDescription2) - ## Vector of SNV names - samples <- c('sample_05', 'sample_08', 'sample_11') - ## Add name of samples to the GDS file - RAIDS:::appendGDSSampleOnly(gds=GDS_file_tmp, listSamples=samples) + ## Read block.annot from GDS file + results3 <- read.gdsn(index.gdsn(node=GDS_file_tmp, path="block.annot")) - ## Read segments information from GDS file - results <- read.gdsn(index.gdsn(node=GDS_file_tmp, path="sample.id")) + ## Read block.annot from GDS file + results4 <- read.gdsn(index.gdsn(node=GDS_file_tmp, path="block")) ## Close GDS file ## The file will automatically be deleted closefn.gds(gdsfile=GDS_file_tmp) - expected <- c("sample_01", "sample_02", samples) + expect_equal(results1, 0L) + expect_equal(results2, 0L) + expect_equal(results3, data.frame(block.id=c(blockType, blockType2), + block.desc=c(blockDescription, blockDescription2), + stringsAsFactors=FALSE)) - expect_equal(results, expected) + expect_equal(results4, matrix(data=c(entries, entries2), ncol=2, + byrow=FALSE)) }) - ############################################################################# -### Tests appendGDSgenotypeMat() results +### Tests generateGDSgenotype() results ############################################################################# -context("appendGDSgenotypeMat() results") +context("generateGDSgenotype() results") +test_that("generateGDSgenotype() must return expected result", { -test_that("appendGDSgenotypeMat() must copy the expected entry in \"genotype\" node of the GDS file", { + ## Create and open a temporary GDS Annotation file + GDS_path <- test_path("fixtures", "GDS_generateGDSgenotype_Temp_01.gds") + GDS_file_tmp <- createfn.gds(filename=GDS_path) + defer(unlink(x=GDS_path, force=TRUE), envir=parent.frame()) - ## Create a temporary GDS file in an test directory - dataDir <- system.file("extdata/tests", package="RAIDS") - fileGDS <- file.path(dataDir, "GDS_TEMP_06.gds") + put.attr.gdsn(GDS_file_tmp$root, "FileFormat", "SNP_ARRAY") - ## Create and open a temporary GDS file - GDS_file_tmp <- local_GDS_file(fileGDS) + pedigree <- data.frame(sample.id=c("HG00100", "HG00101", "HG00102"), + Name.ID=c("HG00100", "HG00101", "HG00102"), + sex=c(1,2,2), pop.group=c("ACB", "ACB", "ACB"), + superPop=c("AFR", "AFR", "AFR"), batch=c(0, 0, 0), + stringsAsFactors=FALSE) - ## Create a "genotype" node with initial matrix - geno_initial <- matrix(rep(0L, 24), nrow=6) + rownames(pedigree) <- pedigree$sample.id - add.gdsn(node=GDS_file_tmp, name="genotype", val=geno_initial) - sync.gds(GDS_file_tmp) + filterSNVFile <- test_path("fixtures", "mapSNVSelected_Demo.rds") + + ## Add information about samples + listSampleGDS <- RAIDS:::generateGDSRefSample(gdsReference=GDS_file_tmp, + dfPedReference=pedigree, listSamples=NULL) - ## new genotype to be added - geno_new <- matrix(rep(1L, 12), nrow=6) + ## Add SNV information to the Reference GDS + RAIDS:::generateGDSSNPinfo(gdsReference=GDS_file_tmp, + fileFreq=filterSNVFile, verbose=FALSE) - ## Add genotype to the GDS file - results0 <- RAIDS:::appendGDSgenotypeMat(gds=GDS_file_tmp, matG=geno_new) + snpIndexFile <- test_path("fixtures", "listSNPIndexes_Demo.rds") - ## Read genotype names from GDS file - results1 <- read.gdsn(index.gdsn(node=GDS_file_tmp, path="genotype")) + ## Add genotype information to the Reference GDS + result1 <- RAIDS:::generateGDSgenotype(gds=GDS_file_tmp, + pathGeno=test_path("fixtures"), + fileSNPsRDS=snpIndexFile, listSamples=listSampleGDS, verbose=FALSE) + + result2 <- read.gdsn(index.gdsn(GDS_file_tmp, "genotype")) ## Close GDS file ## The file will automatically be deleted closefn.gds(gdsfile=GDS_file_tmp) - expected1 <- cbind(geno_initial, geno_new) + expect_equal(result1, 0L) + expect_equal(result2, matrix(data=c(0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, + 1, 0, 0, 2, 2, 0, 2, 1), nrow=7, byrow=FALSE)) - expect_equal(results1, expected1) - expect_equal(results0, 0L) }) -############################################################################# -### Tests generateGDSRefSample() results -############################################################################# - -context("generateGDSRefSample() results") +test_that("generateGDSgenotype() must return expected error when sample not present", { + ## Create and open a temporary GDS Annotation file + GDS_path <- test_path("fixtures", "GDS_generateGDSgenotype_Temp_02.gds") + GDS_file_tmp <- createfn.gds(filename=GDS_path) + defer(closefn.gds(GDS_file_tmp), envir = parent.frame(), + priority = "first") + defer(unlink(x=GDS_path, force=TRUE), envir=parent.frame(), + priority = "last") -test_that("generateGDSRefSample() must copy the expected entry in \"sample.annot\" node of the GDS file", { + put.attr.gdsn(GDS_file_tmp$root, "FileFormat", "SNP_ARRAY") - ## Create a temporary GDS file in an test directory - dataDir <- system.file("extdata/tests", package="RAIDS") - fileGDS <- file.path(dataDir, "GDS_TEMP_06.gds") + pedigree <- data.frame(sample.id=c("HG00100", "HG00101", "HG00102"), + Name.ID=c("HG00100", "HG00101", "HG00102"), + sex=c(1,2,2), pop.group=c("ACB", "ACB", "ACB"), + superPop=c("AFR", "AFR", "AFR"), batch=c(0, 0, 0), + stringsAsFactors=FALSE) - ## Create and open a temporary GDS file - GDS_file_tmp <- local_GDS_file(fileGDS) + rownames(pedigree) <- pedigree$sample.id - ## Create PEF information - pedInformation <- data.frame(sample.id=c("sample_01", "sample_02", "sample_03"), - Name.ID=c("sample_01", "sample_02", "sample_03"), - sex=c(1, 1, 2), # 1:Male 2: Female - pop.group=c("ACB", "ACB", "ACB"), - superPop=c("AFR", "AFR", "AFR"), - batch=c(1, 1, 1), stringsAsFactors=FALSE) - rownames(pedInformation) <- pedInformation$Name.ID + filterSNVFile <- test_path("fixtures", "mapSNVSelected_Demo.rds") - ## Add samples to the GDS file - results3 <- RAIDS:::generateGDSRefSample(gdsReference=GDS_file_tmp, dfPedReference=pedInformation, - listSamples=c("sample_01", "sample_02")) + ## Add information about samples + listSampleGDS <- RAIDS:::generateGDSRefSample(gdsReference=GDS_file_tmp, + dfPedReference=pedigree, listSamples=NULL) - ## Read sample names from GDS file - results1 <- read.gdsn(index.gdsn(node=GDS_file_tmp, path="sample.id")) + ## Add SNV information to the Reference GDS + RAIDS:::generateGDSSNPinfo(gdsReference=GDS_file_tmp, + fileFreq=filterSNVFile, verbose=FALSE) - results2 <- read.gdsn(index.gdsn(node=GDS_file_tmp, path="sample.annot")) - - ## Close GDS file - ## The file will automatically be deleted - closefn.gds(gdsfile=GDS_file_tmp) - - expected1 <- c("sample_01", "sample_02") + snpIndexFile <- test_path("fixtures", "listSNPIndexes_Demo.rds") - expected2 <- pedInformation[c("sample_01", "sample_02"),] - expected2$Name.ID <- NULL - expected2$sample.id <- NULL - rownames(expected2) <- NULL + error_message <- "Missing samples genotype in BYEBYE" - expect_equal(results1, expected1) - expect_equal(results2, expected2) - expect_equal(results3, expected1) + expect_error(RAIDS:::generateGDSgenotype(gds=GDS_file_tmp, + pathGeno=test_path("fixtures"), fileSNPsRDS=snpIndexFile, + listSamples="BYEBYE", verbose=FALSE), error_message, fixed=TRUE) }) ############################################################################# -### Tests addGDSRef() results +### Tests appendGDSgenotype() results ############################################################################# -context("addGDSRef() results") - +context("appendGDSgenotype() results") -test_that("addGDSRef() must return expected result", { +test_that("appendGDSgenotype() must return expected result", { - ## Create and open a temporary GDS file - GDS_path <- test_path("fixtures", "GDS_addGDSRef_Temp_01.gds") + ## Create and open a temporary GDS Annotation file + GDS_path <- test_path("fixtures", "GDS_appendGDSgenotype_Temp_01.gds") GDS_file_tmp <- createfn.gds(filename=GDS_path) defer(unlink(x=GDS_path, force=TRUE), envir=parent.frame()) - ## Create "sample.id" node (the node must be present) - sampleIDs <- c("HG00104", "HG00105", "HG00106", "HG00109", "HG00110") - add.gdsn(node=GDS_file_tmp, name="sample.id", val=sampleIDs) - sync.gds(GDS_file_tmp) + put.attr.gdsn(GDS_file_tmp$root, "FileFormat", "SNP_ARRAY") - listD <- list(rels=c("HG00106", "HG00110"), unrels=c("HG00104", - "HG00105", "HG00109")) + pedigree <- data.frame(sample.id=c("HG00100", "HG00101", "HG00102"), + Name.ID=c("HG00100", "HG00101", "HG00102"), + sex=c(1,2,2), pop.group=c("ACB", "ACB", "ACB"), + superPop=c("AFR", "AFR", "AFR"), batch=c(0, 0, 0), + stringsAsFactors=FALSE) - RDS_file_tmp <- test_path("fixtures", "RDS_addGDSRef_Temp_01.RDS") + rownames(pedigree) <- pedigree$sample.id - saveRDS(listD, RDS_file_tmp) - defer(unlink(RDS_file_tmp), envir=parent.frame()) + filterSNVFile <- test_path("fixtures", "mapSNVSelected_Demo.rds") - ## Add samples to the GDS file - results3 <- RAIDS:::addGDSRef(gdsReference=GDS_file_tmp, filePart=RDS_file_tmp) + ## Add information about samples + listSampleGDS <- RAIDS:::generateGDSRefSample(gdsReference=GDS_file_tmp, + dfPedReference=pedigree, listSamples=NULL) - ## Read sample names from GDS file - results1 <- read.gdsn(index.gdsn(node=GDS_file_tmp, path="sample.ref")) + ## Add SNV information to the Reference GDS + RAIDS:::generateGDSSNPinfo(gdsReference=GDS_file_tmp, + fileFreq=filterSNVFile, verbose=FALSE) + + snpIndexFile <- test_path("fixtures", "listSNPIndexes_Demo.rds") + + ## Add genotype information to the Reference GDS + result1 <- RAIDS:::generateGDSgenotype(gds=GDS_file_tmp, + pathGeno=test_path("fixtures"), + fileSNPsRDS=snpIndexFile, listSamples=listSampleGDS[1], verbose=FALSE) + + + ## Append genotype information to the Reference GDS + result1 <- RAIDS:::appendGDSgenotype(gds=GDS_file_tmp, + pathGeno=test_path("fixtures"), + fileSNPsRDS=snpIndexFile, listSample=listSampleGDS[2], + verbose=FALSE) + + result2 <- read.gdsn(index.gdsn(GDS_file_tmp, "genotype")) ## Close GDS file ## The file will automatically be deleted closefn.gds(gdsfile=GDS_file_tmp) - expected1 <- c(1, 1, 0, 1, 0) + expect_equal(result1, 0L) + expect_equal(result2, matrix(data=c(0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, + 1), nrow=7, byrow=FALSE)) - expect_equal(results3, 0L) - expect_equal(results1, expected1) }) +test_that("appendGDSgenotype() must return expected error when sample not present", { + + ## Create and open a temporary GDS Annotation file + GDS_path <- test_path("fixtures", "GDS_appendGDSgenotype_Temp_02.gds") + GDS_file_tmp <- createfn.gds(filename=GDS_path) + defer(closefn.gds(GDS_file_tmp), envir = parent.frame(), + priority = "first") + defer(unlink(x=GDS_path, force=TRUE), envir=parent.frame(), + priority = "last") + + put.attr.gdsn(GDS_file_tmp$root, "FileFormat", "SNP_ARRAY") + + pedigree <- data.frame(sample.id=c("HG00100", "HG00101", "HG00102"), + Name.ID=c("HG00100", "HG00101", "HG00102"), + sex=c(1,2,2), pop.group=c("ACB", "ACB", "ACB"), + superPop=c("AFR", "AFR", "AFR"), batch=c(0, 0, 0), + stringsAsFactors=FALSE) + + rownames(pedigree) <- pedigree$sample.id + + filterSNVFile <- test_path("fixtures", "mapSNVSelected_Demo.rds") + ## Add information about samples + listSampleGDS <- RAIDS:::generateGDSRefSample(gdsReference=GDS_file_tmp, + dfPedReference=pedigree, listSamples=NULL) + + ## Add SNV information to the Reference GDS + RAIDS:::generateGDSSNPinfo(gdsReference=GDS_file_tmp, + fileFreq=filterSNVFile, verbose=FALSE) + + snpIndexFile <- test_path("fixtures", "listSNPIndexes_Demo.rds") + + ## Add genotype information to the Reference GDS + result1 <- RAIDS:::generateGDSgenotype(gds=GDS_file_tmp, + pathGeno=test_path("fixtures"), + fileSNPsRDS=snpIndexFile, listSamples=listSampleGDS[1], verbose=FALSE) + + ## Append genotype information to the Reference GDS + result1 <- RAIDS:::appendGDSgenotype(gds=GDS_file_tmp, + pathGeno=test_path("fixtures"), + fileSNPsRDS=snpIndexFile, listSample=listSampleGDS[2], + verbose=FALSE) + + error_message <- "Missing reference samples HELLO_CANADA" + + expect_error(RAIDS:::appendGDSgenotype(gds=GDS_file_tmp, + pathGeno=test_path("fixtures"), fileSNPsRDS=snpIndexFile, + listSample="HELLO_CANADA", verbose=FALSE), error_message, fixed=TRUE) +}) diff --git a/tests/testthat/test-gdsWrapper_internal.R b/tests/testthat/test-gdsWrapper_internal.R index e76a03d2a..1450c3527 100644 --- a/tests/testthat/test-gdsWrapper_internal.R +++ b/tests/testthat/test-gdsWrapper_internal.R @@ -23,6 +23,93 @@ local_GDS_file <- function(path) { } +############################################################################# +### Tests addGDSRef() results +############################################################################# + +context("addGDSRef() results") + + +test_that("addGDSRef() must return expected result", { + + ## Create and open a temporary GDS file + GDS_path <- test_path("fixtures", "GDS_addGDSRef_Temp_01.gds") + GDS_file_tmp <- createfn.gds(filename=GDS_path) + defer(unlink(x=GDS_path, force=TRUE), envir=parent.frame()) + + ## Create "sample.id" node (the node must be present) + sampleIDs <- c("HG00104", "HG00105", "HG00106", "HG00109", "HG00110") + add.gdsn(node=GDS_file_tmp, name="sample.id", val=sampleIDs) + sync.gds(GDS_file_tmp) + + listD <- list(rels=c("HG00106", "HG00110"), unrels=c("HG00104", + "HG00105", "HG00109")) + + RDS_file_tmp <- test_path("fixtures", "RDS_addGDSRef_Temp_01.RDS") + + saveRDS(listD, RDS_file_tmp) + defer(unlink(RDS_file_tmp), envir=parent.frame()) + + ## Add samples to the GDS file + results3 <- RAIDS:::addGDSRef(gdsReference=GDS_file_tmp, + filePart=RDS_file_tmp) + + ## Read sample names from GDS file + results1 <- read.gdsn(index.gdsn(node=GDS_file_tmp, path="sample.ref")) + + ## Close GDS file + ## The file will automatically be deleted + closefn.gds(gdsfile=GDS_file_tmp) + + expected1 <- c(1, 1, 0, 1, 0) + + expect_equal(results3, 0L) + expect_equal(results1, expected1) +}) + + +############################################################################# +### Tests appendGDSgenotypeMat() results +############################################################################# + +context("appendGDSgenotypeMat() results") + + +test_that("appendGDSgenotypeMat() must copy the expected entry in \"genotype\" node of the GDS file", { + + ## Create a temporary GDS file in an test directory + dataDir <- system.file("extdata/tests", package="RAIDS") + fileGDS <- file.path(dataDir, "GDS_TEMP_06.gds") + + ## Create and open a temporary GDS file + GDS_file_tmp <- local_GDS_file(fileGDS) + + ## Create a "genotype" node with initial matrix + geno_initial <- matrix(rep(0L, 24), nrow=6) + + add.gdsn(node=GDS_file_tmp, name="genotype", val=geno_initial) + sync.gds(GDS_file_tmp) + + ## new genotype to be added + geno_new <- matrix(rep(1L, 12), nrow=6) + + ## Add genotype to the GDS file + results0 <- RAIDS:::appendGDSgenotypeMat(gds=GDS_file_tmp, matG=geno_new) + + ## Read genotype names from GDS file + results1 <- read.gdsn(index.gdsn(node=GDS_file_tmp, path="genotype")) + + ## Close GDS file + ## The file will automatically be deleted + closefn.gds(gdsfile=GDS_file_tmp) + + expected1 <- cbind(geno_initial, geno_new) + + expect_equal(results1, expected1) + expect_equal(results0, 0L) +}) + + ############################################################################# ### Tests runIBDKING() results ############################################################################# @@ -45,13 +132,13 @@ test_that("runIBDKING() must return expected results", { maf=0.05, verbose=FALSE) sampleIDs <- c("NA07034", "NA07055", "NA12814") - ibs0 <- matrix(c(0.000000000000000, 0.079705823891870, 0.088194306191519, - 0.079705823891870, 0.000000000000000, 0.094035785288270, - 0.088194306191519, 0.094035785288270, 0.000000000000000), nrow=3, + ibs0 <- matrix(c(0.000000000000000, 0.079928243970500, 0.088100458441300, + 0.079928243970500, 0.000000000000000, 0.093880805262109, + 0.088100458441300, 0.093880805262109, 0.000000000000000), nrow=3, byrow=TRUE) - kinship <- matrix(c(0.500000000000000, 0.017237640936687, 0.001294777729823, - 0.017237640936687, 0.500000000000000, -0.014341590612777, - 0.001294777729823, -0.014341590612777, 0.500000000000000), nrow=3, + kinship <- matrix(c(0.500000000000000, 0.015481901439163, 0.001080847384349, + 0.015481901439163, 0.500000000000000, -0.014173571740078, + 0.001080847384349, -0.014173571740078, 0.500000000000000), nrow=3, byrow=TRUE) expect_equal(result$sample.id, sampleIDs) @@ -494,3 +581,158 @@ test_that("addUpdateLap() must copy the expected entry in \"lap\" node of the GD expect_equal(results, lap) }) + +############################################################################# +### Tests getBlockIDs() results +############################################################################# + + +context("getBlockIDs() results") + + +test_that("getBlockIDs() must return the expected result", { + + ## Create a temporary GDS file in an test directory + dataDir <- test_path("fixtures") + fileGDS <- file.path(dataDir, "ex1_good_small_1KG_Annot_GDS.gds") + + annotFile <- openfn.gds(fileGDS) + defer(closefn.gds(annotFile), envir=parent.frame()) + + ## Vector of segment identifiers + indexes <- c(1, 3, 6, 8, 9) + + ## Block identifiers for the selected SNVs + result <- RAIDS:::getBlockIDs(gdsRefAnnot=annotFile, snpIndex=indexes, + blockTypeID="GeneS.Ensembl.Hsapiens.v86") + + expected <- rep(943, 5) + + expect_equal(result, expected) +}) + + +test_that("getBlockIDs() must return expected error", { + + ## Create a temporary GDS file in an test directory + dataDir <- test_path("fixtures") + fileGDS <- file.path(dataDir, "ex1_good_small_1KG_Annot_GDS.gds") + + annotFile <- openfn.gds(fileGDS) + defer(closefn.gds(annotFile), envir=parent.frame()) + + ## Vector of segment identifiers + indexes <- c(1, 3, 6, 8, 9) + + error_message <- paste0("The following block type is not found in the ", + "GDS Annotation file: \'InformationTremblay\'") + + expect_error(RAIDS:::getBlockIDs(gdsRefAnnot=annotFile, snpIndex=indexes, + blockTypeID="InformationTremblay"), + error_message, fixed=TRUE) +}) + + +############################################################################# +### Tests addUpdateSegment() results +############################################################################# + +context("addUpdateSegment() results") + + +test_that("addUpdateSegment() must copy the expected entry in \"segment\" node of the GDS file", { + + ## Create a temporary GDS file in an test directory + dataDir <- system.file("extdata/tests", package="RAIDS") + fileGDS <- file.path(dataDir, "GDS_TEMP_01.gds") + + ## Create and open a temporary GDS file + GDS_file_tmp <- local_GDS_file(fileGDS) + + ## Vector of segment identifiers + segments <- c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L) + + ## Add segments to the GDS file + RAIDS:::addUpdateSegment(gdsProfile=GDS_file_tmp, snpSeg=segments) + + ## Read segments information from GDS file + results <- read.gdsn(index.gdsn(node=GDS_file_tmp, path="segment")) + + ## Close GDS file + ## The file will automatically be deleted + closefn.gds(gdsfile=GDS_file_tmp) + + expect_equal(results, segments) +}) + + +test_that("addUpdateSegment() must copy the expected entry in \"segment\" node of the GDS file", { + + ## Create a temporary GDS file in an test directory + dataDir <- system.file("extdata/tests", package="RAIDS") + fileGDS <- file.path(dataDir, "GDS_TEMP_02.gds") + + ## Create and open a temporary GDS file + GDS_file_tmp <- local_GDS_file(fileGDS) + + ## Vector of segment identifiers + segments <- c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L) + ## Vector of segment identifiers + segments2 <- c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 5L, 5L) + + ## Add segments to the GDS file + RAIDS:::addUpdateSegment(gdsProfile=GDS_file_tmp, snpSeg=segments) + + ## Update segments to the GDS file + RAIDS:::addUpdateSegment(gdsProfile=GDS_file_tmp, snpSeg=segments2) + + ## Read segments information from GDS file + results <- read.gdsn(index.gdsn(node=GDS_file_tmp, path="segment")) + + ## Close GDS file + ## The file will automatically be deleted + closefn.gds(gdsfile=GDS_file_tmp) + + expect_equal(results, segments2) +}) + + +############################################################################# +### Tests appendGDSSampleOnly() results +############################################################################# + +context("appendGDSSampleOnly() results") + + +test_that("appendGDSSampleOnly() must copy the expected entry in \"sample.id\" node of the GDS file", { + + ## Create a temporary GDS file in an test directory + dataDir <- system.file("extdata/tests", package="RAIDS") + fileGDS <- file.path(dataDir, "GDS_TEMP_04.gds") + + ## Create and open a temporary GDS file + GDS_file_tmp <- local_GDS_file(fileGDS) + + ## Create sample.id field + add.gdsn(node=GDS_file_tmp, name="sample.id", val=c("sample_01", + "sample_02")) + sync.gds(gdsfile=GDS_file_tmp) + + ## Vector of SNV names + samples <- c('sample_05', 'sample_08', 'sample_11') + + ## Add name of samples to the GDS file + RAIDS:::appendGDSSampleOnly(gds=GDS_file_tmp, listSamples=samples) + + ## Read segments information from GDS file + results <- read.gdsn(index.gdsn(node=GDS_file_tmp, path="sample.id")) + + ## Close GDS file + ## The file will automatically be deleted + closefn.gds(gdsfile=GDS_file_tmp) + + expected <- c("sample_01", "sample_02", samples) + + expect_equal(results, expected) +}) + diff --git a/tests/testthat/test-process1KG.R b/tests/testthat/test-process1KG.R index cde421236..5764e9894 100644 --- a/tests/testthat/test-process1KG.R +++ b/tests/testthat/test-process1KG.R @@ -173,18 +173,18 @@ test_that("generateMapSnvSel() must generate expected output", { 54489, 54707, 54715), REF=c("T", "T", "C", "G", "G", "G", "C"), ALT=c("G", "A", "A", "A", "A", "C", "T"), - AF=as.character(c(0.02, 0.11, 0.08, 0.07, - 0.1, 0.23, 0.21)), - EAS_AF=c("0.0", "0.0", "0.05", "0.01", - "0.0", "0.08", "0.07"), - EUR_AF=c("0.04", "0.2", "0.12", "0.14", - "0.18", "0.38", "0.34"), - AFR_AF=c("0.03", "0.02", "0.05", "0.06", - "0.02", "0.18", "0.16"), - AMR_AF=c("0.03", "0.12", "0.06", "0.07", - "0.1", "0.25", "0.24"), - SAS_AF=c("0.02", "0.22", "0.1", "0.09", - "0.21", "0.28", "0.27"), + AF=c(0.02, 0.11, 0.08, 0.07, + 0.1, 0.23, 0.21), + EAS_AF=c(0.0, 0.0, 0.05, 0.01, + 0.0, 0.08, 0.07), + EUR_AF=c(0.04, 0.2, 0.12, 0.14, + 0.18, 0.38, 0.34), + AFR_AF=c(0.03, 0.02, 0.05, 0.06, + 0.02, 0.18, 0.16), + AMR_AF=c(0.03, 0.12, 0.06, 0.07, + 0.1, 0.25, 0.24), + SAS_AF=c(0.02, 0.22, 0.1, 0.09, + 0.21, 0.28, 0.27), stringsAsFactors = FALSE) expect_equivalent(readRDS(filterSNVFile), snpFilteredExpected) @@ -562,8 +562,8 @@ test_that("addGeneBlockGDSRefAnnot() must return error when gds is a character s error_message <- "The \'gdsReference\' must be an object of class \'gds.class\'" expect_error(addGeneBlockGDSRefAnnot(gdsReference="test.gds", - file.gdsRefAnnot="toto.gds", winSize=10000, EnsDb="human", - suffixe.blockName="test"), error_message) + gdsRefAnnotFile="toto.gds", winSize=10000, ensDb="human", + suffixBlockName="test"), error_message) }) test_that("addGeneBlockGDSRefAnnot() must return error when file.gdsRefAnnot does not exist", { @@ -576,8 +576,8 @@ test_that("addGeneBlockGDSRefAnnot() must return error when file.gdsRefAnnot doe error_message <- "The file \'fixtures/titi.gds\' does not exist." expect_error(addGeneBlockGDSRefAnnot(gdsReference=gds_1KG, - file.gdsRefAnnot=test_path("fixtures", "titi.gds"), - winSize=1000, EnsDb="human", suffixe.blockName="test"), error_message) + gdsRefAnnotFile=test_path("fixtures", "titi.gds"), + winSize=1000, ensDb="human", suffixBlockName="test"), error_message) }) test_that("addGeneBlockGDSRefAnnot() must return error when winSize is a character string", { @@ -590,9 +590,9 @@ test_that("addGeneBlockGDSRefAnnot() must return error when winSize is a charact error_message <- "The \'winSize\' parameter must be a single numeric value." expect_error(addGeneBlockGDSRefAnnot(gdsReference=gds_1KG, - file.gdsRefAnnot=test_path("fixtures", - "ex1_good_small_1KG_Annot_GDS.gds"), winSize="10", EnsDb="human", - suffixe.blockName="test"), error_message) + gdsRefAnnotFile=test_path("fixtures", + "ex1_good_small_1KG_Annot_GDS.gds"), winSize="10", ensDb="human", + suffixBlockName="test"), error_message) }) @@ -616,7 +616,7 @@ test_that("generatePhase1KG2GDS() must return error when verbose is a numeric", " (TRUE or FALSE).") expect_error(generatePhase1KG2GDS(gdsReference=gds1KG, - gdsReferencePhase=gds1KG, pathGeno=dataDir, fileSNPsRDS="test", + gdsReferencePhase=gds1KG, pathGeno=dataDir, fileSNVIndex="test", verbose="SAVE"), error_message, fixed=TRUE) }) diff --git a/tests/testthat/test-process1KG_internal.R b/tests/testthat/test-process1KG_internal.R index 788f64a6a..8dd56f3d7 100644 --- a/tests/testthat/test-process1KG_internal.R +++ b/tests/testthat/test-process1KG_internal.R @@ -46,3 +46,37 @@ test_that("validateGenerateGDS1KG() must return expected results when all input expect_identical(result1, 0L) }) + + + + +############################################################################# +### Tests pruning1KGbyChr() results +############################################################################# + +context("pruning1KGbyChr() results") + + +test_that("pruning1KGbyChr() must return expected results when all input are valid", { + + set.seed(121) + + dataDir <- test_path("fixtures") + + gds1KG <- snpgdsOpen(file.path(dataDir, "1KG_Test.gds")) + withr::defer((snpgdsClose(gds1KG)), envir = parent.frame()) + + outPrefix <- file.path(tempdir(), "Pruned_Test") + result1 <- RAIDS:::pruning1KGbyChr(gdsReference=gds1KG, + outPrefix=outPrefix) + withr::defer(if(file.exists(paste0(outPrefix, ".rds"))) + {unlink(paste0(outPrefix, ".rds"), force=TRUE)}, + envir=parent.frame()) + + expect_identical(result1, 0L) + expect_true(file.exists(paste0(outPrefix, ".rds"))) + + test <- readRDS(paste0(outPrefix, ".rds")) + + expect_identical(test, c("s5", "s7")) +}) \ No newline at end of file diff --git a/tests/testthat/test-processStudy.R b/tests/testthat/test-processStudy.R index 598aad88a..5a32e1cbd 100644 --- a/tests/testthat/test-processStudy.R +++ b/tests/testthat/test-processStudy.R @@ -4,265 +4,8 @@ library(RAIDS) library(withr) library(gdsfmt) -chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L, 181538259L, - 170805979L, 159345973L, 145138636L, 138394717L, 133797422L, - 135086622L, 133275309L, 114364328L, 107043718L, 101991189L, - 90338345L, 83257441L, 80373285L, 58617616L, 64444167L, - 46709983L, 50818468L, 156040895L, 57227415L, 16569L) -############################################################################# -### Tests projectSample2PCA() results -############################################################################# - -context("projectSample2PCA() results") - - -test_that("projectSample2PCA() must return error when np is character string", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - - error_message <- "The \'np\' parameter must be a single positive integer." - - expect_error(projectSample2PCA(gdsProfile=fileGDS, listPCA=list(), - currentProfile="sample1", - np="test"), error_message) -}) - -test_that("projectSample2PCA() must return error when np is negative integer", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - - error_message <- "The \'np\' parameter must be a single positive integer." - - expect_error(projectSample2PCA(gdsProfile=fileGDS, listPCA=list(), - currentProfile="sample1", - np=-1L), error_message) -}) - - -test_that("projectSample2PCA() must return error when np is zero", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - - error_message <- "The \'np\' parameter must be a single positive integer." - - expect_error(projectSample2PCA(gdsProfile=fileGDS, listPCA=list(), - currentProfile="sample1", np=0L), error_message) -}) - - -test_that("projectSample2PCA() must return error when currentProfile is number", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - - error_message <- paste0("The \'currentProfile\' ", - "parameter must be a character string.") - - expect_error(projectSample2PCA(gdsProfile=fileGDS, listPCA=list(), - currentProfile=101, np=1L), error_message) -}) - - -test_that("projectSample2PCA() must return error when verbose is number", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - - error_message <- paste0("The \'verbose\' parameter must be logical ", - "(TRUE or FALSE).") - - expect_error(projectSample2PCA(gdsProfile=fileGDS, listPCA=list(), - currentProfile="sample01", np=1L, verbose=33), error_message, - fixed=TRUE) -}) - - -############################################################################# -### Tests appendStudy2GDS1KG() results -############################################################################# - - -context("appendStudy2GDS1KG() results") - -test_that("appendStudy2GDS1KG() must return error when pathGeno is a numeric", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - sampleRDS <- test_path("fixtures", "Sample_Info_Test.RDS") - - studyInfo <- data.frame(study.id="Pancreatic.WES", - study.desc="Pancreatic study", study.platform="WES", - stringsAsFactors=FALSE) - - error_message <- paste0("The \'pathGeno\' must be a character string ", - "representing a path. The path must exist.") - - expect_error(appendStudy2GDS1KG(pathGeno=22, filePedRDS=sampleRDS, - fileNameGDS=fileGDS, batch=2, - studyDF=studyInfo, listSamples=NULL, pathProfileGDS=NULL, - genoSource="snp-pileup", verbose="TRUE"), error_message, fixed=TRUE) -}) - - -test_that("appendStudy2GDS1KG() must return error when filePedRDS is numeric", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - sampleRDS <- test_path("fixtures", "Sample_Info_Test.RDS") - - studyInfo <- data.frame(study.id="Pancreatic.WES", - study.desc="Pancreatic study", study.platform="WES", - stringsAsFactors=FALSE) - - error_message <- paste0("The \'filePedRDS\' must be a character string ", - "representing the RDS Sample information file. The file must exist.") - - expect_error(appendStudy2GDS1KG(pathGeno=test_path("fixtures"), - filePedRDS=21, fileNameGDS=fileGDS, batch=1, - studyDF=studyInfo, listSamples=NULL, pathProfileGDS=NULL, - genoSource="snp-pileup", verbose=TRUE), error_message) -}) - - -test_that("appendStudy2GDS1KG() must return error when fileNameGDS is numeric", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - sampleRDS <- test_path("fixtures", "Sample_Info_Test.RDS") - - studyInfo <- data.frame(study.id="Pancreatic.WES", - study.desc="Pancreatic study", study.platform="WES", - stringsAsFactors=FALSE) - - error_message <- paste0("The \'fileNameGDS\' must be a character string ", - "representing the GDS 1KG file. The file must exist.") - - expect_error(appendStudy2GDS1KG(pathGeno=test_path("fixtures"), - filePedRDS=sampleRDS, fileNameGDS=33, batch=1, - studyDF=studyInfo, listSamples=NULL, pathProfileGDS=NULL, - genoSource="snp-pileup", verbose=TRUE), error_message) -}) - - -test_that("appendStudy2GDS1KG() must return error when batch is a vector of numerics", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - sampleRDS <- test_path("fixtures", "Sample_Info_Test.RDS") - - studyInfo <- data.frame(study.id="Pancreatic.WES", study.desc="Pancreatic", - study.platform="WES", stringsAsFactors=FALSE) - - error_message <- "The \'batch\' must be a single integer." - - expect_error(appendStudy2GDS1KG(pathGeno=test_path("fixtures"), - filePedRDS=sampleRDS, fileNameGDS=fileGDS, batch=c(1,2), - studyDF=studyInfo, listSamples=NULL, pathProfileGDS=NULL, - genoSource="snp-pileup", verbose=TRUE), error_message) -}) - -test_that("appendStudy2GDS1KG() must return error when batch is a character string", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - sampleRDS <- test_path("fixtures", "Sample_Info_Test.RDS") - - studyInfo <- data.frame(study.id="Pancreatic.WES", - study.desc="Pancreatic study", study.platform="WES", - stringsAsFactors=FALSE) - - error_message <- "The \'batch\' must be a single integer." - - expect_error(appendStudy2GDS1KG(pathGeno=test_path("fixtures"), - filePedRDS=sampleRDS, fileNameGDS=fileGDS, batch="2", - studyDF=studyInfo, listSamples=NULL, pathProfileGDS=NULL, - genoSource="snp-pileup", verbose=TRUE), error_message) -}) - - -test_that("appendStudy2GDS1KG() must return error when studyDF is missing mandatory column", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - sampleRDS <- test_path("fixtures", "Sample_Info_Test.RDS") - - studyInfo <- data.frame(study.id="Pancreatic.WES", - study.descption="Pancreatic study", study.platform="WES", - stringsAsFactors=FALSE) - - error_message <- paste0("The \'studyDF\' must be a data.frame and contain ", - "those 3 columns: \'study.id\', \'study.desc\' and \'study.platform\'.") - - expect_error(appendStudy2GDS1KG(pathGeno=test_path("fixtures"), - filePedRDS=sampleRDS, fileNameGDS=fileGDS, batch=1, - studyDF=studyInfo, listSamples=NULL, pathProfileGDS=NULL, - genoSource="snp-pileup", verbose=TRUE), error_message) -}) - - -test_that("appendStudy2GDS1KG() must return error when listSamples is a numeric", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - sampleRDS <- test_path("fixtures", "Sample_Info_Test.RDS") - - studyInfo <- data.frame(study.id="Pancreatic.WES", - study.desc="Pancreatic study", study.platform="WES", - stringsAsFactors=FALSE) - - error_message <- paste0("The \'listSamples\' must be a vector ", - "of character strings (1 entry or more) or NULL.") - - expect_error(appendStudy2GDS1KG(pathGeno=test_path("fixtures"), - filePedRDS=sampleRDS, fileNameGDS=fileGDS, batch=2, - studyDF=studyInfo, listSamples=33, pathProfileGDS=NULL, - genoSource="snp-pileup", verbose=FALSE), error_message, fixed=TRUE) -}) - -test_that("appendStudy2GDS1KG() must return error when genoSource is a numeric", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - sampleRDS <- test_path("fixtures", "Sample_Info_Test.RDS") - - studyInfo <- data.frame(study.id="Pancreatic.WES", - study.desc="Pancreatic study", study.platform="WES", - stringsAsFactors=FALSE) - - error_message <- 'The \'genoSource\' parameter must be a character string.' - - expect_error(appendStudy2GDS1KG(pathGeno=test_path("fixtures"), - filePedRDS=sampleRDS, fileNameGDS=fileGDS, batch=2, - studyDF=studyInfo, listSamples=NULL, pathProfileGDS=NULL, - genoSource=3, verbose=TRUE), error_message, fixed=TRUE) -}) - - -test_that("appendStudy2GDS1KG() must return error when verbose is a character string", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - sampleRDS <- test_path("fixtures", "Sample_Info_Test.RDS") - - studyInfo <- data.frame(study.id="Pancreatic.WES", - study.desc="Pancreatic study", study.platform="WES", - stringsAsFactors=FALSE) - - error_message <- 'The \'verbose\' parameter must be a logical (TRUE or FALSE).' - - expect_error(appendStudy2GDS1KG(pathGeno=test_path("fixtures"), - filePedRDS=sampleRDS, fileNameGDS=fileGDS, batch=2, - studyDF=studyInfo, listSamples=NULL, pathProfileGDS=NULL, - genoSource="snp-pileup", verbose="TRUE"), error_message, fixed=TRUE) -}) - - -test_that("appendStudy2GDS1KG() must return error when genoSource not in list of choices", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - sampleRDS <- test_path("fixtures", "Sample_Info_Test.RDS") - - studyInfo <- data.frame(study.id="Pancreatic.WES", - study.desc="Pancreatic study", study.platform="WES", - stringsAsFactors=FALSE) - - expect_error(appendStudy2GDS1KG(pathGeno=test_path("fixtures"), - filePedRDS=sampleRDS, fileNameGDS=fileGDS, batch=2, - studyDF=studyInfo, listSamples=NULL, pathProfileGDS=NULL, - genoSource="TOP-pileup", verbose=TRUE)) -}) - ############################################################################# ### Tests pruningSample() results ############################################################################# @@ -1206,8 +949,8 @@ test_that("createStudy2GDS1KG() must return error when fileNameGDS is numerical Sample.Type=rep("Primary Tumor", 3), Source=rep("Databank B", 3), stringsAsFactors=FALSE) - error_message <- paste0("The \'fileNameGDS\' must be a character ", - "string representing the GDS 1KG file. The file must exist.") + error_message <- paste0("The \'fileNameGDS\' must be a character string ", + "representing the Population Reference GDS file. The file must exist.") expect_error(createStudy2GDS1KG(pathGeno=dataDir, filePedRDS=NULL, pedStudy=pedDF, fileNameGDS=33, @@ -1219,8 +962,8 @@ test_that("createStudy2GDS1KG() must return error when fileNameGDS is numerical test_that("createStudy2GDS1KG() must return error when batch is character string", { - dataDir <- system.file("extdata/tests", package="RAIDS") - fileGDS <- file.path(dataDir, "1KG_Test.gds") + dataDir <- test_path("fixtures") + fileGDS <- test_path("fixtures", "1KG_Test.gds") pedDF <- data.frame(Name.ID=c("Sample_01", "Sample_02", "Sample_03"), Case.ID=c("Patient_h11", "Patient_h12", "Patient_h18"), @@ -1240,7 +983,7 @@ test_that("createStudy2GDS1KG() must return error when batch is character string test_that("createStudy2GDS1KG() must return error when batch is vector of numerics", { - dataDir <- system.file("extdata/tests", package="RAIDS") + dataDir <- test_path("fixtures") fileGDS <- test_path("fixtures", "1KG_Test.gds") pedDF <- data.frame(Name.ID=c("Sample_01", "Sample_02", "Sample_03"), @@ -1286,7 +1029,7 @@ test_that("createStudy2GDS1KG() must return error when listSamples is vector of test_that("createStudy2GDS1KG() must return error when listProfiles is numeric", { - dataDir <- system.file("extdata/tests", package="RAIDS") + dataDir <- test_path("fixtures") fileGDS <- test_path("fixtures", "1KG_Test.gds") pedDF <- data.frame(Name.ID=c("Sample_01", "Sample_02", "Sample_03"), @@ -1311,7 +1054,7 @@ test_that("createStudy2GDS1KG() must return error when listProfiles is numeric", test_that("createStudy2GDS1KG() must return error when studyDF is missing column", { - dataDir <- system.file("extdata/tests", package="RAIDS") + dataDir <- test_path("fixtures") fileGDS <- test_path("fixtures", "1KG_Test.gds") pedDF <- data.frame(Name.ID=c("Sample_01", "Sample_02", "Sample_03"), @@ -1337,8 +1080,8 @@ test_that("createStudy2GDS1KG() must return error when studyDF is missing column test_that("createStudy2GDS1KG() must return error when verbose is numeric", { - dataDir <- system.file("extdata/tests", package="RAIDS") - fileGDS <- file.path(dataDir, "1KG_Test.gds") + dataDir <- test_path("fixtures") + fileGDS <- test_path("fixtures", "1KG_Test.gds") pedDF <- data.frame(Name.ID=c("Sample_01", "Sample_02", "Sample_03"), Case.ID=c("Patient_h11", "Patient_h12", "Patient_h18"), @@ -1358,11 +1101,34 @@ test_that("createStudy2GDS1KG() must return error when verbose is numeric", { genoSource="snp-pileup", verbose=22), error_message, fixed=TRUE) }) +test_that("createStudy2GDS1KG() must return error when the gdsProfile already exists", { + + dataDir <- test_path("fixtures") + fileGDS <- test_path("fixtures", "1KG_Test.gds") + + pedDF <- data.frame(Name.ID=c("Sample_01", "Sample_02", "Sample_03"), + Case.ID=c("Patient_h11", "Patient_h12", "Patient_h18"), + Diagnosis=rep("Cancer", 3), + Sample.Type=rep("Primary Tumor", 3), + Source=rep("Databank B", 3), stringsAsFactors=FALSE) + + studyDF <- data.frame(study.id="MYDATA", study.desc="Description", + study.platform="PLATFORM", stringsAsFactors=FALSE) + + error_message <- paste0("The gds file for ", "GDS_Sample_with_study_demo", " already exist.") + + expect_error(createStudy2GDS1KG(pathGeno=dataDir, + filePedRDS=NULL, pedStudy=pedDF, fileNameGDS=fileGDS, + batch=1, studyDF=studyDF, listProfiles=c("GDS_Sample_with_study_demo"), + pathProfileGDS=dataDir, + genoSource="snp-pileup", verbose=FALSE), error_message, fixed=TRUE) +}) + test_that("createStudy2GDS1KG() must return error when pathProfileGDS is numeric", { - dataDir <- system.file("extdata/tests", package="RAIDS") - gdsFile <- file.path(dataDir, "1KG_Test.gds") + dataDir <- test_path("fixtures") + gdsFile <- test_path("fixtures", "1KG_Test.gds") pedDF <- data.frame(Name.ID=c("Sample_01", "Sample_02", "Sample_03"), Case.ID=c("Patient_h11", "Patient_h12", "Patient_h18"), @@ -2159,259 +1925,6 @@ test_that(paste0("computeAncestryFromSyntheticFile() must return expected result }) - -############################################################################# -### Tests computePoolSyntheticAncestry() results -############################################################################# - -context("computePoolSyntheticAncestry() results") - - -test_that("computePoolSyntheticAncestry() must return error when gdsReference is a character string", { - - dataDir <- system.file("extdata/tests", package="RAIDS") - fileGDS <- file.path(dataDir, "1KG_Test.gds") - - gdsF <- openfn.gds(fileGDS) - withr::defer((gdsfmt::closefn.gds(gdsF)), envir=parent.frame()) - - dataRefDemo <- data.frame(sample.id=c("SampleA", "SampleB", "SampleC", - "SampleD"), - pop.group=c("TSI", "TSI", "YRI", "YRI"), - superPop=c("EUR", "EUR", "AFR", "AFR")) - - error_message <- paste0("The \'gdsReference\' must be an object of ", - "class \'gds.class\'.") - - expect_error(computePoolSyntheticAncestry(gdsReference="toto.gds", gdsSample=gdsF, - profileID="test", dataRef=dataRefDemo, spRef="TODO", - studyIDSyn="synthetic", - np=1L, listCatPop="EUR", fieldPopIn1KG="superPop", - fieldPopInfAnc="SuperPop", kList=seq(2, 15, 1), - pcaList=seq(2, 15, 1), algorithm="exact", - eigenCount=32L, missingRate=0.025), error_message) -}) - - -test_that(paste0("computePoolSyntheticAncestry() must return error when gds is a numerical value"), { - - dataDir <- system.file("extdata/tests", package="RAIDS") - fileGDS <- file.path(dataDir, "1KG_Test.gds") - - gdsF <- openfn.gds(fileGDS) - withr::defer((gdsfmt::closefn.gds(gdsF)), envir=parent.frame()) - - dataRefDemo <- data.frame(sample.id=c("SampleA", "SampleB", "SampleC", - "SampleD"), - pop.group=c("TSI", "TSI", "YRI", "YRI"), - superPop=c("EUR", "EUR", "AFR", "AFR")) - - error_message <- paste0("The \'gdsReference\' must be an object of ", - "class \'gds.class\'.") - - expect_error(computePoolSyntheticAncestry(gdsReference=31, gdsSample=gdsF, - profileID="test", dataRef=dataRefDemo, spRef="TODO", - studyIDSyn="synthetic", - np=1L, listCatPop="EUR", fieldPopIn1KG="superPop", - fieldPopInfAnc="SuperPop", kList=seq(2, 15, 1), - pcaList=seq(2, 15, 1), algorithm="exact", - eigenCount=32L, missingRate=0.025), error_message) -}) - - -test_that("computePoolSyntheticAncestry() must return error when profileGDS is a numerical value", { - - dataDir <- system.file("extdata/tests", package="RAIDS") - fileGDS <- file.path(dataDir, "1KG_Test.gds") - - gdsF <- openfn.gds(fileGDS) - withr::defer((gdsfmt::closefn.gds(gdsF)), envir=parent.frame()) - - dataRefDemo <- data.frame(sample.id=c("SampleA", "SampleB", "SampleC", - "SampleD"), - pop.group=c("TSI", "TSI", "YRI", "YRI"), - superPop=c("EUR", "EUR", "AFR", "AFR")) - - error_message <- paste0("The \'profileGDS\' must be an object of ", - "class \'gds.class\'.") - - expect_error(computePoolSyntheticAncestry(gdsReference=gdsF, gdsSample=33, - profileID="test", dataRef=dataRefDemo, spRef="TODO", - studyIDSyn="synthetic", - np=1L, listCatPop="EUR", fieldPopIn1KG="superPop", - fieldPopInfAnc="SuperPop", kList=seq(2, 15, 1), - pcaList=seq(2, 15, 1), algorithm="exact", - eigenCount=32L, missingRate=0.025), error_message) -}) - - -test_that("computePoolSyntheticAncestry() must return error when profileID is a numerical value", { - - dataDir <- system.file("extdata/tests", package="RAIDS") - fileGDS <- file.path(dataDir, "1KG_Test.gds") - - gdsF <- openfn.gds(fileGDS) - withr::defer((gdsfmt::closefn.gds(gdsF)), envir=parent.frame()) - - dataRefDemo <- data.frame(sample.id=c("SampleA", "SampleB", "SampleC", - "SampleD"), - pop.group=c("TSI", "TSI", "YRI", "YRI"), - superPop=c("EUR", "EUR", "AFR", "AFR")) - - error_message <- "The \'profileID\' parameter must be a character string." - - expect_error(computePoolSyntheticAncestry(gdsReference=gdsF, gdsSample=gdsF, - profileID=22, dataRef=dataRefDemo, spRef="TODO", - studyIDSyn="synthetic", np=1L, listCatPop="EUR", - fieldPopIn1KG="superPop", fieldPopInfAnc="SuperPop", - kList=seq(2, 15, 1), pcaList=seq(2, 15, 1), algorithm="exact", - eigenCount=32L, missingRate=0.025), error_message) -}) - - -test_that("computePoolSyntheticAncestry() must return error when dataRef is character string", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - gdsF <- openfn.gds(fileGDS) - withr::defer(closefn.gds(gdsF), envir=parent.frame()) - - error_message <- "The \'dataRef\' must be a data.frame object." - - expect_error(computePoolSyntheticAncestry(gdsReference=gdsF, gdsSample=gdsF, - profileID="test", dataRef="test", spRef="TODO", - studyIDSyn="synthetic", - np=1L, listCatPop="EUR", fieldPopIn1KG="superPop", - fieldPopInfAnc="SuperPop", kList=seq(2, 15, 1), - pcaList=seq(2, 15, 1), algorithm="exact", - eigenCount=32L, missingRate=0.025), error_message) -}) - - -test_that("computePoolSyntheticAncestry() must return error when studyIDSyn is numeric value", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - gdsF <- openfn.gds(fileGDS) - withr::defer(closefn.gds(gdsF), envir=parent.frame()) - - dataRefDemo <- data.frame(sample.id=c("SampleA", "SampleB", "SampleC", "SampleD"), - pop.group=c("TSI", "TSI", "YRI", "YRI"), - superPop=c("EUR", "EUR", "AFR", "AFR")) - - error_message <- "The \'studyIDSyn\' parameter must be a character string." - - expect_error(computePoolSyntheticAncestry(gdsReference=gdsF, gdsSample=gdsF, - profileID="test", dataRef=dataRefDemo, spRef="TODO", - studyIDSyn=33, - np=1L, listCatPop="EUR", fieldPopIn1KG="SuperPop", - fieldPopInfAnc="SuperPop", kList=seq(2, 15, 1), - pcaList=seq(2, 15, 1), algorithm="exact", - eigenCount=32L, missingRate=0.025), error_message) -}) - - -test_that("computePoolSyntheticAncestry() must return error when fieldPopIn1KG is numeric value", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - gdsF <- openfn.gds(fileGDS) - withr::defer(closefn.gds(gdsF), envir=parent.frame()) - - dataRefDemo <- data.frame(sample.id=c("SampleA", "SampleB", "SampleC", "SampleD"), - pop.group=c("TSI", "TSI", "YRI", "YRI"), - superPop=c("EUR", "EUR", "AFR", "AFR")) - - error_message <- "The \'fieldPopIn1KG\' parameter must be a character string." - - expect_error(computePoolSyntheticAncestry(gdsReference=gdsF, gdsSample=gdsF, - profileID="test", dataRef=dataRefDemo, spRef="TODO", - studyIDSyn="synthetic", - np=1L, listCatPop="EUR", fieldPopIn1KG=33, - fieldPopInfAnc="SuperPop", kList=seq(2, 15, 1), - pcaList=seq(2, 15, 1), algorithm="exact", - eigenCount=32L, missingRate=0.025), error_message) -}) - - -############################################################################# -### Tests addPhase1KG2SampleGDSFromFile() results -############################################################################# - -context("addPhase1KG2SampleGDSFromFile() results") - - -test_that("addPhase1KG2SampleGDSFromFile() must return error when gdsReference is character string", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - - error_message <- "The \'gdsReference\' must be an object of class \'gds.class\'" - - expect_error(addPhase1KG2SampleGDSFromFile(gdsReference=fileGDS, - pathProfileGDS=test_path("fixtures"), pathGenotest_path("fixtures"), - fileSNPsRDS="test", verbose="CANADA"), error_message, fixed=TRUE) -}) - - -test_that("addPhase1KG2SampleGDSFromFile() must return error when verbose is character string", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - gdsF <- openfn.gds(fileGDS) - withr::defer(closefn.gds(gdsF), envir=parent.frame()) - - error_message <- paste0("The \'verbose\' parameter must be a ", - "logical (TRUE or FALSE).") - - expect_error(addPhase1KG2SampleGDSFromFile(gdsReference=gdsF, - pathProfileGDS=test_path("fixtures"), pathGenotest_path("fixtures"), - fileSNPsRDS="test", verbose="CANADA"), error_message, fixed=TRUE) -}) - - -############################################################################# -### Tests computePrunedPCARef() results -############################################################################# - -context("computePrunedPCARef() results") - - -test_that("computePrunedPCARef() must return error when gdsProfile is character string", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - - error_message <- "The \'gdsProfile\' must be an object of class \'gds.class\'" - - expect_error(computePrunedPCARef(gdsProfile=fileGDS, - listRef=c("sample1", "sample2"), np=1L, verbose=FALSE), - error_message, fixed=TRUE) -}) - - -test_that("computePrunedPCARef() must return error when np is character string", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - gdsF <- openfn.gds(fileGDS) - withr::defer(closefn.gds(gdsF), envir=parent.frame()) - - error_message <- "The \'np\' parameter must be a single positive integer." - - expect_error(computePrunedPCARef(gdsProfile=gdsF, - listRef=c("sample1", "sample2"), np="1", verbose=FALSE), - error_message, fixed=TRUE) -}) - - -test_that("computePrunedPCARef() must return error when verbose is character string", { - - fileGDS <- test_path("fixtures", "1KG_Test.gds") - gdsF <- openfn.gds(fileGDS) - withr::defer(closefn.gds(gdsF), envir=parent.frame()) - - error_message <- "The \'verbose\' parameter must be logical (TRUE or FALSE)." - - expect_error(computePrunedPCARef(gdsProfile=gdsF, - listRef=c("sample1", "sample2"), np=1L, verbose="GLUTEN"), - error_message, fixed=TRUE) -}) - - ############################################################################# ### Tests runExomeAncestry() results ############################################################################# @@ -2426,7 +1939,7 @@ test_that("runExomeAncestry() must return error when pathOut is numeric", { fileGDS <- file.path(pathOut, "ex1_good_small_1KG_GDS.gds") gdsFileAnnot <- file.path(pathOut, "ex1_good_small_1KG_Annot_GDS.gds") - chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L) + chromosome <- c(956422L, 193529L, 295559L, 214555L) studyDF <- data.frame(study.id="MYDATA", study.desc="Description", study.platform="PLATFORM", stringsAsFactors=FALSE) @@ -2448,7 +1961,7 @@ test_that("runExomeAncestry() must return error when pathOut is numeric", { expect_error(runExomeAncestry(pedStudy=ped, studyDF=studyDF, pathProfileGDS=pathOut, pathGeno=pathOut, pathOut=33, fileReferenceGDS=fileGDS, - fileReferenceAnnotGDS=gdsFileAnnot, chrInfo=chrInfo, + fileReferenceAnnotGDS=gdsFileAnnot, chrInfo=chromosome, syntheticRefDF=syntheticRefDF, genoSource="snp-pileup"), error_message) }) @@ -2460,7 +1973,7 @@ test_that("runExomeAncestry() must return error when fileReferenceGDS is numeric fileGDS <- file.path(pathOut, "ex1_good_small_1KG_GDS.gds") gdsFileAnnot <- file.path(pathOut, "ex1_good_small_1KG_Annot_GDS.gds") - chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L) + chromosome <- c(956422L, 193529L, 295559L, 214555L) studyDF <- data.frame(study.id="MYDATA", study.desc="Description", study.platform="PLATFORM", stringsAsFactors=FALSE) @@ -2482,7 +1995,7 @@ test_that("runExomeAncestry() must return error when fileReferenceGDS is numeric expect_error(runExomeAncestry(pedStudy=ped, studyDF=studyDF, pathProfileGDS=pathOut, pathGeno=pathOut, pathOut=pathOut, fileReferenceGDS=33, fileReferenceAnnotGDS=gdsFileAnnot, - chrInfo=chrInfo, syntheticRefDF, + chrInfo=chromosome, syntheticRefDF, genoSource="snp-pileup"), error_message) }) @@ -2493,7 +2006,7 @@ test_that("runExomeAncestry() must return error when fileReferenceAnnotGDS is nu fileGDS <- file.path(pathOut, "ex1_good_small_1KG_GDS.gds") gdsFileAnnot <- file.path(pathOut, "ex1_good_small_1KG_Annot_GDS.gds") - chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L) + chromosome <- c(956422L, 193529L, 295559L, 214555L) studyDF <- data.frame(study.id="MYDATA", study.desc="Description", study.platform="PLATFORM", stringsAsFactors=FALSE) @@ -2516,7 +2029,7 @@ test_that("runExomeAncestry() must return error when fileReferenceAnnotGDS is nu expect_error(runExomeAncestry(pedStudy=ped, studyDF=studyDF, pathProfileGDS=pathOut, pathGeno=pathOut, pathOut=pathOut, fileReferenceGDS=fileGDS, fileReferenceAnnotGDS=32, - chrInfo=chrInfo, syntheticRefDF=syntheticRefDF, + chrInfo=chromosome, syntheticRefDF=syntheticRefDF, genoSource="snp-pileup"), error_message) }) diff --git a/tests/testthat/test-processStudy_internal.R b/tests/testthat/test-processStudy_internal.R index d89c8acd1..1c24553db 100644 --- a/tests/testthat/test-processStudy_internal.R +++ b/tests/testthat/test-processStudy_internal.R @@ -152,32 +152,6 @@ test_that("validateComputePCARefSample() must return expected results when all i }) -############################################################################# -### Tests validateAppendStudy2GDS1KG() results -############################################################################# - -context("validateAppendStudy2GDS1KG() results") - - -test_that("validateAppendStudy2GDS1KG() must return expected results when all input are valid", { - - dataDir <- test_path("fixtures") - fileGDS <- file.path(dataDir, "GDS_Sample_with_study_demo.gds") - rdsFile <- file.path(dataDir, "mapSNVSelected_Demo.rds") - - studyInfo <- data.frame(study.id="Pancreatic.WES", - study.desc="Pancreatic study", study.platform="WES", - stringsAsFactors=FALSE) - - result1 <- RAIDS:::validateAppendStudy2GDS1KG(pathGeno=test_path("fixtures"), - filePedRDS=rdsFile, fileNameGDS=fileGDS, - batch=1L, studyDF=studyInfo, listSamples=c("HC01", "HC02"), - pathProfileGDS=test_path("fixtures"), genoSource="snp-pileup", - verbose=TRUE) - - expect_identical(result1, 0L) -}) - ############################################################################# ### Tests validateAdd1KG2SampleGDS() results @@ -208,13 +182,13 @@ test_that("validateAdd1KG2SampleGDS() must return expected results when all inpu ############################################################################# -### Tests validateRunExomeAncestry() results +### Tests validateRunExomeOrRNAAncestry() results ############################################################################# -context("validateRunExomeAncestry() results") +context("validateRunExomeOrRNAAncestry() results") -test_that("validateRunExomeAncestry() must return expected results when all input are valid", { +test_that("validateRunExomeOrRNAAncestry() must return expected results when all input are valid", { dataDir <- test_path("fixtures") gdsRefFile <- file.path(dataDir, "ex1_good_small_1KG_GDS.gds") @@ -235,12 +209,13 @@ test_that("validateRunExomeAncestry() must return expected results when all inpu "FIN", "FIN"), superPop=c("EUR", "EUR", "EUR", "EUR"), stringsAsFactors=FALSE) - chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L) + chromosome <- c(956422L, 193529L, 295559L, 214555L) - result <- RAIDS:::validateRunExomeAncestry(pedStudy=ped, studyDF=studyInfo, + result <- RAIDS:::validateRunExomeOrRNAAncestry(pedStudy=ped, + studyDF=studyInfo, pathProfileGDS=dataDir, pathGeno=dataDir, pathOut=dataDir, fileReferenceGDS=gdsRefFile, fileReferenceAnnotGDS=gdsRefAnnotFile, - chrInfo=chrInfo, syntheticRefDF=syntheticRefDF, + chrInfo=chromosome, syntheticRefDF=syntheticRefDF, genoSource="snp-pileup", verbose=TRUE) expect_identical(result, 0L) @@ -329,11 +304,11 @@ test_that("validateEstimateAllelicFraction() must return expected results when a gdsSample <- openfn.gds(fileProfileGDS) withr::defer((gdsfmt::closefn.gds(gdsSample)), envir = parent.frame()) - chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L) + chromosome <- c(956422L, 193529L, 295559L, 214555L) result <- RAIDS:::validateEstimateAllelicFraction(gdsReference=gdsRef, gdsProfile=gdsSample, currentProfile="ex1", studyID="MYDATA", - chrInfo=chrInfo, studyType="DNA", minCov=10L, minProb=0.999, + chrInfo=chromosome, studyType="DNA", minCov=10L, minProb=0.999, eProb=0.001, cutOffLOH=-5, cutOffHomoScore=-3, wAR=9, cutOffAR=3, gdsRefAnnot=NULL, blockID=NULL, verbose=FALSE) diff --git a/tests/testthat/test-synthetic.R b/tests/testthat/test-synthetic.R index 9d9136727..8bfa72f2b 100644 --- a/tests/testthat/test-synthetic.R +++ b/tests/testthat/test-synthetic.R @@ -25,9 +25,7 @@ test_that("select1KGPop() must return error when gdsReference is a character str test_that("select1KGPop() must return error when nbProfiles is a character string", { - dataDir <- system.file("extdata/tests", package="RAIDS") - - fileGDS <- file.path(dataDir, "1KG_Test.gds") + fileGDS <- test_path("fixtures", "1KG_Test.gds") gdsF <- openfn.gds(fileGDS) withr::defer((gdsfmt::closefn.gds(gdsF)), envir=parent.frame()) @@ -42,17 +40,15 @@ test_that("select1KGPop() must return error when nbProfiles is a character strin test_that("select1KGPop() must return expected result", { - dataDir <- system.file("extdata/tests", package="RAIDS") - - fileGDS <- file.path(dataDir, "1KG_Test_02.gds") + fileGDS <- test_path("fixtures", "1KG_Test.gds") gdsF <- openfn.gds(fileGDS) withr::defer((gdsfmt::closefn.gds(gdsF)), envir=parent.frame()) - expected <- data.frame(sample.id=c("HG00101", "HG00097", - "HG00096", "HG00100"), - pop.group=c("GBR", "GBR", "GBR", "GBR"), - superPop=rep("EUR", 4)) + expected <- data.frame(sample.id=c("HG00109", "HG00108", + "HG00104", "HG00103"), + pop.group=rep("ACB", 4), + superPop=rep("AFR", 4)) set.seed(1212) results <- select1KGPop(gdsReference=gdsF, nbProfiles=4L) @@ -136,7 +132,7 @@ context("syntheticGeno() results") test_that("syntheticGeno() must return error when gds is a numeric value", { - dataDir <- system.file("extdata/tests", package="RAIDS") + dataDir <- test_path("fixtures") fileGDS <- file.path(dataDir, "1KG_Test.gds") gdsF <- openfn.gds(fileGDS) @@ -154,7 +150,7 @@ test_that("syntheticGeno() must return error when gds is a numeric value", { test_that("syntheticGeno() must return error when gdsRefAnnot is a numeric value", { - dataDir <- system.file("extdata/tests", package="RAIDS") + dataDir <- test_path("fixtures") fileGDS <- file.path(dataDir, "1KG_Test.gds") gdsF <- openfn.gds(fileGDS) @@ -172,7 +168,7 @@ test_that("syntheticGeno() must return error when gdsRefAnnot is a numeric value test_that("syntheticGeno() must return error when fileProfileGDS is a numeric value", { - dataDir <- system.file("extdata/tests", package="RAIDS") + dataDir <- test_path("fixtures") fileGDS <- file.path(dataDir, "1KG_Test.gds") gdsF <- openfn.gds(fileGDS) @@ -191,7 +187,7 @@ test_that("syntheticGeno() must return error when fileProfileGDS is a numeric va test_that("syntheticGeno() must return error when profileID is a numeric value", { - dataDir <- system.file("extdata/tests", package="RAIDS") + dataDir <- test_path("fixtures") fileGDS <- file.path(dataDir, "1KG_Test.gds") gdsF <- openfn.gds(fileGDS) @@ -209,7 +205,7 @@ test_that("syntheticGeno() must return error when profileID is a numeric value", test_that("syntheticGeno() must return error when listSampleRef is a numeric value", { - dataDir <- system.file("extdata/tests", package="RAIDS") + dataDir <- test_path("fixtures") fileGDS <- file.path(dataDir, "1KG_Test.gds") gdsF <- openfn.gds(fileGDS) @@ -226,7 +222,7 @@ test_that("syntheticGeno() must return error when listSampleRef is a numeric val test_that("syntheticGeno() must return error when profileID is a vector of strings", { - dataDir <- system.file("extdata/tests", package="RAIDS") + dataDir <- test_path("fixtures") fileGDS <- file.path(dataDir, "1KG_Test.gds") gdsF <- openfn.gds(fileGDS) @@ -244,7 +240,7 @@ test_that("syntheticGeno() must return error when profileID is a vector of strin test_that("syntheticGeno() must return error when nbSim is a character string", { - dataDir <- system.file("extdata/tests", package="RAIDS") + dataDir <- test_path("fixtures") fileGDS <- file.path(dataDir, "1KG_Test.gds") gdsF <- openfn.gds(fileGDS) @@ -264,7 +260,7 @@ test_that("syntheticGeno() must return error when nbSim is a character string", test_that(paste0("syntheticGeno() must return error when prefId is a vector ", "of character strings"), { - dataDir <- system.file("extdata/tests", package="RAIDS") + dataDir <- test_path("fixtures") fileGDS <- file.path(dataDir, "1KG_Test.gds") gdsF <- openfn.gds(fileGDS) @@ -284,7 +280,7 @@ test_that(paste0("syntheticGeno() must return error when prefId is a vector ", test_that("syntheticGeno() must return error when prefix is a numeric value", { - dataDir <- system.file("extdata/tests", package="RAIDS") + dataDir <- test_path("fixtures") fileGDS <- file.path(dataDir, "1KG_Test.gds") gdsF <- openfn.gds(fileGDS) @@ -304,7 +300,7 @@ test_that("syntheticGeno() must return error when prefix is a numeric value", { test_that(paste0("syntheticGeno() must return error when pRecomb is a character string"), { - dataDir <- system.file("extdata/tests", package="RAIDS") + dataDir <- test_path("fixtures") fileGDS <- file.path(dataDir, "1KG_Test.gds") gdsF <- openfn.gds(fileGDS) @@ -324,7 +320,7 @@ test_that(paste0("syntheticGeno() must return error when pRecomb is a character test_that(paste0("syntheticGeno() must return error when seqError is a ", "character string"), { - dataDir <- system.file("extdata/tests", package="RAIDS") + dataDir <- test_path("fixtures") fileGDS <- file.path(dataDir, "1KG_Test.gds") gdsF <- openfn.gds(fileGDS) @@ -345,7 +341,7 @@ test_that(paste0("syntheticGeno() must return error when seqError is a ", test_that(paste0("syntheticGeno() must return error when minProb is a ", "character string"), { - dataDir <- system.file("extdata/tests", package="RAIDS") + dataDir <- test_path("fixtures") fileGDS <- file.path(dataDir, "1KG_Test.gds") gdsF <- openfn.gds(fileGDS) @@ -361,7 +357,7 @@ test_that(paste0("syntheticGeno() must return error when minProb is a ", }) -test_that(paste0("syntheticGeno() must return error expected results"), { +test_that(paste0("syntheticGeno() must return expected results"), { set.seed(121) @@ -415,6 +411,79 @@ test_that(paste0("syntheticGeno() must return error expected results"), { rep(0, 27))) }) + + +test_that(paste0("syntheticGeno() must return expected results when nbSim=3"), { + + set.seed(121) + + dataDirSample <- test_path("fixtures/sampleGDSforEstimAlleFraction") + file.copy(file.path(dataDirSample, "ex1_demoForEstimAllFrac.gds"), + file.path(dataDirSample, "ex1.gds")) + withr::defer((unlink(file.path(dataDirSample, "ex1.gds"))), + envir=parent.frame()) + + dataDirRef <- test_path("fixtures") + + ## Open 1KG Annotation GDS file + fileRefAnnot <- file.path(dataDirRef, "ex1_good_small_1KG_Annot_GDS.gds") + gdsRefAnnot <- openfn.gds(fileRefAnnot) + withr::defer((closefn.gds(gdsRefAnnot)), envir=parent.frame()) + + ## Open 1KG GDS file + gds1KG <- snpgdsOpen(file.path(dataDirRef, "ex1_good_small_1KG_GDS.gds")) + withr::defer((closefn.gds(gds1KG)), envir=parent.frame()) + + synthStudyDF <- data.frame(study.id="MYDATA.Synthetic", + study.desc="MYDATA synthetic data", + study.platform="PLATFORM", + stringsAsFactors=FALSE) + + result1 <- prepSynthetic(fileProfileGDS=file.path(dataDirSample, "ex1.gds"), + listSampleRef=c("HG00243", "HG00149"), profileID="ex1", + studyDF=synthStudyDF, nbSim=3L, prefix="test2", verbose=FALSE) + + result2 <- syntheticGeno(gdsReference=gds1KG, gdsRefAnnot=gdsRefAnnot, + fileProfileGDS=file.path(dataDirSample, "ex1.gds"), profileID="ex1", + listSampleRef=c("HG00243", "HG00149"), nbSim=3, prefix="test2", + pRecomb=0.01, minProb=0.999, seqError=0.001) + + expect_equal(result2, 0L) + + profileGDS <- openfn.gds(file.path(dataDirSample, "ex1.gds")) + withr::defer((closefn.gds(profileGDS)), envir=parent.frame()) + + sampleList <- read.gdsn(index.gdsn(profileGDS, "sample.id")) + + expect_equal(sampleList[158:163], + c("test2.ex1.HG00243.1", "test2.ex1.HG00243.2", "test2.ex1.HG00243.3", + "test2.ex1.HG00149.1", "test2.ex1.HG00149.2", "test2.ex1.HG00149.3")) + + genotype <- read.gdsn(index.gdsn(profileGDS, "genotype")) + + expect_equal(genotype[,158], c(rep(0, 16), 2, 0, 0, 1, rep(0, 13), 2, + rep(0, 27))) + + expect_equal(genotype[,159], c(rep(0, 16), 2, 0, 0, 1, rep(0, 13), + 2, rep(0, 16), 3, rep(0, 9), 3)) + + expect_equal(genotype[,160], c(rep(0, 8), 3, 0, 3, rep(0, 5), 2, 0, 0, 1, + 3, rep(0, 12), 2, rep(0, 27))) + + expect_equal(genotype[,161], c(3, rep(0, 6), 3, 0, 2, rep(0, 5), 1, 2, + rep(0, 13), 1, 0, 0, 2, + rep(0, 13), 1, rep(0, 13))) + + expect_equal(genotype[,162], c(rep(0, 8), 3, 2, rep(0, 5), 1, 2, + rep(0, 13), 1, 0, 0, 2, + rep(0, 4), 3, rep(0, 8), 1, rep(0, 13))) + + expect_equal(genotype[,163], c(rep(0, 9), 2, rep(0, 5), 1, 2, + rep(0, 13), 1, + rep(0, 2), 2, rep(0, 13), 1, rep(0, 13))) +}) + + ############################################################################# ### Tests prepSynthetic() results ############################################################################# @@ -436,7 +505,7 @@ test_that(paste0("prepSynthetic() must return error when fileProfileGDS is", test_that(paste0("prepSynthetic() must return error when profileID is numeric"), { - dataDir <- system.file("extdata/tests", package="RAIDS") + dataDir <- test_path("fixtures") fileGDS <- file.path(dataDir, "1KG_Test.gds") studyDF <- data.frame(study.id="Id of the study", @@ -454,7 +523,7 @@ test_that(paste0("prepSynthetic() must return error when profileID is numeric"), test_that(paste0("prepSynthetic() must return error when nbSim is ", "a character string"), { - dataDir <- system.file("extdata/tests", package="RAIDS") + dataDir <- test_path("fixtures") fileGDS <- file.path(dataDir, "1KG_Test.gds") studyDF <- data.frame(study.id="Id of the study", @@ -471,7 +540,7 @@ test_that(paste0("prepSynthetic() must return error when nbSim is ", test_that(paste0("prepSynthetic() must return error when listSampleRef is vector of numerics"), { - dataDir <- system.file("extdata/tests", package="RAIDS") + dataDir <- test_path("fixtures") fileGDS <- file.path(dataDir, "1KG_Test.gds") error_message <- paste0("The \'listSampleRef\' must be a vector of ", @@ -485,7 +554,7 @@ test_that(paste0("prepSynthetic() must return error when listSampleRef is vector test_that(paste0("prepSynthetic() must return error when studyDF is missing mandatory column"), { - dataDir <- system.file("extdata/tests", package="RAIDS") + dataDir <- test_path("fixtures") fileGDS <- file.path(dataDir, "1KG_Test.gds") studyDF <- data.frame(study.id="Id of the study", @@ -502,7 +571,7 @@ test_that(paste0("prepSynthetic() must return error when studyDF is missing mand test_that(paste0("prepSynthetic() must return error when prefix is numeric"), { - dataDir <- system.file("extdata/tests", package="RAIDS") + dataDir <- test_path("fixtures") fileGDS <- file.path(dataDir, "1KG_Test.gds") studyDF <- data.frame(study.id="Id of the study", @@ -519,7 +588,7 @@ test_that(paste0("prepSynthetic() must return error when prefix is numeric"), { test_that(paste0("prepSynthetic() must return error when verbose is numeric"), { - dataDir <- system.file("extdata/tests", package="RAIDS") + dataDir <- test_path("fixtures") fileGDS <- file.path(dataDir, "1KG_Test.gds") studyDF <- data.frame(study.id="Id of the study", diff --git a/tests/testthat/test-tools.R b/tests/testthat/test-tools.R index f5a99df6d..9745a652a 100644 --- a/tests/testthat/test-tools.R +++ b/tests/testthat/test-tools.R @@ -4,8 +4,6 @@ library(RAIDS) library(withr) library(testthat) - - ############################################################################# ### Tests snvListVCF() results ############################################################################# @@ -43,7 +41,7 @@ test_that("snvListVCF() must return error when offset is a character string", { error_message <- "The \'offset\' must be a single integer." - expect_error(snvListVCF(gdsReference=gds, fileOUT, offset="HELLO", + expect_error(snvListVCF(gdsReference=gds, fileOut, offset="HELLO", freqCutoff=NULL), error_message) closefn.gds(gds) @@ -56,11 +54,11 @@ test_that("snvListVCF() must return error when gds is a character string", { data.dir <- system.file("extdata", package="RAIDS") - fileOUT <- file.path(data.dir, "VCF_TEMP.vcf") + fileOut <- file.path(data.dir, "VCF_TEMP.vcf") error_message <- "The \'gdsReference\' must be an object of class \'gds.class\'." - expect_error(snvListVCF(gdsReference="welcome.txt", fileOUT=fileOUT, offset=0L, + expect_error(snvListVCF(gdsReference="welcome.txt", fileOut=fileOut, offset=0L, freqCutoff=NULL), error_message) }) @@ -94,11 +92,11 @@ test_that("snvListVCF() must return error when freqCutoff is a character string" gds <- snpgdsOpen(fileGDS) withr::defer(closefn.gds(gds), envir = parent.frame()) - fileOUT <- file.path(data.dir, "VCF_TEMP.vcf") + fileOut <- file.path(data.dir, "VCF_TEMP.vcf") error_message <- "The \'freqCutoff\' must be a single numeric or NULL." - expect_error(snvListVCF(gdsReference=gds, fileOUT=fileOUT, offset=0L, + expect_error(snvListVCF(gdsReference=gds, fileOut=fileOut, offset=0L, freqCutoff="BED"), error_message) }) @@ -112,16 +110,16 @@ test_that("snvListVCF() must return expected results when freqCutoff is NULL", { gds <- openfn.gds(fileGDS) withr::defer(closefn.gds(gds), envir=parent.frame()) - fileOUT <- file.path(data.dir, "VCF_TEMP_01.vcf") - withr::defer(unlink(fileOUT, force=TRUE), envir=parent.frame()) + fileOut <- file.path(data.dir, "VCF_TEMP_01.vcf") + withr::defer(unlink(fileOut, force=TRUE), envir=parent.frame()) - result1 <- suppressWarnings(snvListVCF(gdsReference=gds, fileOUT=fileOUT, offset=0L, - freqCutoff=NULL)) + result1 <- suppressWarnings(snvListVCF(gdsReference=gds, fileOut=fileOut, + offset=0L, freqCutoff=NULL)) ## Read two times the vcf file, ## First for the columns names, second for the data - tmp_vcf <- readLines(fileOUT) - tmp_vcf_data <- read.table(fileOUT, stringsAsFactors=FALSE) + tmp_vcf <- readLines(fileOut) + tmp_vcf_data <- read.table(fileOut, stringsAsFactors=FALSE) # filter for the columns names tmp_vcf <- tmp_vcf[-(grep("#CHROM", tmp_vcf)+1):-(length(tmp_vcf))] @@ -130,7 +128,7 @@ test_that("snvListVCF() must return expected results when freqCutoff is NULL", { expect_equal(result1, 0L) - expect_true(file.exists(fileOUT)) + expect_true(file.exists(fileOut)) expect_equal(nrow(tmp_vcf_data), 7) expect_equal(ncol(tmp_vcf_data), 8) }) @@ -145,16 +143,16 @@ test_that("snvListVCF() must return expected results when freqCutoff is 0.3", { gds <- openfn.gds(fileGDS) withr::defer(closefn.gds(gds), envir=parent.frame()) - fileOUT <- file.path(data.dir, "VCF_TEMP_02.vcf") - withr::defer(unlink(fileOUT, force=TRUE), envir=parent.frame()) + fileOut <- file.path(data.dir, "VCF_TEMP_02.vcf") + withr::defer(unlink(fileOut, force=TRUE), envir=parent.frame()) - result1 <- suppressWarnings(snvListVCF(gdsReference=gds, fileOUT=fileOUT, offset=0L, + result1 <- suppressWarnings(snvListVCF(gdsReference=gds, fileOut=fileOut, offset=0L, freqCutoff=0.3)) ## Read two times the vcf file, ## First for the columns names, second for the data - tmp_vcf <- readLines(fileOUT) - tmp_vcf_data <- read.table(fileOUT, stringsAsFactors=FALSE) + tmp_vcf <- readLines(fileOut) + tmp_vcf_data <- read.table(fileOut, stringsAsFactors=FALSE) # filter for the columns names tmp_vcf <- tmp_vcf[-(grep("#CHROM",tmp_vcf)+1):-(length(tmp_vcf))] @@ -163,7 +161,7 @@ test_that("snvListVCF() must return expected results when freqCutoff is 0.3", { expect_equal(result1, 0L) - expect_true(file.exists(fileOUT)) + expect_true(file.exists(fileOut)) expect_equal(nrow(tmp_vcf_data), 2) expect_equal(ncol(tmp_vcf_data), 8) }) @@ -202,3 +200,52 @@ test_that("groupChr1KGSNV() must return error when pathOut does not exist", { pathOut=dirNotExisting), error_message) }) + +test_that("groupChr1KGSNV() must return expected results", { + + dataDir <- test_path("fixtures/demoGenoChr") + + dirTMP <- withr::local_tempdir(tmpdir="test_groupChr1KGSNV") + + result <- groupChr1KGSNV(pathGenoChr=dataDir, pathOut=dirTMP) + + expect_equal(result, 0L) + + result2 <- list.files(dirTMP) + + expect_true(all(c("NA12003.csv.bz2", "NA12004.csv.bz2", + "NA12005.csv.bz2", "NA12006.csv.bz2") %in% result2)) + + geno1 <- read.csv2(file.path(dirTMP, "NA12003.csv.bz2"), + sep="\t", row.names=NULL) + + expect_equal(nrow(geno1), 2178L) + expect_equal(colnames(geno1), "NA12003") + expect_equal(geno1[2,], "0|0") + expect_equal(geno1[3,], "0|0") + expect_equal(geno1[444,], "0|0") + expect_equal(geno1[2177,], "0|1") + expect_equal(geno1[2082,], "1|1") + + geno2 <- read.csv2(file.path(dirTMP, "NA12006.csv.bz2"), + sep="\t", row.names=NULL) + + expect_equal(nrow(geno2), 2178L) + expect_equal(colnames(geno2), "NA12006") + expect_equal(geno2[2,], "0|0") + expect_equal(geno2[3,], "0|1") + expect_equal(geno2[444,], "0|0") + expect_equal(geno2[2177,], "0|0") + expect_equal(geno2[2082,], "1|1") + + geno3 <- read.csv2(file.path(dirTMP, "NA12005.csv.bz2"), + sep="\t", row.names=NULL) + + expect_equal(nrow(geno3), 2178L) + expect_equal(colnames(geno3), "NA12005") + expect_equal(geno3[2,], "1|0") + expect_equal(geno3[3,], "0|0") + expect_equal(geno3[444,], "0|0") + expect_equal(geno3[2177,], "0|0") + expect_equal(geno3[2082,], "1|1") +}) diff --git a/tests/testthat/test-tools_internal.R b/tests/testthat/test-tools_internal.R index cdc6eb9da..78b190b68 100644 --- a/tests/testthat/test-tools_internal.R +++ b/tests/testthat/test-tools_internal.R @@ -3,7 +3,7 @@ library(RAIDS) library(withr) library(gdsfmt) - +library(Rsamtools) ############################################################################# ### Tests validateGDSClass() results @@ -28,9 +28,9 @@ test_that("validateGDSClass() must return expected results when all input are va test_that("validateGDSClass() must return error when input is not valid", { - expected <- "The \'todo_01\' must be an object of class \'gds.class\'." + expected <- "The \'tudo_01\' must be an object of class \'gds.class\'." - expect_error(RAIDS:::validateGDSClass(gds="toto.gds", name="todo_01"), + expect_error(RAIDS:::validateGDSClass(gds="toto.gds", name="tudo_01"), expected, fixed=TRUE) }) @@ -136,8 +136,218 @@ context("validatePositiveIntegerVector() results") test_that("validatePositiveIntegerVector() must return expected results when all input are valid", { - result1 <- RAIDS:::validatePositiveIntegerVector(value=c(1,2,3), name="parameter_01") + result1 <- RAIDS:::validatePositiveIntegerVector(value=c(1,2,3), + name="parameter_01") expect_identical(result1, 0L) }) + +############################################################################# +### Tests extractNucleotide() results +############################################################################# + +context("extractNucleotide() results") + + +test_that("extractNucleotide() must return expected results when all input are valid", { + + nuc <- c("A", "G", "C", "T") + cnt <- c(100, 200, 4, 32) + result1 <- RAIDS:::extractNucleotide(nucleotide=nuc, count=cnt, + curNucleo="C") + + expect_identical(result1, 4) +}) + + +test_that("extractNucleotide() must return expected results when nucleotide not present in the input", { + + nuc <- c("A", "G", "T") + cnt <- c(100, 200, 32) + result1 <- RAIDS:::extractNucleotide(nucleotide=nuc, count=cnt, + curNucleo="C") + + expect_identical(result1, 0) +}) + + +############################################################################# +### Tests processPileupChrBin() results +############################################################################# + +context("processPileupChrBin() results") + + +test_that("processPileupChrBin() must return NULL when chromosome absent", { + + varDf <- c("chr1"=c(1,2,3), "chr2"=c(1,3,5)) + result1 <- RAIDS:::processPileupChrBin(chr="chr3", resPileup=NULL, + varDf=varDf, verbose=FALSE) + + expect_identical(result1, NULL) +}) + + +############################################################################# +### Tests readSNVPileupFile() results +############################################################################# + +context("readSNVPileupFile") + + +test_that("readSNVPileupFile() must return error when file does not content expected columns", { + + data.dir <- test_path("fixtures") + fileTxt <- file.path(data.dir, "ex1.txt.gz") + + result1 <- RAIDS:::readSNVPileupFile(fileName=fileTxt, offset=0L) + + tmpF <- tempfile("ex1_notGood", fileext = c(".txt")) + + withr::defer(if(file.exists(tmpF)) unlink(tmpF), envir=parent.frame()) + + write.table(result1[1:10, 1:3], file = tmpF) + + error_message <- paste0("The SNP-pileup file must contain all those ", + "columns: \'Chromosome\', \'Position\', \'Ref\', \'Alt\',", + " \'File1R\', \'File1A\', \'File1E\', \'File1D\'.") + + expect_error(RAIDS:::readSNVPileupFile(fileName=tmpF, offset=0L), + error_message) +}) + + +############################################################################# +### Tests readSNVFileGeneric() results +############################################################################# + +context("readSNVFileGeneric() results") + + +test_that("readSNVFileGeneric() must return error when file does not content expected columns", { + + data.dir <- test_path("fixtures") + fileTxt <- file.path(data.dir, "ex1.txt.gz") + + result1 <- RAIDS:::readSNVPileupFile(fileName=fileTxt, offset=0L) + + tmpF2 <- tempfile("ex1_notGood", fileext = c(".txt")) + + withr::defer(if(file.exists(tmpF2)) unlink(tmpF2), envir=parent.frame()) + + write.table(result1[1:10, 1:4], file = tmpF2) + + error_message <- paste0("The generic SNP pileup file must contain all ", + "those columns: \'Chromosome\', \'Position\', \'Ref\', \'Alt\', ", + "\'File1R\', \'File1A\', \'Count\'.") + + expect_error(RAIDS:::readSNVFileGeneric(fileName=tmpF2, offset=0L), + error_message) +}) + + +test_that("readSNVFileGeneric() must return expected value when all parameters are valid", { + + data.dir <- test_path("fixtures") + fileTxt <- file.path(data.dir, "ex1.generic.txt.gz") + + result1 <- RAIDS:::readSNVFileGeneric(fileName=fileTxt, offset=0L) + + expect_equal(ncol(result1), 7) + expect_equal(nrow(result1), 50) + expect_equal(colnames(result1), c("Chromosome", "Position", "Ref", "Alt", + "File1R", "File1A", "count")) + +}) + + +############################################################################# +### Tests readSNVVCF() results +############################################################################# + +context("readSNVVCF() results") + + +test_that("readSNVVCF() must return expected results", { + + data.dir <- test_path("fixtures") + fileTxt <- file.path(data.dir, "ex1_small.vcf.gz") + + result1 <- RAIDS:::readSNVVCF(fileName=fileTxt, profileName=NULL, offset=0L) + + expect_equal(ncol(result1), 7) + expect_equal(nrow(result1), 60) + expect_equal(colnames(result1), c("Chromosome", "Position", "Ref", "Alt", + "File1R", "File1A", "count")) +}) + + +############################################################################# +### Tests readSNVVCF() results +############################################################################# + +context("readSNVBAM() results") + + +test_that("readSNVBAM() must return expected results", { + + fileTxt <- system.file("extdata", "no_which_buffered_pileup.bam", + package="Rsamtools", mustWork=TRUE) + + result1 <- RAIDS:::readSNVBAM(fileTxt, varSelected=data.frame(chr=c(1,1), + start=c(3,5), REF=c("A", "A"), ALT=c("C", "C"))) + + expect_equal(ncol(result1), 11) + expect_equal(nrow(result1), 2) + expect_equal(colnames(result1), c("Chromosome", "Position", "Ref", "Alt", + "File1R", "File1A", "count", "A", "C", "G", "T")) +}) + + + +############################################################################# +### Tests processBlockChr() results +############################################################################# + +context("processBlockChr() results") + + +test_that("processBlockChr() must return expected results", { + + fileGDS <- test_path("fixtures", "1KG_Test.gds") + + fileLdBlock <- test_path("fixtures", "ThisFileDoesntExist.txt") + + error_message <- paste0("The \'fileBlock\' must be a character string ", + "representing the file .det from plink block result. ", + "The file must exist.") + + expect_error(RAIDS:::processBlockChr(fileReferenceGDS=fileGDS, + fileBlock=fileLdBlock), error_message) +}) + + +test_that("processBlockChr() must return expected results", { + + fileGDS <- test_path("fixtures", "1KG_Test.gds") + + blockChr <- data.frame(CHR=c(rep(1, 4)), BP1=c(51897, 54707, 55544, 61986), + BP2=c(51927, 54715, 59039, 66506), KB=c(0.031, 0.009, 3.496, 4.521), + NSNPS=c(2, 2, 2, 3), SNPS=c("s3|s4", "s6|s7", "s14|s15", + "s17|s18|s31"), stringsAsFactors=FALSE) + + + tmpF3 <- tempfile("block.DEMO", fileext = c(".det")) + + withr::defer(if(file.exists(tmpF3)) unlink(tmpF3), envir=parent.frame()) + + write.table(x=blockChr, file=tmpF3, sep=" ") + + results <- RAIDS:::processBlockChr(fileReferenceGDS=fileGDS, + fileBlock=tmpF3) + + expect_equal(length(results), 2) + expect_equal(results$chr, c(1)) + expect_equal(results$block.snp, c(-1, -2, 1, 1, -3, 2, 2)) +}) diff --git a/tests/testthat/test-visualization.R b/tests/testthat/test-visualization.R new file mode 100644 index 000000000..c8af637a5 --- /dev/null +++ b/tests/testthat/test-visualization.R @@ -0,0 +1,238 @@ +### Unit tests for tools.R functions + +library(RAIDS) +library(testthat) +library(withr) + +############################################################################# +### Tests createAccuracyGraph() results +############################################################################# + +context("createAccuracyGraph() results") + + +test_that("createAccuracyGraph() must return error when fileRDS is not a character string", { + + error_message <- "The \'fileRDS\' parameter must be a character string." + + expect_error(createAccuracyGraph(fileRDS=44, title="", + selectD=c(3,7,11), selectColor=c("#5e688a", "#cd5700", "#CC79A7")), + error_message) +}) + +test_that("createAccuracyGraph() must return error when fileRDS does not exist", { + + error_message <- "The \'fileRDS\' file does not exist." + + expect_error(createAccuracyGraph(fileRDS="./toto.RDS", title="", + selectD=c(3,7,11), selectColor=c("#5e688a", "#cd5700", "#CC79A7")), + error_message) +}) + +test_that("createAccuracyGraph() must return error when fileRDS is not RDS", { + + + fileGDS <- test_path("fixtures", "1KG_Test.gds") + + error_message <- "The \'fileRDS\' must have a RDS (or rds) extension." + + expect_error(createAccuracyGraph(fileRDS=fileGDS, title="", + selectD=c(3,7,11), selectColor=c("#5e688a", "#cd5700", "#CC79A7")), + error_message, fixed=TRUE) +}) + + +test_that("createAccuracyGraph() must return error when fileRDS is not RDS", { + + fileGDS <- test_path("fixtures", "TEST_01.infoCall.RDS") + + error_message <- "The \'selectD\' parameter cannot be empty." + + expect_error(createAccuracyGraph(fileRDS=fileGDS, title="", + selectD=c(), selectColor=c("#5e688a", "#cd5700", "#CC79A7")), + error_message) +}) + + +test_that("createAccuracyGraph() must return error when title is number", { + + fileGDS <- test_path("fixtures", "TEST_01.infoCall.RDS") + + error_message <- "The \'title\' must be a character string." + + expect_error(createAccuracyGraph(fileRDS=fileGDS, title=33, + selectD=c(3,5,6), selectColor=c("#5e688a", "#cd5700", "#CC79A7")), + error_message) +}) + + +test_that("createAccuracyGraph() must return error when selectD is empty", { + + fileGDS <- test_path("fixtures", "TEST_01.infoCall.RDS") + + error_message <- "The \'selectD\' parameter cannot be empty." + + expect_error(createAccuracyGraph(fileRDS=fileGDS, title="", + selectD=c(), selectColor=c("#5e688a", "#cd5700", "#CC79A7")), + error_message) +}) + + +test_that("createAccuracyGraph() must return error when selectD has 6 elements", { + + fileGDS <- test_path("fixtures", "TEST_01.infoCall.RDS") + + error_message <- paste0("The \'selectD\' parameter can contain a ", + "maximum of 5 elements.") + + expect_error(createAccuracyGraph(fileRDS=fileGDS, title="", + selectD=c(1,2,3,4,5,6), selectColor=c("#5e688a", "#cd5700", "#CC79A7")), + error_message) +}) + + +test_that("createAccuracyGraph() must return error when selectD value not in the file", { + + fileGDS <- test_path("fixtures", "TEST_01.infoCall.RDS") + + error_message <- paste0("Not all values in \'selectD\' are present in the RDS file.") + + expect_error(createAccuracyGraph(fileRDS=fileGDS, title="", + selectD=c(32,32,34), selectColor=c("#5e688a", "#cd5700", "#CC79A7")), + error_message) +}) + + +test_that("createAccuracyGraph() must return error when selectColor shorter than selectD", { + + fileGDS <- test_path("fixtures", "TEST_01.infoCall.RDS") + + error_message <- paste0("The \'selectColor\' parameter must be the ", + "same length than the \'selectD\' parameter.") + + expect_error(createAccuracyGraph(fileRDS=fileGDS, title="", + selectD=c(1,2,3,4), selectColor=c("#5e688a", "#cd5700", "#CC79A7")), + error_message) +}) + + +test_that("createAccuracyGraph() must return a gglot object when successful", { + + fileGDS <- test_path("fixtures", "TEST_01.infoCall.RDS") + + graphE <-createAccuracyGraph(fileRDS=fileGDS, title="", + selectD=c(7,8,9), selectColor=c("#5e688a", "#cd5700", "#CC79A7")) + + testthat::expect_is(graphE, "ggplot") +}) + + +############################################################################# +### Tests createAUROCGraph() results +############################################################################# + +context("createAUROCGraph() results") + + +test_that("createAUROCGraph() must return a gglot object when successful", { + + dfAuroc <- data.frame(D=c(rep(2, 15), rep(3, 15)), + K=c(rep(c(2, 3, 4), 10)), + Call=c("EUR", "EUR", "EUR", "AMR", "AMR", "AMR", + "EAS", "EAS", "EAS", "SAS", "SAS", "SAS", + "AFR", "AFR", "AFR", + "EUR", "EUR", "EUR", "AMR", "AMR", "AMR", + "EAS", "EAS", "EAS", "SAS", "SAS", "SAS", + "AFR", "AFR", "AFR"), + AUROC=c(0.95000, 0.9628737, 0.9701246, 0.8337130, + 0.8509514, 0.9800000, 0.9158718, 0.9267399, + 0.9386384, 0.7484138, 0.9000000, 0.9892067, + 0.88000, 0.8758737, 0.9021246, + 0.99000, 0.9888737, 0.9931246, 0.8837130, + 0.8959514, 1.0000000, 0.9788718, 0.9977399, + 0.9886384, 0.8244138, 1.0000000, 0.9982067, + 0.92000, 0.8998737, 0.9251246)) + dfAuroc$L <- dfAuroc$AUROC - 0.03 + dfAuroc$H <- dfAuroc$AUROC + 0.02 + dfAuroc$H[which(dfAuroc$H > 1.0000)] <- 1.0000000 + + + graphE <-createAUROCGraph(dfAUROC=dfAuroc, title="", + selectD=c(2,3), selectColor=c("#5e688a", "#CC79A7")) + + testthat::expect_is(graphE, "ggplot") +}) + + +test_that("createAccuracyGraph() must return error when dfAUROC is missing mandatory column", { + + dfAuroc <- data.frame(D=c(rep(2, 15), rep(3, 15)), + K=c(rep(c(2, 3, 4), 10)), + Call=c("EUR", "EUR", "EUR", "AMR", "AMR", "AMR", + "EAS", "EAS", "EAS", "SAS", "SAS", "SAS", + "AFR", "AFR", "AFR", + "EUR", "EUR", "EUR", "AMR", "AMR", "AMR", + "EAS", "EAS", "EAS", "SAS", "SAS", "SAS", + "AFR", "AFR", "AFR"), + AUROC=c(0.95000, 0.9628737, 0.9701246, 0.8337130, + 0.8509514, 0.9800000, 0.9158718, 0.9267399, + 0.9386384, 0.7484138, 0.9000000, 0.9892067, + 0.88000, 0.8758737, 0.9021246, + 0.99000, 0.9888737, 0.9931246, 0.8837130, + 0.8959514, 1.0000000, 0.9788718, 0.9977399, + 0.9886384, 0.8244138, 1.0000000, 0.9982067, + 0.92000, 0.8998737, 0.9251246)) + dfAuroc$Low <- dfAuroc$AUROC - 0.03 + dfAuroc$H <- dfAuroc$AUROC + 0.02 + dfAuroc$H[which(dfAuroc$H > 1.0000)] <- 1.0000000 + + error_message <- paste0("The \'dfAUROC\' must have all those columns: ", + "D, K, Call, L, AUROC, H.") + + expect_error(createAUROCGraph(dfAUROC=dfAuroc, title="", + selectD=c(2, 3), selectColor=c("#cd5700", "#CC79A7")), + error_message) +}) + + +test_that("createAccuracyGraph() must return error when dfAUROC is an integer", { + + dfAuroc <- 100L + + error_message <- paste0("The \'dfAUROC\' parameter must be a data frame.") + + expect_error(createAUROCGraph(dfAUROC=dfAuroc, title="", + selectD=c(2, 3), selectColor=c("#cd5700", "#CC79A7")), + error_message) +}) + + +test_that("createAccuracyGraph() must return error when selectD not in the data frame", { + + dfAuroc <- data.frame(D=c(rep(2, 15), rep(3, 15)), + K=c(rep(c(2, 3, 4), 10)), + Call=c("EUR", "EUR", "EUR", "AMR", "AMR", "AMR", + "EAS", "EAS", "EAS", "SAS", "SAS", "SAS", + "AFR", "AFR", "AFR", + "EUR", "EUR", "EUR", "AMR", "AMR", "AMR", + "EAS", "EAS", "EAS", "SAS", "SAS", "SAS", + "AFR", "AFR", "AFR"), + AUROC=c(0.95000, 0.9628737, 0.9701246, 0.8337130, + 0.8509514, 0.9800000, 0.9158718, 0.9267399, + 0.9386384, 0.7484138, 0.9000000, 0.9892067, + 0.88000, 0.8758737, 0.9021246, + 0.99000, 0.9888737, 0.9931246, 0.8837130, + 0.8959514, 1.0000000, 0.9788718, 0.9977399, + 0.9886384, 0.8244138, 1.0000000, 0.9982067, + 0.92000, 0.8998737, 0.9251246)) + dfAuroc$L <- dfAuroc$AUROC - 0.03 + dfAuroc$H <- dfAuroc$AUROC + 0.02 + dfAuroc$H[which(dfAuroc$H > 1.0000)] <- 1.0000000 + + error_message <- paste0("Not all values in \'selectD\' are present in ", + "the \'dfAUROC\' data frame.") + + expect_error(createAUROCGraph(dfAUROC=dfAuroc, title="", + selectD=c(2, 4), selectColor=c("#cd5700", "#CC79A7")), + error_message) +}) \ No newline at end of file diff --git a/tests/testthat/test-visualization_internal.R b/tests/testthat/test-visualization_internal.R new file mode 100644 index 000000000..3e312f784 --- /dev/null +++ b/tests/testthat/test-visualization_internal.R @@ -0,0 +1,23 @@ +### Unit tests for visualization_internal.R functions + +library(RAIDS) +library(testthat) + + +############################################################################# +### Tests validateCreateAccuracyGraph() results +############################################################################# + +context("validateCreateAccuracyGraph() results") + + +test_that("validateCreateAccuracyGraph() must return expected results when all input are valid", { + + dataDir <- system.file("extdata", package="RAIDS") + fileRDS <- file.path(dataDir, "TEST_01.infoCall.RDS") + + result1 <- RAIDS:::validateCreateAccuracyGraph(fileRDS=fileRDS, + title="test", selectD=c(5, 15), selectColor=c("red","darkblue")) + + expect_identical(result1, 0L) +}) diff --git a/vignettes/Create_1KG_GDS_File.Rmd b/vignettes/Create_1KG_GDS_File.Rmd deleted file mode 100644 index e5b3ae3d5..000000000 --- a/vignettes/Create_1KG_GDS_File.Rmd +++ /dev/null @@ -1,503 +0,0 @@ ---- -title: "Formatting the information from 1000 Genomes (optional)" -author: Pascal Belleau, Astrid Deschênes and Alexander Krasnitz -output: - BiocStyle::html_document: - number_sections: yes - toc: true - pkgdown: - number_sections: yes - as_is: true -urlcolor: darkred -linkcolor: darkred -bibliography: aicsBiblio.bibtex -vignette: > - %\VignetteIndexEntry{Formatting the information from 1000 Genomes (optional)} - %\VignettePackage{RAIDS} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r style, echo=FALSE, results='hide', warning=FALSE, message=FALSE} -BiocStyle::markdown() - -suppressPackageStartupMessages({ - library(knitr) - library(RAIDS) -}) - -set.seed(121444) -``` - -
-**Package**: `r Rpackage("RAIDS")`
-**Authors**: `r packageDescription("RAIDS")[["Author"]]`
-**Version**: `r packageDescription("RAIDS")$Version`
-**Compiled date**: `r Sys.Date()`
-**License**: `r packageDescription("RAIDS")[["License"]]`
- - -
-
- -# Step 1 - Formatting the information from 1000 Genomes (optional) - - -This is an overview of the main steps to infer the genetic ancestry -from cancer-derived molecular data: - -1. Format the information from 1000 Genomes (1KG) into a 1KG GDS file (optional) -2. Format the information from an external study -3. Find the optimized parameters for the ancestry inference -4. Run the ancestry inference on the external study - - -```{r graphStep1, echo=FALSE, fig.align="center", fig.cap="Step 1 - Formatting the information from 1000 Genomes (optional)", out.width = '120%', results='asis'} -knitr::include_graphics("MainSteps_Step1_v01.png") -``` - - -****** - -Beware that a pre-processed 1KG -GDS file is available at this address: - - -[https://labshare.cshl.edu/shares/krasnitzlab/aicsPaper/matGeno1000g.gds](https://labshare.cshl.edu/shares/krasnitzlab/aicsPaper/matGeno1000g.gds) - - -The size of the file is 15GB. -This section can be skipped if you choose to use this file. - -****** - - -This section explains in further details how to generate the 1000 Genomes (1KG) -GDS file that is needed to run the ancestry inference tool (step 1). Beware -that it is unnecessary to re-run those steps as the 1KG GDS file is -publicly available. - -More specifically, the formatting of the 1KG information includes all those -sub-steps: - -1. Download the required files from 1KG -2. Split the 1KG genotyping file to one file per sample per chromosome -3. Combine the chromosomes to obtain one genotyping file per sample -4. Create a pedigree file in RDS format -5. Prepare a bulk SNP information file based on 1KG VCF -6. Filter the bulk SNP information file -7. Generate the GDS file with the 1KG information -8. Identify genetically related patients present in 1KG GDS file -9. Add information about unrelated patients to the 1KG GDS file -10. Create a second GDS file containing the 1KG phase information - -Beware that those sub-steps are time, as well as, space consuming. - -In the following sections, those sub-steps are described in details. - -
- -## Download the required files from 1KG - -First, the pedigree file with the description of the 1KG samples -needs to be downloaded from the 1KG ftp site: - -> ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130606_sample_info/20130606_g1k.ped - -The genotyping files for the 1KG samples also needs to be downloaded. Beware -that there is one file per chromosome: - -> ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV - -As the genotyping files are split by chromosome, all files corresponding to -this pattern must be downloaded: - -> ALL.chr\*.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz - -All files are related to GRCh38 genome. - -
-
- -## Split the 1KG genotyping file to one file per sample per chromosome - -To facilitate the manipulation of the genotyping data, the large 1KG -genotyping files are split into smaller files (one per sample). - -As there is one genotyping file per chromosome, the process must be executed -for each of chromosome - -The splitting is done through a bash script: - -```bash -## This in not a R script -## This script is in bash - -## The script has to be run separately for each chromosome - -## Create one directory for the specific chromosome -## -## Two variables need to be assigned -## The variable PATHGENO is the path where the split 1KG genotyping -## files will be located -## The variable chrCur is the current chromosome (ex: "chr1"), a sub-directory -## with the name of the chromosome will be created. The variable must -## include the prefix "chr". -## -## Ex: PATHGENO=./genotypingPerSample -## Ex: chrCur=chr1 -## -mkdir ${PATHGENO}/${chrCur} -cd ${PATHGENO}/${chrCur} - -## The variable FILECUR is the associated 1KG genotyping VCF file for the -## specific chromosome -## The variable FILECUR must also contain the relative or complete path to -## the VCF file -## Ex: FILECUR=../../1KG_files/ALL.chr1.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz - -## There is 2548 samples in one phase VCF file -## The information associated to each sample is extracted and a VCF file -## specific to each sample for the current chromosome is generated -## -for i in `seq 1 2548` -do - j=$(( $i + 9 )) - SAMPLE=$(zcat $FILECUR||head -n 1000 |grep "#CHROM"|cut -d$'\t' -f$j) - zcat $FILECUR|grep -v "##"|cut -d$'\t' -f$j |bzip2 -c > ${SAMPLE}.${CHR}.vcf.bz2 -done -``` - -
-
- -## Combine the chromosomes to obtain one genotyping file per sample - -The genotyping information for all chromosomes is merged so that there is only -one genotyping file per sample. - - -```{r graphChr1KGSNV, echo = FALSE, fig.align="center", fig.cap="The function groupChr1KGSNV() merge samples that are split by chromosomes into one file.", out.width = '120%', results='asis'} -knitr::include_graphics("groupChr1KGSNV_v01.png") -``` - - -The __groupChr1KGSNV()__ function is used to combine the genotyping information -from multiple files. - -```{r fileGenopart2, echo=TRUE, eval=FALSE} -## Load required package -library(RAIDS) - -## The path where the genotyping files are located -pathGenoChr <- file.path("data", "pathgenochr") - -## The path where the merged genotyping files will be created -PATHOUT <- file.path("data", "pathgenoOUT") - -## Combining the genotyping information for all chromosome in one file per -## profile -groupChr1KGSNV(pathGenoChr=pathGenoChr, pathOut=PATHOUT) - -``` - -
-
- -## Create a predigree file in RDS format - -The function __prepPed1KG()__ is used to create the pedigree file -in RDS format: - -```{r prepPed1KG, eval=FALSE, echo=TRUE} -## Load required package -library(RAIDS) - -## The path to the pedigree file from 1KG -## In this example, the file is in the current directory -filePED1KGOri <- "20130606_g1k.ped" - -## Extract needed information from the pedigree file from 1KG into a data.frame -## Only the samples with genotyping information (sample file present -## in pathGeno parameter) are retained to create the final data.frame -ped <- prepPed1KG(filePed=filePED1KGOri, - pathGeno=file.path("data", "sampleGeno")) - -## Save the pedigree information data.frame as a RDS file -## In this example, the file is saved here ./data/metadata/ped1KG.rds -filePED1KG <- file.path("data", "metadata","ped1KG.rds") -saveRDS(ped, filePED1KG) - -``` - - -
-
- -## Prepare a bulk SNP information file based on 1KG VCF - -Some intermediate file containing the SNP information from 1KG need to be -generated so the information can ultimately be imported in a GDS file -using Bioconductor [gdsfmt](https://bioconductor.org/packages/gdsfmt/). - -The bulk intermediate SNP file contains the SNP position as well as the -frequency in each super population. - -```{r intermediateVCF, echo=TRUE, eval=FALSE} -## This is not done in R -## The python script is in the 'scriptsPy' directory - -for i in `ls PATHVCF/*shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz` -do - chr=$(echo $FILECUR|perl -n -e '/ALL\.(chr[^\.]+)/;print $1') - python PATH2SCRIPT/extract1000gFreq.py ${FILECUR} matFreq.${chr} -done - -for i in `seq 1 22` -do - cat matFreq.chr${i}.txt >matFreqSNV.txt - bzip2 matFreqSNV.txt -done - -``` - -The bulk SNP info file is called __matFreqSNV.txt.bz2__. - -
-
- -## Filter the bulk SNP information file - -The bulk SNP file is filtered to only retain the SNPs with frequency -higher then a specific cut-off (here >=0.01) for at least one super -population. - -The filter SNP file is saved in RDS format (_fileSNPsRDS_ parameter). A -second file containing the index of the retained SNPs is also -created (*fileSNPsRDS* parameter) - -The function __generateMapSnvSel()__ is used to filter the SNP file and -generated the needed RDS files: - -```{r filterVCF, echo=TRUE, eval=FALSE} -## Load required package -library(RAIDS) - -## The path to the bulk SNP info file -fileSNV.v <- file.path(pathGeno, "matFreqSNV.txt.bz2") - -## The paths and names of the two output files -## One file contains the index of retained SNPs ("listSNP.rds") -## One file contains the filter SNP information ("mapSNVSel.rds") -fileSNPsRDS <- file.path(PATHSEL, "listSNP.rds") -fileFREQ.v <- file.path(PATHSEL, "mapSNVSel.rds") - -## Filter the bulk SNP file (fileSNV parameter) -## Create a RDS with filter SNPs (fileFREQ parameter) -## Also creates a RDS with the indexes of the retained SNPs (fileFREQ parameter) -generateMapSnvSel(cutOff=0.01, fileSNV=fileSNV.v, - fileSNPsRDS=fileSNPsRDS, - fileFREQ=fileFREQ.v) -``` - - -
-
- - -## Generate the GDS file with the 1KG information - -The CoreArray Genomic Data Structure (GDS) data files are files suited for -large-scale datasets, especially for data which are much larger than the -available random-access memory. - -The function __generateGDS1KG()__ is used to generate the GDS file that will -contain the information related to 1KG: - -```{r gdsCreation, echo=TRUE, eval=FALSE} -## TODO: what is PATHMETA and PATHSEL - -## The path and file names of the required files -## First, the RDS file containing the pedigree information -## Second, the RDS file with the indexes of the retained SNPs -## Third, the RDS file with the filtered SNP information -filePED1KG <- file.path(PATHMETA,"ped1KG.rds") -fileSNPsRDS <- file.path(PATHSEL, "listSNP.rds") -fileFREQ.v <- file.path(PATHSEL, "mapSNVSel.rds") - -## The name of the GDS file that will be created -fileNameGDS <- "matGeno1000g.gds" -fileGDS <- file.path(PATHGDS, fileNameGDS) - -## Generate GDS file containing the 1KG information -generateGDS1KG(pathGeno=file.path("data", "sampleGeno"), - filePedRDS=filePED1KG, - fileSNVSelected=fileFREQ.v, - fileSNVIndex=fileSNPsRDS, - fileNameGDS=fileGDS) -``` - - -
-
- -## Identify genetically related patients present in 1KG GDS file - -As only unrelated patients can be used in the following analyses, the -genetically related patients in 1KG must be identified. - -The function __identifyRelative()__ identifies patients that are genetically -related in the 1KG files. It generates a RDS file with the unrelated patient -information (_filePart_ parameter) as well as a RDS file with the kinship -coefficient between the patients (_fileIBD_ parameter). - -```{r identifyRelative, echo=TRUE, eval=FALSE} -## Load required package -library(gdsfmt) - -## TODO: what is PATHMETA - -## The name of the GDS file that contains the 1KG information -fileNameGDS <- "matGeno1000g.gds" -fileGDS <- file.path(PATHGDS, fileNameGDS) - -## Files that will be created by the identifyRelative() function -## The first RDS file will contain the kinship information between patients -## The second RDS file will contain the list of unrelated patients -fileIBD <- file.path(PATHMETA,"ibd.All.0.05.rds") -filePart <- file.path(PATHMETA,"part.All.0.05.rds") - -## Open the 1KG GDS file -gds <- snpgdsOpen(fileGDS) - -## Identify the genetically related patients in 1KG -identifyRelative(gds=fileGDS, maf=0.05, thresh = 2^(-11/2), - fileIBD=fileIBD, filePart=filePart) - -## Close the 1KG GDS file -closefn.gds(gds) -``` - -
-
- - -## Add information about unrelated patients to the 1KG GDS file - -Only the unrelated patients from 1KG are used in the following ancestry -inference and kept in the GDS 1KG file. The function __identifyRelative()__ -identifies the unrelated patients and saves the information about those -patients in an intermediate external file. - -The function __addRef2GDS1KG()__ is adding the information about the unrelated -patients to the GDS 1KG file using the intermediate external file. - -```{r addRef2GDS1KG, echo=TRUE, eval=FALSE} -## Add the information about the unrelated 1KG patients to the 1KG GDS file -addRef2GDS1KG(fileGDS=gds, filePart=filePart) -``` - -
-
- -## Create a second GDS containing the 1KG phase information - -We generate a GDS with the phase information. - -The function __generatePhase1KG2GDS()__ is adding the phase information -into the newly created 1KG GDS phase file. - -```{r addPhaseGDS1KG, echo=TRUE, eval=FALSE} -## Load required package -library(gdsfmt) - -## The name of the 1KG GDS file that already exists -fileGDS1KG <- file.path(PATHGDS, "matGeno1000g.gds") - -## The name of the 1KG Phase GDS file that will be created -fileGDSPhase <- file.path(PATHGDS, "matPhase1000g.gds") - -## Open the 1KG GDS file -gds <- openfn.gds(fileGDS1KG) - -## Create the 1KG Phase GDS file -gdsPhase <- createfn.gds(fileGDSPhase) - -## Add the phase information to the 1KG Phase GDSfile -generatePhase1KG2GDS(gds, gdsPhase, pathGeno, fileSNPsRDS) - -## Close both files -closefn.gds(gdsPhase) -closefn.gds(gds) -``` - -
-
- -### Create a VCF file containing the retained SNP positions from 1KG GDS file - -The __snvListVCF()__ function is used to generate a VCF file that contains -the information of all retained SNPs from 1KG GDS file: - -```{r snvListVCF, echo=TRUE, eval=FALSE} -## Load required package -library(RAIDS) - -## Open the 1KG GDS file -gds <- snpgdsOpen(fileGDS1kg) - -## The VCF file that will be created -fileOUT <- "SNPretained.VCF" - -## Generate the VCF with the retained SNP position -snvListVCF(gdsReference=gds, fileOUT=fileOUT, offset=1, freqCutoff=NULL) - -## Close the 1KG GDS file -closefn.gds(gds) -``` - -You should compress and indexing the newly created VCF file. Do do so, -you need to install [HTSlib](http://www.htslib.org/download/) [@Bonfield2021]. - -In a terminal: - -```bash -## This in not a R script -## This script is in bash - -## Compress the new VCF file (fileOUT parameter) -bgzip fileOUT - -## Index the new VCF file -## HTSlib software is needed -tabix -p vcf fileOUT.gz -``` - - -
-
- -# Pre-processed files are available - -Pre-processed files, such as the 1KG GDS file, are available at this address: - - -[https://labshare.cshl.edu/shares/krasnitzlab/aicsPaper](https://labshare.cshl.edu/shares/krasnitzlab/aicsPaper) - -Beware that some of those files are voluminous. - -
-
- -# Session info - -Here is the output of `sessionInfo()` on the system on which this document was -compiled: - -```{r sessionInfo, echo=FALSE} -sessionInfo() -``` - -
-
- diff --git a/vignettes/Create_Reference_GDS_File.Rmd b/vignettes/Create_Reference_GDS_File.Rmd new file mode 100644 index 000000000..7cbb7e826 --- /dev/null +++ b/vignettes/Create_Reference_GDS_File.Rmd @@ -0,0 +1,443 @@ +--- +title: "Population reference dataset GDS files" +author: Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +output: + BiocStyle::html_document: + number_sections: yes + toc: true + pkgdown: + number_sections: yes + as_is: true +urlcolor: darkred +linkcolor: darkred +bibliography: aicsBiblio.bibtex +vignette: > + %\VignetteIndexEntry{Population reference dataset GDS files} + %\VignettePackage{RAIDS} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r style, echo=FALSE, results='hide', warning=FALSE, message=FALSE} +BiocStyle::markdown() + +suppressPackageStartupMessages({ + library(knitr) + library(RAIDS) + library(SNPRelate) + library(gdsfmt) +}) + +set.seed(121444) +``` + +
+**Package**: `r Rpackage("RAIDS")`
+**Authors**: `r packageDescription("RAIDS")[["Author"]]`
+**Version**: `r packageDescription("RAIDS")$Version`
+**Compiled date**: `r Sys.Date()`
+**License**: `r packageDescription("RAIDS")[["License"]]`
+ + +
+
+ + + + +This vignette explains, in further details, the format of the population +reference files that are required to run the ancestry inference tool. + +Two different files are generated from a reference dataset: + +- The Population Reference GDS File +- The Population Reference SNV Annotation GDS file + + +
+
+ +# Population Reference GDS File + +The *Population Reference GDS file* should contain the genome-wide SNV +information related to the population data set with known genetic ancestry. +This reference data set will be used to generate the simulated samples. It is +also used to generate the PCA on which the samples of interest are going to +be projected. + +The *Population Reference GDS file* is a GDS object of class +[SNPGDSFileClass](https://www.bioconductor.org/packages/release/bioc/vignettes/) from [SNPRelate](https://www.bioconductor.org/packages/release/bioc/html/SNPRelate.html) +package [@Zheng2012]. + +Beware that related profiles should be flagged in the *Population Reference GDS file* files. + +```{r runRefGDS, echo=TRUE, eval=TRUE, collapse=TRUE, warning=FALSE, message=FALSE} +############################################################################# +## Load required packages +############################################################################# +library(RAIDS) +library(SNPRelate) + +pathRef <- system.file("extdata/", package="RAIDS") + +fileReferenceGDS <- file.path(pathRef, "PopulationReferenceDemo.gds") + +gdsRef <- snpgdsOpen(fileReferenceGDS) + +## Show the file format +print(gdsRef) + +closefn.gds(gdsRef) +``` + +
+ +This output lists all variables stored in the *Population Reference GDS file*. +At the first level, it stores variables *sample.id*, *snp.id*, etc. +The additional information displayed in the braces indicate the data type, +size, compressed or not with compression ratio. + +The mandatory fields are: + +* **sample.id**: a *character* string (saved in *Str8* format) used as unique identifier for each sample +* **sample.annot**: a *data.frame* where each row correspond to a sample and containing those columns: + * **sex**: a *character* string (saved in *Str8* format) used as identifier of the sex of the sample + * **pop.Group**: a *character* string (saved in *Str8* format) representing the sub-population ancestry of the sample (ex:GBR, etc) + * **superPop**: a *character* string (saved in *Str8* format) representing the super-population ancestry of the sample (ex:EUR, AFR, EAS, SAS, AMR) + * **batch**: an *integer* (saved in *Float64* format) representing the batch of provenance of the sample +* **snp.id**: a a *character* string (saved in *Str8* format) used as unique identifier for each SNV +* **snp.chromosome**: an *integer* or *character* (saved in *UInt16 * format) mapping for each chromosome. Integer: numeric values 1-26, mapped in order from 1-22, 23=X, 24=XY (the pseudoautosomal region), 25=Y, 26=M (the mitochondrial probes), and 0 for probes with unknown positions; it does not allow NA. Character: “X”, “XY”, “Y” and “M” can be used here, and a blank string indicating unknown position +* **snp.position**: an *integer* (saved in *Int32* format) representing the base position of each SNV on the chromosome, and 0 for unknown position; it does not allow NA. +* **snp.allele**: a *character* string (saved as *Str8* format) representing the reference allele and alternative allele for each of the SNVs present in the *snp.id* field +* **snp.AF**: a *numeric* value between 0 and 1 (saved as *PackedReal24* format) representing the allelic frequency of the alternative allele in the general population for each of the SNVs present in the *snp.id* field +* **snp.EAS_AF**: a *numeric* value between 0 and 1 (saved as *PackedReal24* format) representing the allelic frequency of the alternative allele in the East Asian population for each of the SNVs present in the *snp.id* field +* **snp.EUR_AF**: a *numeric* value between 0 and 1 (saved as *PackedReal24* format) representing the allelic frequency of the alternative allele in the European population for each of the SNVs present in the *snp.id* field +* **snp.AFR_AF**: a *numeric* value between 0 and 1 (saved as *PackedReal24* format) representing the allelic frequency of the alternative allele in the African population for each of the SNVs present in the *snp.id* field +* **snp.AMR_AF**: a *numeric* value between 0 and 1 (saved as *PackedReal24* format) representing the allelic frequency of the alternative allele in the American population for each of the SNVs present in the *snp.id* field +* **snp.SAS_AF**: a *numeric* value between 0 and 1 (saved as *PackedReal24* format) representing the allelic frequency of the alternative allele in the South Asian population for each of the SNVs present in the *snp.id* field +* **genotype**: a SNV genotypic *matrix* of *integer* values (saved in *Bit2* format) (i.e., the number of A alleles) with SNVs as rows and samples as columns (number of SNVs × number of Samples) +* **sample.ref**: an *integer* (saved in *Bit1* format) indicating if the sample is retained to be used as reference (=1) or removed (=0) as related samples have to be discarded + +
+ +This following example shows how to create a *Population GDS Reference file*. +This example is for demonstration purpose only and use hard coded values. +A working *Population GDS Reference file* would have to contain multiple +samples from +each continental population and would also have to contain the SNVs from the +entire genome. + +To generate a real *Population GDS Reference file*, the pipeline to process +the information would depend of the selected source. +If the source files are in VCF format, you can use Bioconductor +[VariationAnnotation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) +package to extract the genotypic information (beware it may use a lot of +memory). +Often, you will need to parse metadata files to get information such as the +sex and population of the profiles. In addition, the Bioconductor +[GENESIS](https://bioconductor.org/packages/release/bioc/html/GENESIS.html) +package can +be used to compute kinship coefficients to identify the unrelated profiles. + + +```{r createRefGDS, echo=TRUE, eval=TRUE, collapse=TRUE, warning=FALSE, message=FALSE} +############################################################################# +## Load required packages +############################################################################# +library(RAIDS) +library(SNPRelate) +library(gdsfmt) + +## Create a temporary GDS Reference file in the temporary directory +fileNewReferenceGDS <- file.path(tempdir(), "reference_DEMO.gds") + +gdsRefNew <- createfn.gds(fileNewReferenceGDS) + +## The entry 'sample.id' contain the unique identifiers of 10 samples +## that constitute the reference dataset +sample.id <- c("HG00243", "HG00150", "HG00149", "HG00246", "HG00138", + "HG01334", "HG00268", "HG00275", "HG00290", "HG00364") +add.gdsn(node=gdsRefNew, name="sample.id", val=sample.id, + storage="string", check=TRUE) + +## A data frame containing the information about the 10 samples +## (in the same order than in the 'sample.id') is created and added to +## the 'sample.annot' entry +## The data frame must contain those columns: +## 'sex': '1'=male, '2'=female +## 'pop.group': acronym for the population (ex: GBR, CDX, MSL, ASW, etc..) +## 'superPop': acronym for the super-population (ex: AFR, EUR, etc...) +## 'batch': number identifying the batch of provenance +sampleInformation <- data.frame(sex=c("1", "2", "1", "1", "1", + "1", "2", "2", "1", "2"), pop.group=c(rep("GBR", 6), rep("FIN", 4)), + superPop=c(rep("EUR", 10)), batch=rep(0, 10), stringsAsFactors=FALSE) +add.gdsn(node=gdsRefNew, name="sample.annot", val=sampleInformation, + check=TRUE) + +## The identifier of each SNV is added in the 'snp.id' entry +snvID <- c("s29603", "s29605", "s29633", "s29634", "s29635", "s29637", + "s29638", "s29663", "s29664", "s29666", "s29667", "s29686", + "s29687", "s29711", "s29741", "s29742", "s29746", "s29750", + "s29751", "s29753") +add.gdsn(node=gdsRefNew, name="snp.id", val=snvID, check=TRUE) + +## The chromosome of each SNV is added to the 'snp.chromosome' entry +## The order of the SNVs is the same than in the 'snp.id' entry +snvChrom <- c(rep(1, 20)) +add.gdsn(node=gdsRefNew, name="snp.chromosome", val=snvChrom, storage="uint16", + check=TRUE) + +## The position on the chromosome of each SNV is added to +## the 'snp.position' entry +## The order of the SNVs is the same than in the 'snp.id' entry +snvPos <- c(3467333, 3467428, 3469375, 3469387, 3469502, 3469527, + 3469737, 3471497, 3471565, 3471618) +add.gdsn(node=gdsRefNew, name="snp.position", val=snvPos, storage="int32", + check=TRUE) + +## The allele information of each SNV is added to the 'snp.allele' entry +## The order of the SNVs is the same than in the 'snp.allele' entry +snvAllele <- c("A/G", "C/G", "C/T", "C/T", "T/G", "C/T", + "G/A", "A/G", "G/A", "G/A") +add.gdsn(node=gdsRefNew, name="snp.allele", val=snvAllele, storage="string", + check=TRUE) + +## The allele frequency in the general population (between 0 and 1) of each +## SNV is added to the 'snp.AF' entry +## The order of the SNVs is the same than in the 'snp.id' entry +snvAF <- c(0.86, 0.01, 0.00, 0.00, 0.01, 0.00, 0.00, 0.00, 0.00, 0.01) +add.gdsn(node=gdsRefNew, name="snp.AF", val=snvAF, storage="packedreal24", + check=TRUE) + +## The allele frequency in the East Asian population (between 0 and 1) of each +## SNV is added to the 'snp.EAS_AF' entry +## The order of the SNVs is the same than in the 'snp.id' entry +snvAF <- c(0.80, 0.00, 0.00, 0.01, 0.00, 0.00, 0.01, 0.00, 0.02, 0.00) +add.gdsn(node=gdsRefNew, name="snp.EAS_AF", val=snvAF, storage="packedreal24", + check=TRUE) + +## The allele frequency in the European population (between 0 and 1) of each +## SNV is added to the 'snp.EUR_AF' entry +## The order of the SNVs is the same than in the 'snp.id' entry +snvAF <- c(0.91, 0.00, 0.01, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.03) +add.gdsn(node=gdsRefNew, name="snp.EUR_AF", val=snvAF, storage="packedreal24", + check=TRUE) + +## The allele frequency in the African population (between 0 and 1) of each +## SNV is added to the 'snp.AFR_AF' entry +## The order of the SNVs is the same than in the 'snp.id' entry +snvAF <- c(0.85, 0.04, 0.00, 0.00, 0.00, 0.01, 0.00, 0.00, 0.00, 0.00) +add.gdsn(node=gdsRefNew, name="snp.AFR_AF", val=snvAF, storage="packedreal24", + check=TRUE) + +## The allele frequency in the American population (between 0 and 1) of each +## SNV is added to the 'snp.AMR_AF' entry +## The order of the SNVs is the same than in the 'snp.id' entry +snvAF <- c(0.83, 0.01, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.02) +add.gdsn(node=gdsRefNew, name="snp.AMR_AF", val=snvAF, storage="packedreal24", + check=TRUE) + +## The allele frequency in the South Asian population (between 0 and 1) of each +## SNV is added to the 'snp.SAS_AF' entry +## The order of the SNVs is the same than in the 'snp.id' entry +snvAF <- c(0.89, 0.00, 0.00, 0.00, 0.05, 0.00, 0.00, 0.01, 0.00, 0.00) +add.gdsn(node=gdsRefNew, name="snp.SAS_AF", val=snvAF, storage="packedreal24", + check=TRUE) + +## The genotype of each SNV for each sample is added to the 'genotype' entry +## The genotype correspond to the number of A alleles +## The rows represent the SNVs is the same order than in 'snp.id' entry +## The columns represent the samples is the same order than in 'sample.id' entry +genotypeInfo <- matrix(data=c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), nrow=10, byrow=TRUE) +add.gdsn(node=gdsRefNew, name="genotype", val=genotypeInfo, + storage="bit2", check=TRUE) + +## The entry 'sample.ref' is filled with 1 indicating that all 10 +## samples are retained to be used as reference +## The order of the samples is the same than in the 'sample.id' entry +add.gdsn(node=gdsRefNew, name="sample.ref", val=rep(1L, 10), + storage="bit1", check=TRUE) + +## Show the file format +print(gdsRefNew) + +closefn.gds(gdsRefNew) + +unlink(fileNewReferenceGDS, force=TRUE) + +``` + +
+
+ +# Population Reference Annotation GDS file + +The *Population Reference Annotation GDS file* contains phase information +and block group information for all the SNVs present in +*Population Reference GDS file*. +If the source files are in VCF format, you can use Bioconductor +[VariationAnnotation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) +package to extract the phase information (beware it may use a lot of +memory). +A block can be a linkage disequelibrium block +relative to a population or a gene. A bioconductor package like +[GENESIS](https://bioconductor.org/packages/release/bioc/html/GENESIS.html) +can be used to get the block information. + + +```{r runRefAnnotGDS, echo=TRUE, eval=TRUE, collapse=TRUE, warning=FALSE, message=FALSE} +############################################################################# +## Load required packages +############################################################################# +library(RAIDS) +library(SNPRelate) + +pathReference <- system.file("extdata/tests", package="RAIDS") + +fileReferenceAnnotGDS <- file.path(pathReference, "ex1_good_small_1KG.gds") + +gdsRefAnnot <- openfn.gds(fileReferenceAnnotGDS) + +## Show the file format +print(gdsRefAnnot) + +closefn.gds(gdsRefAnnot) +``` + +
+ +This output lists all variables stored in +the *Population Reference Annotation GDS file*. +At the first level, it stores variables *phase*, *block.annot*, etc. +The additional information displayed in the braces indicate the data type, +size, compressed or not + compression ratio. + +The mandatory fields are: + +* **phase**: a *integer* (saved in *Bit2* format) representing the phase of the SNVs in the *Population Annotation GDS file*; 0 means the first allele is a reference; 1 means the first allele is the alternative and 3 means unknown. The first allele combine with the genotype of the variant determine the phase for a biallelic variant. The SNVs (rows) and samples (columns) in phase are in the same order as in the *Population Annotation GDS file*. +* **block.annot**: a *data.frame* containing those columns: + * **block.id**: a *character* string (saved in *Str8* format) representing an identifier of block group. A block can be linkage disequilibrium block relative to a population or a gene. + * **block.desc**: a *character* string (saved in *Str8* format) describing the block group. +* **bloc**: a *matrix* of *integer* values (saved in *Int32* format) where each row representing a SNV in the *Population Annotation GDS file* in the same order. The columns are the block groups described in *block.annot*. Each *integer* in the *matrix* representing a specific block. + +
+
+ +This following example shows how to create a +*Population Reference Annotation GDS file*. +This example is for demonstration purpose only. A working +*Population Reference Annotation GDS file* would have to contain multiple +samples from each continental population and would also have to contain +the SNVs from the entire genome. + +```{r createRefAnnotGDS, echo=TRUE, eval=TRUE, collapse=TRUE, warning=FALSE, message=FALSE} +############################################################################# +## Load required packages +############################################################################# +library(RAIDS) +library(gdsfmt) + +## Create a temporary GDS Reference file in the temporary directory +fileNewReferenceAnnotGDS <- + file.path(tempdir(), "reference_SNV_Annotation_DEMO.gds") + +gdsRefAnnotNew <- createfn.gds(fileNewReferenceAnnotGDS) + +## The entry 'phase' contain the phase of the SNVs in the +## Population Annotation GDS file +## 0 means the first allele is a reference; 1 means the first allele is +## the alternative and 3 means unknown +## The SNVs (rows) and samples (columns) in phase are in the same order as +## in the Population Annotation GDS file. +phase <- matrix(data=c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, + 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, + 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, + 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, + 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, + 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 0, 1, 1, 0, 1, 1, 1, 1), ncol=10, byrow=TRUE) +add.gdsn(node=gdsRefAnnotNew, name="phase", val=phase, storage="bit2", + check=TRUE) + +## The entry 'blockAnnot' contain the information for each group of blocks +## that are present in the 'block' entry. +blockAnnot <- data.frame(block.id=c("EAS.0.05.500k", "EUR.0.05.500k", + "AFR.0.05.500k", "AMR.0.05.500k", "SAS.0.05.500k"), + block.desc=c( + "EAS populationblock base on SNP 0.05 and windows 500k", + "EUR populationblock base on SNP 0.05 and windows 500k", + "AFR populationblock base on SNP 0.05 and windows 500k", + "AMR populationblock base on SNP 0.05 and windows 500k", + "SAS populationblock base on SNP 0.05 and windows 500k"), + stringsAsFactors=FALSE) +add.gdsn(node=gdsRefAnnotNew, name="block.annot", val=blockAnnot, check=TRUE) + +## The entry 'block' contain the block information for the SNVs in the +## Population Annotation GDS file. +## The SNVs (rows) are in the same order as in +## the Population Annotation GDS file. +## The block groups (columns) are in the same order as in +## the 'block.annot' entry. +block <- matrix(data=c(-1, -1, -1, -1, -1, + -2, -2, 1, -2, -2, + -2, 1, 1, 1, -2, + -2, 1, 1, 1, -2, + -2, -3, -2, -3, -2, + 1, 2, 2, 2, 1, + 1, 2, 2, 2, 1, + -3, -4, -3, -4, -3, + 2, -4, 3, -4, -3, + 2, -4, 3, -4, -3), ncol=5, byrow=TRUE) +add.gdsn(node=gdsRefAnnotNew, name="block", val=block, storage="int32", + check=TRUE) + +## Show the file format +print(gdsRefAnnotNew) + +closefn.gds(gdsRefAnnotNew) + +unlink(fileNewReferenceAnnotGDS, force=TRUE) + +``` + +
+
+ +# Pre-processed files, from 1000 Genomes in hg38, are available + +Pre-processed files used in the RAIDS associated publication, are +available at this address: + + +[https://labshare.cshl.edu/shares/krasnitzlab/aicsPaper](https://labshare.cshl.edu/shares/krasnitzlab/aicsPaper) + +Beware that some of those files are voluminous. + +
+
+ +# Session info + +Here is the output of `sessionInfo()` on the system on which this document was +compiled: + +```{r sessionInfo, echo=FALSE} +sessionInfo() +``` + +
+
+ +# References diff --git a/vignettes/MainSteps_Step1_v01.png b/vignettes/MainSteps_Step1_v01.png deleted file mode 100644 index dc47f16a3..000000000 Binary files a/vignettes/MainSteps_Step1_v01.png and /dev/null differ diff --git a/vignettes/MainSteps_Step1_v03.png b/vignettes/MainSteps_Step1_v03.png new file mode 100644 index 000000000..030ec5578 Binary files /dev/null and b/vignettes/MainSteps_Step1_v03.png differ diff --git a/vignettes/MainSteps_Step1_v04.png b/vignettes/MainSteps_Step1_v04.png new file mode 100644 index 000000000..650fa1877 Binary files /dev/null and b/vignettes/MainSteps_Step1_v04.png differ diff --git a/vignettes/MainSteps_Step2_SubStep2_SNP-Pileup_v01.mod.png b/vignettes/MainSteps_Step2_SubStep2_SNP-Pileup_v01.mod.png deleted file mode 100644 index c87e15cfb..000000000 Binary files a/vignettes/MainSteps_Step2_SubStep2_SNP-Pileup_v01.mod.png and /dev/null differ diff --git a/vignettes/MainSteps_Step2_SubStep2_SNP-Pileup_v03.png b/vignettes/MainSteps_Step2_SubStep2_SNP-Pileup_v03.png new file mode 100644 index 000000000..ddd98046b Binary files /dev/null and b/vignettes/MainSteps_Step2_SubStep2_SNP-Pileup_v03.png differ diff --git a/vignettes/MainSteps_Step2_v01.png b/vignettes/MainSteps_Step2_v01.png deleted file mode 100644 index 1a42f92c9..000000000 Binary files a/vignettes/MainSteps_Step2_v01.png and /dev/null differ diff --git a/vignettes/MainSteps_Step2_v03.png b/vignettes/MainSteps_Step2_v03.png new file mode 100644 index 000000000..67b269915 Binary files /dev/null and b/vignettes/MainSteps_Step2_v03.png differ diff --git a/vignettes/MainSteps_Step3_v01.png b/vignettes/MainSteps_Step3_v01.png deleted file mode 100644 index 66fccf010..000000000 Binary files a/vignettes/MainSteps_Step3_v01.png and /dev/null differ diff --git a/vignettes/MainSteps_Step3_v03.png b/vignettes/MainSteps_Step3_v03.png new file mode 100644 index 000000000..b0fd6e2e4 Binary files /dev/null and b/vignettes/MainSteps_Step3_v03.png differ diff --git a/vignettes/MainSteps_Step4_v01.png b/vignettes/MainSteps_Step4_v01.png deleted file mode 100644 index e1958a409..000000000 Binary files a/vignettes/MainSteps_Step4_v01.png and /dev/null differ diff --git a/vignettes/MainSteps_Step4_v02.png b/vignettes/MainSteps_Step4_v02.png new file mode 100644 index 000000000..4ae2b9dab Binary files /dev/null and b/vignettes/MainSteps_Step4_v02.png differ diff --git a/vignettes/MainSteps_Wrapper_v04.png b/vignettes/MainSteps_Wrapper_v04.png new file mode 100644 index 000000000..ee7a8c5ab Binary files /dev/null and b/vignettes/MainSteps_Wrapper_v04.png differ diff --git a/vignettes/MainSteps_v01.png b/vignettes/MainSteps_v01.png deleted file mode 100644 index 0e20f1560..000000000 Binary files a/vignettes/MainSteps_v01.png and /dev/null differ diff --git a/vignettes/MainSteps_v04.png b/vignettes/MainSteps_v04.png new file mode 100644 index 000000000..1720ec1fd Binary files /dev/null and b/vignettes/MainSteps_v04.png differ diff --git a/vignettes/MainSteps_v05.png b/vignettes/MainSteps_v05.png new file mode 100644 index 000000000..ee9fa070c Binary files /dev/null and b/vignettes/MainSteps_v05.png differ diff --git a/vignettes/ProfileGDSdemo_after_add1KG2SampleGDS.png b/vignettes/ProfileGDSdemo_after_add1KG2SampleGDS.png new file mode 100644 index 000000000..e4e0560c9 Binary files /dev/null and b/vignettes/ProfileGDSdemo_after_add1KG2SampleGDS.png differ diff --git a/vignettes/ProfileGDSdemo_after_estimateAllelicFraction.png b/vignettes/ProfileGDSdemo_after_estimateAllelicFraction.png new file mode 100644 index 000000000..5856f1671 Binary files /dev/null and b/vignettes/ProfileGDSdemo_after_estimateAllelicFraction.png differ diff --git a/vignettes/ProfileGDSdemo_when_created.png b/vignettes/ProfileGDSdemo_when_created.png new file mode 100644 index 000000000..9cf90afbf Binary files /dev/null and b/vignettes/ProfileGDSdemo_when_created.png differ diff --git a/vignettes/RAIDS.Rmd b/vignettes/RAIDS.Rmd index c2675356c..235aca641 100644 --- a/vignettes/RAIDS.Rmd +++ b/vignettes/RAIDS.Rmd @@ -1,18 +1,18 @@ --- -title: "Accurate Inference of Genetic Ancestry from Cancer-derived Sequences" +title: "Robust Ancestry Inference using Data Synthesis" author: Pascal Belleau, Astrid Deschênes and Alexander Krasnitz output: BiocStyle::html_document: - number_sections: yes + number_sections: no toc: true pkgdown: - number_sections: yes + number_sections: no as_is: true urlcolor: darkred linkcolor: darkred bibliography: aicsBiblio.bibtex vignette: > - %\VignetteIndexEntry{Accurate Inference of Genetic Ancestry from Cancer-derived Sequences} + %\VignetteIndexEntry{Robust Ancestry Inference using Data Synthesis} %\VignettePackage{RAIDS} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} @@ -58,44 +58,37 @@ the following: # Introduction -Multiple methods have been implemented to infer ancestry from germline DNA -sequence [@Price2006; @Pritchard2000; @Alexander2009]. However, genotyping of -DNA from matched normal specimens is not part of standard clinical practice -and is not performed routinely outside academic clinical centers. -In sum, matched germline DNA sequence is often missing for cancer-derived -molecular data. In such cases, having the possibility to infer ancestry -from tumor-derived data would be beneficial. - -The **RAIDS** package implements an inference procedure that has been -specifically developed to accurately infer genetic ancestry from -cancer-derived sequences. The current version can handle cancer-derived +The **RAIDS** (Robust Ancerstry Inference using Data Synthesis) package enables accurate and robust inference of genetic ancestry from human molecular data other than whole-genome or whole-exome sequences of cancer-free DNA. The current version can handle sequences of: -* tumor exomes +* whole genomes +* whole exomes * targeted gene panels -* RNA +* RNA, -The **RAIDS** package implements a data synthesis method that, for any given -cancer-derived sequence profile, enables -on the one hand, profile-specific inference -parameter optimization and on the other hand, a profile-specific inference -accuracy estimate. +including those from cancer-derived nucleic acids. The **RAIDS** package implements a +data synthesis method that, for any given +molecular profile of an idividual, enables, on the one hand, profile-specific inference +parameter optimization and, on the other hand, a profile-specific inference +accuracy estimate. By the molecular profile we mean a table of the individual's +germline genotypes at genome positions with sufficient read coverage in the +individual's input data, where sequence variants are frequent in the population reference data.

# Installation -To install the latest version from the -[RAIDS Github Website](https://github.com/KrasnitzLab/RAIDS "RAIDS Github Site"), -the `r CRANpkg("devtools")` package is required. +To install this package +from [Bioconductor](https://bioconductor.org/packages/RAIDS), start R +(version 4.3 or later) and enter: + ```{r installDemo01, eval=FALSE, warning=FALSE, message=FALSE} -## Load required package -library(devtools) +if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") -## Install the latest version of RAIDS -devtools::install_github('KrasnitzLab/RAIDS') +BiocManager::install("RAIDS") ```
@@ -103,1049 +96,409 @@ devtools::install_github('KrasnitzLab/RAIDS') -# Main Steps - +# Using RAIDS: step-by-step explanation -This is an overview of genetic ancestry inference from cancer-derived -molecular data: +This is an overview of the RAIDS inferential framework: -```{r graphMainSteps, echo=FALSE, fig.align="center", fig.cap="An overview of the genetic ancestry inference process.", out.width='120%', results='asis', warning=FALSE, message=FALSE} -knitr::include_graphics("MainSteps_v01.png") +```{r graphMainSteps, echo=FALSE, fig.align="center", fig.cap="An overview of the genetic ancestry inference procedure.", out.width='130%', results='asis', warning=FALSE, message=FALSE} +knitr::include_graphics("MainSteps_v05.png") ``` The main steps are: -1. Format reference data from the 1000 Genomes (1KG) (optional) -2. Format cancer-derived data set starting from BAM files -3. Optimize ancestry inference parameters -4. Infer ancestry for the subjects of the external study +**Step 1.** Set-up working directory and provide population reference files -These main steps are described in detail in the following. +**Step 2** Sample the reference data for donors whose genotypes will be used for synthesis and optimize ancestry inference parameters using synthetic data -You can also run the steps from Step 2 Sub-step 4 to the end, with the -default parameters, -in one command with the [wrapper](#wrapper) function. +**Step 3** Infer ancestry -
-
+**Step 4** Summarize and visualize the results -## Step 1 - Format reference data from the 1000 Genomes (optional) +These steps are described in detail in the following. +
+
-```{r graphStep1, echo=FALSE, fig.align="center", fig.cap="Step 1 - Formatting the information from 1000 Genomes (optional)", out.width='120%', results='asis', warning=FALSE, message=FALSE} -knitr::include_graphics("MainSteps_Step1_v01.png") -``` +## Step 1. Set-up working directory and provide population reference files -****** -At this step three important reference files are created: +### 1.1 Create a working directory structure -- The 1KG GDS File -- The 1KG SNV Annotation GDS file -- The 1KG SNV Retained VCF file +First, the following working directory structure should be created: -Note that these pre-processed -files are available at: +``` - -[https://labshare.cshl.edu/shares/krasnitzlab/aicsPaper](https://labshare.cshl.edu/shares/krasnitzlab/aicsPaper) - +############################################################################# +## Working directory structure +############################################################################# +workingDirectory/ + data/ + refGDS + profileGDS -The size of the 1KG GDS file -is 15GB. +``` -The 1KG GDS file is mapped on -hg38 [@Lowy-Gallego2019a]. +
-This section can be skipped if -you choose to use the pre-processed files. +The following code creates a temporary working directory structure where the +example will be run. -The execution of this step is explained in the [Formatting the information from 1000 Genomes (optional)](Create_1KG_GDS_file.html) -vignette. -
+```{r createDir, echo=TRUE, eval=TRUE, collapse=TRUE, warning=FALSE, message=FALSE} -## Step 2 - Prepare cancer-derived data for ancestry inference +############################################################################# +## Create a temporary working directory structure +## using the tempdir() function +############################################################################# +pathWorkingDirectory <- file.path(tempdir(), "workingDirectory") +pathWorkingDirectoryData <- file.path(pathWorkingDirectory, "data") -Molecular profiles in a cancer-derived data set must be formatted -following a series of sub-steps. +if (!dir.exists(pathWorkingDirectory)) { + dir.create(pathWorkingDirectory) + dir.create(pathWorkingDirectoryData) + dir.create(file.path(pathWorkingDirectoryData, "refGDS")) + dir.create(file.path(pathWorkingDirectoryData, "profileGDS")) +} -```{r graphStep2, echo=FALSE, fig.align="center", fig.cap="Step 2 - Formatting the information from an external study", out.width='120%', results='asis', warning=FALSE, message=FALSE} -knitr::include_graphics("MainSteps_Step2_v01.png") ```
-These are: - -1. Create a directory containing the 3 reference files from 1KG -2. Make a SNP pileup file for the profile -3. Create an RDS file containing information about the samples -4. Create a Sample GDS file (1 GDS file per sample) -5. Generate a pruned subset of the single nucleotide variants (SNVs) -6. Estimate the allelic frequency for the pruned SNVs +### 1.2 Download the population reference files -Note that a mapped BAM file is needed for each sample (step 2). -The reference -genome used for the mapping must be the same as the one used to generate the -1KG GDS file. The 1KG GDS file available for download ( -[https://labshare.cshl.edu/shares/krasnitzlab/aicsPaper](https://labshare.cshl.edu/shares/krasnitzlab/aicsPaper) -) is based on the hg38 genome build. -
+The population reference files should be downloaded into the *data/refGDS* +sub-directory. This following code downloads the complete pre-processed files +for 1000 Genomes (1KG), for the hg38 build of the human genome, in the GDS +format. The size of the 1KG GDS file is 15GB. -### Sub-Step 1. Create a directory containing the 3 required reference files +``` -The 3 required reference files may be downloaded: +############################################################################# +## How to download the pre-processed files for 1000 Genomes (1KG) (15 GB) +############################################################################# +cd workingDirectory +cd data/refGDS -```bash -#################################### -## The 1KG GDS file -#################################### wget https://labshare.cshl.edu/shares/krasnitzlab/aicsPaper/matGeno1000g.gds - -#################################### -## The 1KG SNV Annotation GDS file -#################################### wget https://labshare.cshl.edu/shares/krasnitzlab/aicsPaper/matAnnot1000g.gds +cd - -#################################### -## The 1KG SNV Retained VCF file -#################################### -wget https://labshare.cshl.edu/shares/krasnitzlab/aicsPaper/snvSel0.01.vcf.gz -``` - -The 3 files should be stored in the same directory. In the example below, -this directory is referred to as **path1KG**. - -For more information on creating your own reference files, -see the vignette -[Formatting the information from 1000 Genomes (optional)](Create_1KG_GDS_file.html). - -
- -### Sub-Step 2. Generate a SNP pileup file (1 file per profile) - -This step requires installation of the external code -[snp-pileup](https://github.com/mskcc/facets/tree/master/inst/extcode) which -is associated to the facets package [@Shen2016]. Given a VCF file -containing SNP locations, -[snp-pileup](https://github.com/mskcc/facets/tree/master/inst/extcode) -application outputs, for each location, -the read counts for the reference and alternative nucleotides from the BAM input. - -At this point, -[snp-pileup](https://github.com/mskcc/facets/tree/master/inst/extcode) is used -to create,for each cancer-derived profile, -a **Profile SNP pileup file** containing the read counts for the reference and -alternative nucleotides at each SNP position in the -reference 1KG GDS file. -The **1KG SNV Retained VCF file**, which contains the list of -retained 1KG SNVs, is required. - - -```{r graphStep2_SubStep2, echo=FALSE, fig.align="center", fig.cap="Generate a Sample SNP pileup file (1 file per sample)", out.width='100%', results='asis', warning=FALSE, message=FALSE} -knitr::include_graphics("MainSteps_Step2_SubStep2_SNP-Pileup_v01.mod.png") -``` - -Note that the name assigned to the **Profile SNP pileup file** will correspond -to the profile identifier (Name.ID) in the following analysis. For example, a -SNP pileup file called "Sample.01.txt.gz" would be associated to the -"Sample.01" profile. - -This is the command line to run -[snp-pileup](https://github.com/mskcc/facets/tree/master/inst/extcode): - -```bash -##################################################################### -## Description of the parameters -## -g : Compresses the output file with BGZF -## -d5000 : Sets the maximum depth to 5000 -## -q15 : Sets the minimum threshold for mapping quality to 15 -## -Q20 : Sets the minimum threshold for base quality to 20 -## -r0 : Sets the minimum read counts for a position to be output to 0 -## path1KG/snvSel0.01.vcf.gz : The SNP Retained VCF file containing the -## positions of all retained 1KG SNPs -## pathOut/Name.ID.txt : The name of the output Sample VCF file that will be -## compressed by the application -## FILEBAM.bam : The aligned reads from the sample used as input -##################################################################### -snp-pileup -g -d5000 -q15 -Q20 -r0 path1KG/snvSel0.01.vcf.gz pathOut/Name.ID.txt FILEBAM.bam ```
-### Sub-Step 3. Create a profile PED RDS file containing the information about the profiles +For illustrative purposes, a small +**population reference GDS file** (called _ex1_good_small_1KG.gds_) and a small +**population reference SNV Annotation GDS file** (called +_ex1_good_small_1KG_Annot.gds_) are +included in this package. Please note that these "mini-reference" files are for illustrative purposes only and cannot be used to infer genetic ancestry reliably. -An RDS file describing all the profiles to be analyzed -is required. +In this example, the mini-reference files are copied to the +*data/refGDS* directory. -The PED RDS file must contain a *data.frame* with these 5 columns: +```{r copyRefFile, echo=TRUE, eval=TRUE, collapse=TRUE, warning=FALSE, message=FALSE} -- **Name.ID**: The unique sample identifier. The profile VCF file -should be called "Name.ID.txt.gz". -- **Case.ID**: The patient identifier associated to the sample. -- **Sample.Type**: The information about the profile tissue source -(primary tumor, metastatic tumor, normal, etc..). -- **Diagnosis**: The donor's diagnosis. -- **Source**: The source of the profile sequence data (example: dbGAP_XYZ). +############################################################################# +## Load RAIDS package +############################################################################# +library(RAIDS) -Important: The row names of the *data.frame* must be the profiles' **Name.ID**. +############################################################################# +## The population reference GDS file and SNV Annotation GDS file +## need to be located in the same sub-directory. +## Note that the mini-reference GDS file used for this example is +## NOT sufficient for reliable inference. +############################################################################# +## Path to the demo 1KG GDS file is located in this package +dataDir <- system.file("extdata", package="RAIDS") +pathReference <- file.path(dataDir, "tests") -This file is referred to as the **Profile PED RDS file** (PED for pedigree). -Alternatively, the PED information can be saved in another type of -file (CVS, etc..) as long as the *data.frame* information can be regenerated -in R (with _read.csv()_ or else). +fileGDS <- file.path(pathReference, "ex1_good_small_1KG.gds") +fileAnnotGDS <- file.path(pathReference, "ex1_good_small_1KG_Annot.gds") +file.copy(fileGDS, file.path(pathWorkingDirectoryData, "refGDS")) +file.copy(fileAnnotGDS, file.path(pathWorkingDirectoryData, "refGDS")) -```{r pedCreation, echo=TRUE, warning=FALSE, message=FALSE} -############################################################## -## Location of the Profile PED RDS file to be created -############################################################## -dataDir <- system.file("extdata", package="RAIDS") -demoPEDFile <- file.path(dataDir, "Demo_PED.RDS") - -############################################################## -## Create a data frame with the mandatory columns -## All columns are in character string format (no factor) -############################################################## -pedDF <- data.frame(Name.ID=c("Sample_01", "Sample_02", "Sample_03"), - Case.ID=c("Patient_h11", "Patient_h12", "Patient_h18"), - Sample.Type=rep("Primary Tumor", 3), - Diagnosis=rep("Cancer", 3), - Source=rep("Databank B", 3), - stringsAsFactors=FALSE) - -############################################################## -## The row names must correspond to the name of the profiles -############################################################## -rownames(pedDF) <- pedDF$Name.ID - -############################################################## -## Save the data frame into a RDS file -############################################################## -saveRDS(object=pedDF, file=demoPEDFile) - -## Remove RDS file (created for demo purpose) -rm(demoPEDFile) ``` - +

+## Step 2 Ancestry inference with RAIDS -### Sub-Step 4 Create a profile GDS file (1 GDS file per profile) +### 2.1 Set-up the required directories -From here, you can run directly the [wrapper](#wrapper) function -or you can run each step separately as explained here. +All required directories are created at this point. In addition, the paths to +the reference files are stored in variables for later use. -This step requires 3 files as input: +```{r installRaids, echo=TRUE, eval=TRUE, collapse=TRUE, warning=FALSE, message=FALSE} -- The **1KG GDS file** -- The **Profile SNP pileup file** (one per profile present in the study) -- The **Profile PED RDS file** (one file with information for all profiles in the study) +############################################################################# +## The file path to the population reference GDS file +## is required (refGenotype will be used as input later) +## The file path to the population reference SNV Annotation GDS file +## is also required (refAnnotation will be used as input later) +############################################################################# +pathReference <- file.path(pathWorkingDirectoryData, "refGDS") -A *data.frame* containing the general information about the study is -also required. The *data.frame* must contain those 3 columns: +refGenotype <- file.path(pathReference, "ex1_good_small_1KG.gds") +refAnnotation <- file.path(pathReference, "ex1_good_small_1KG_Annot.gds") -- **study.id**: The study identifier (example: TCGA-BRCA). -- **study.desc**: The description of the study. -- **study.platform**: The type of sequencing (example: RNA-seq). +############################################################################# +## The output profileGDS directory, inside workingDirectory/data, must be +## created (pathProfileGDS will be used as input later) +############################################################################# +pathProfileGDS <- file.path(pathWorkingDirectoryData, "profileGDS") -Using all those inputs, the *createStudy2GDS1KG()* function will -generate a **Profile GDS file**. One **Profile GDS file** is created for each -sample passed to the *listSamples* argument. +if (!dir.exists(pathProfileGDS)) { + dir.create(pathProfileGDS) +} -```{r appendStudy2GDS1KG, echo=TRUE, eval=TRUE, collapse=TRUE, warning=FALSE, message=FALSE} -################################################################# -## Load required package -################################################################# -library(RAIDS) - -################################################################# -## The 1KG GDS file and the 1KG SNV Annotation GDS file -## need to be located in the same directory -## Note that the 1KG GDS file used for this example is a -## simplified version and CANNOT be used for any real analysis -################################################################# -dataDir <- system.file("extdata", package="RAIDS") -path1KG <- file.path(dataDir, "example", "gdsRef") - -gds1KG <- file.path(path1KG, "ex1kg.gds") - -################################################################# -## The Profile SNP pileup files (one per sample) need -## to be located in the same directory. -################################################################# -pathGeno <- file.path(dataDir, "example", "snpPileup") - -################################################################# -## The path where the Profile GDS files (one per profile) -## will be created need to be specified. -################################################################# -pathProfileGDS <- file.path(dataDir, "example", "out") - -################################################################# -## The path and file name for the PED RDS file -## will the information about the analyzed samples -################################################################# -filePED <- file.path(dataDir, "example", "pedEx.rds") -ped <- readRDS(filePED) -head(ped) - -################################################################# -## A data frame containing general information about the study -## is also required. The data frame must have -## those 3 columns: "study.id", "study.desc", "study.platform" -################################################################# -studyDF <- data.frame(study.id="MYDATA", - study.desc="Description", - study.platform="PLATFORM", - stringsAsFactors=FALSE) - -################################################################# -## The list of profiles to analyzed is passed to the function. -## The profiles must be present in the Profile PED RDS file see -## sub-step 4 and must have an associated Profile SNP pileup file. -## Not all profiles present in the Profile PED file need to -## be selected. -################################################################# -listSamples <- ped[, "Name.ID"] - -################################################################# -## This function creates one Profile GDS file for each -## profile present in the 'listProfiles' parameter. -################################################################# -createStudy2GDS1KG(pathGeno=pathGeno, - pedStudy=ped, - fileNameGDS=gds1KG, - listProfiles=listSamples, - studyDF=studyDF, - pathProfileGDS=pathProfileGDS, - genoSource="snp-pileup") - -################################################################# -## The Profile GDS file has been created in the -## directory pathProfileGDS using the name of the sample (ex1) -################################################################# -list.files(path=pathProfileGDS) ``` -
- -### Sub-Step 5. Generate a pruned subset of the single nucleotide variants (SNVs) - -The initial list of 1KG SNVs is pruned, using linkage disequilibrium analysis, -and a profile-specific subset of SNVs is retained for each profile. This -information is added to the **Profile GDS file**. - -The __pruningSample()__ function requires the **1KG GDS file** as input. It -also requires the path to the **Profile GDS file(s)**. - -Note that this step can require large disk space. - -```{r pruningSample, echo=TRUE, eval=TRUE, warning=FALSE, message=FALSE} -############################################################## -## Load required package -############################################################## -library(RAIDS) - -############################################################## -## The 1KG GDS file is required (demo version) -## Note that the 1KG GDS file used for this example is a -## simplified version and CANNOT be used for any real analyses -############################################################## -path1KG <- file.path(dataDir, "example", "gdsRef") - -fileGDS <- file.path(path1KG, "ex1kg.gds") - -## Open the 1KG GDS file (demo version) -gds1KG <- snpgdsOpen(fileGDS) - -############################################################## -## The pruning function is called with one profile as input at the time -############################################################## -for(i in seq_len(length(listSamples))) { - ## Compute the SNV pruned subset - ## studyID: Study identifier as defined in the preceding sub-step 4. - ## The study identifier must be the same that the one present in - ## Profile GDS file. - ## pathProfileGDS: All Profile GDS files must be in the same directory - ## sub-step 4. - pruningSample(gdsReference=gds1KG, - currentProfile=listSamples[i], - studyID=studyDF$study.id, - pathProfileGDS=pathProfileGDS) - - ## Profile GDS file for the current profile - ## The file name corresponds to the path + profile identifier + ".gds" - fileGDSProfile <- file.path(pathProfileGDS, paste0(listSamples[i], ".gds")) - - ## Add the genotype information for the list of pruned SNVs - ## into the Profile GDS file - ## The genotype information is extracted from the 1KG GDS file - add1KG2SampleGDS(gdsReference=gds1KG, fileProfileGDS=fileGDSProfile, - currentProfile=listSamples[i], - studyID=studyDF$study.id) - - ## Add annotation from the 1KG GDS file to the Profile GDS file - ## This is required. - addStudy1Kg(gdsReference=gds1KG, fileProfileGDS=fileGDSProfile, - verbose=FALSE) -} - -## Close the 1KG GDS file (it is important to always close the GDS files) -closefn.gds(gds1KG) -```
-### Sub-Step 6. Estimate the allelic fraction for the pruned SNVs - -The __estimateAllelicFraction()__ estimates the allele fraction for all -SNVs present in the pruned SNV dataset. Note that the function requires -different inputs for DNA and RNA profiles. +### 2.2 Sample the reference data for donors whose genotypes will be used for synthesis and optimize ancestry inference parameters using synthetic data -For the DNA samples, these 2 files are required: +With the 1KG reference, we recommend sampling 30 donor profiles per +sub-continental population. +For reproducibility, be sure to use the same random-number generator seed. -1. The **Profile GDS file** -2. The **1KG GDS file** +In the following code, only 2 individual profiles per +sub-continental population are sampled from the +demo population GDS file: -For the RNA samples, these 3 files are required: +```{r samplingProfiles, echo=TRUE, eval=TRUE, collapse=TRUE, warning=FALSE, message=FALSE} -1. The **Profile GDS file** -2. The **1KG GDS file** -3. The **1KG SNV Annotation GDS file** - -In both cases, the other required input is: - -1. The information about the length of the chromosomes -2. The profile identifier (it corresponds to the Profile GDS file name) -3. The study identifier (it should correspond to the one used previously) - -The information about the length of the chromosomes must be assigned into a -*vector* object. This is an example on how to retrieve the information. -There are alternative ways to retrieve this information, e.g., - -```{r extractChrLength, echo=TRUE, message=FALSE, warning=FALSE, collapse=TRUE} -################################################################### -## Load required library -################################################################### -library(BSgenome.Hsapiens.UCSC.hg38) - -################################################################### -## The length of each chromosome is required -## Chromosomes X, Y and M need relabeling (see below) -## There are alternative ways to retrieve this information -################################################################### -chrInfo <- integer(25L) -for(i in seq_len(22L)) { - chrInfo[i] <- length(Hsapiens[[paste0("chr", i)]]) -} -chrInfo[23] <- length(Hsapiens[["chrX"]]) -chrInfo[24] <- length(Hsapiens[["chrY"]]) -chrInfo[25] <- length(Hsapiens[["chrM"]]) - -chrInfo -``` +############################################################################# +## Set up the following random number generator seed to reproduce +## the expected results +############################################################################# +set.seed(3043) -
-
+############################################################################# +## Choose the profiles from the population reference GDS file for +## data synthesis. +## Here we choose 2 profiles per subcontinental population +## from the mini 1KG GDS file. +## Normally, we would use 30 randomly chosen profiles per +## subcontinental population. +############################################################################# +dataRef <- select1KGPopForSynthetic(fileReferenceGDS=refGenotype, + nbProfiles=2L) -The __estimateAllelicFraction()__ function processes one profile at the time, -as shown in this example. - - -```{r estimateAllelicFraction, echo=TRUE, eval=TRUE, collapse=TRUE, warning=FALSE, message=FALSE} -##################################################################### -## Load the required packages -##################################################################### -library(RAIDS) -library(gdsfmt) - -##################################################################### -## The 1KG GDS file is required -## The 1KG SNV Annotation GDS file is only required for RNA profiles -##################################################################### -path1KG <- file.path(dataDir, "example", "gdsRef") - -file1KG <- file.path(path1KG, "ex1kg.gds") -fileAnnot1KG <- file.path(path1KG, "exAnnot1kg.gds") - -## Open the 1KG GDS file -gds <- snpgdsOpen(file1KG) - -##################################################################### -## The information about the length of the chromosomes -##################################################################### -head(chrInfo) - -##################################################################### -## The function must be called for each profile -## This example only uses one profile -##################################################################### -## The first profile is used in the demo -profileName <- listSamples[1] - -##################################################################### -## The Profile GDS file is required -##################################################################### -## The name must correspond to the profile identifier -fileProfile <- file.path(pathProfileGDS, paste0(profileName, ".gds")) - -## Open the Profile GDS file in writing mode -gdsProfile <- openfn.gds(fileProfile, readonly=FALSE) - -################################################################### -## The estimation of the allelic fraction -################################################################### -## Estimate the allele fraction of the pruned SNVs -## The current example is for a DNA sample -## In the case of RNA sample, the function needs different inputs -## such as the 1KG Annotation GDS file and -## The 'blockID' should be as listed in the 1KG Annotation GDS file -## for the gene annotation of the SNVs -estimateAllelicFraction(gdsReference=gds, gdsProfile=gdsProfile, - currentProfile=profileName, - studyID=studyDF$study.id, - chrInfo=chrInfo) - -## Close both GDS files (important) -closefn.gds(gdsProfile) -closefn.gds(gds) ``` -This step must be executed for each profile present in the study.
+### 2.3 Infer ancestry -## Step 3 - Optimize the ancestry inference parameters +Within a single function call, data synthesis is performed, the synthetic +data are used to optimize the inference parameters and, with these, the +ancestry is inferred from the input sequence profile. -At this step, optimization of the parameters is required to maximize the -the ancestry inference accuracy (next step). +According to the type of input data (RNA or DNA sequence), a specific function +should be called. The *inferAncestry()* function (*inferAncestryDNA()* is +the same as *inferAncestry()* ) is used for DNA profiles while +the *inferAncestryGeneAware()* function is RNA specific. -```{r graphStep3, echo=FALSE, fig.align="center", fig.cap="Step 3 - Find the optimized parameters for the ancestry inference", out.width='120%', results='asis', warning=FALSE, message=FALSE} -knitr::include_graphics("MainSteps_Step3_v01.png") -``` +The *inferAncestry()* function requires a specific input format for the individual's +genotyping profile as explained in the Introduction. The format is set by +the *genoSource* parameter. -Two inference parameters to be optimized: +One of the allowed formats is VCF (*genoSource=c("VCF")*), with the following +mandatory fields: _GT_, _AD_ and _DP_. +The VCF file must be gzipped. -- _K_: the number of neighbors used to call the ancestry -- _D_: the number of PCA components retained +Also allowed is a "generic" file format (*genoSource=c("generic")*), specified as +a comma-separated table The following columns are mandatory: -The accuracy is evaluated using a synthetic data set created from merging one -cancer profile with multiple 1KG samples of known ancestry. Using the synthetic -profiles, a range of _K_ and _D_ values are tested. Through that process, the -_K_ and _D_ values are tuned to maximize accuracy. +* _Chromosome_: The name of the chromosome can be formatted as chr1 or 1 +* _Position_: The position on the chromosome +* _Ref_: The reference nucleotide +* _Alt_: The alternative nucleotide +* _Count_: The total read count +* _File1R_: Read count for the reference nucleotide +* _File1A_: Read count for the alternative nucleotide + +Note: a header with identical column names is required. -This step consists of two sub-steps: +In this example, the profile is from DNA source and requires the use of the +*inferAncestry()* function. -1. Generate the synthetic dataset -2. Compute the PCA-KNN ancestry call for each synthetic profile -
+```{r infere, echo=TRUE, eval=TRUE, collapse=TRUE, warning=FALSE, message=FALSE} -### Sub-Step 1. Generate the synthetic dataset - -A synthetic profile is generated through the merging of one cancer profile -with one 1KG sample of known ancestry. Multiple 1KG samples of different -ancestry are required to create a synthetic data set that will be able to -show the specific accuracy for each super-population. All the synthetic -profiles are saved in the **Profile GDS file** corresponding to the -cancer profile used to generate the synthetic data set. - -In summary, a fixed number of profiles for each super-population is extracted -from the 1KG study. The information is saved in the **Profile GDS file** -associated to the selected cancer profile. A synthetic profile is created for -each combination of one 1KG sample and cancer profile. All synthetic profiles -are then saved in the **Profile GDS file**. - -The three functions _select1KGPop()_, _prepSynthetic()_ and _syntheticGeno()_ -are used for the synthetic data synthesis. - -These 3 files are required: - -1. The **Profile GDS file** -2. The **1KG GDS file** -3. The **1KG Annotation GDS file** - - -```{r generateSynthetic, echo=TRUE, eval=TRUE, collapse=TRUE, warning=FALSE, message=FALSE} -#################################################################### -## Load required packages -#################################################################### -library(RAIDS) -library(gdsfmt) - -#################################################################### -## Randomly extract a fixed number of profiles for each -## subcontinental population present in the 1KG GDS file. -## When not enough profiles are available, all profiles are selected. -#################################################################### -gds1KG <- snpgdsOpen(file1KG) - -#################################################################### -## Fix seed to ensure reproducible results -#################################################################### -set.seed(3043) - -#################################################################### -## Select the profiles from 1KG for the synthetic data. -## Here we select 2 profiles from 1KG for each subcontinental-level -## Normally, we use 30 profiles from 1KG for each -## subcontinental-level but it is too big for the example. -## The 1KG GDS file in this example only has 6 profiles for each -## subcontinental-level (for demo purpose only) -#################################################################### -dataRef <- select1KGPop(gds1KG, nbProfiles=2L) - -## Extract the list of selected 1KG sample identifiers -listProfileRef <- dataRef$sample.id - -#################################################################### -## A data.frame with the description of the study for the synthetic -## data is required. -## The column names must be as shown -#################################################################### -syntheticStudyDF <- data.frame(study.id="MYDATA.Synthetic", - study.desc="MYDATA synthetic data", - study.platform="PLATFORM", - stringsAsFactors=FALSE) - -## The Profile GDS file is needed -fileProfile <- file.path(pathProfileGDS, paste0(profileName, ".gds")) - -#################################################################### -## The prepSynthetic() function prepares the annotation for -## the synthetic data -## The information is saved into the Profile GDS file -#################################################################### -prepSynthetic(fileProfileGDS=fileProfile, - listSampleRef=listProfileRef, - profileID=profileName, - studyDF=syntheticStudyDF, - prefix="1") - -#################################################################### -## Both the 1KG GDS file and the 1KG Annotation GDS file -## are required -#################################################################### -path1KG <- file.path(dataDir, "example", "gdsRef") -fileRefAnnot <- file.path(path1KG, "exAnnot1kg.gds") - -## Open 1KG Annotation GDS file -gdsRefAnnot <- openfn.gds(fileRefAnnot) - -#################################################################### -## The syntheticGeno() function generates the synthetic profiles. -## The synthetic profiles are saved in the Profile GDS file -#################################################################### -resG <- syntheticGeno(gdsReference=gds1KG, - gdsRefAnnot=gdsRefAnnot, - fileProfileGDS=fileProfile, - profileID=profileName, - listSampleRef=listProfileRef, - prefix="1") - -## Close both GDS files -closefn.gds(gds1KG) -closefn.gds(gdsRefAnnot) -``` +########################################################################### +## GenomeInfoDb and BSgenome are required libraries to run this example +########################################################################### +if (requireNamespace("GenomeInfoDb", quietly=TRUE) && + requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { -
+ ####################################################################### + ## Chromosome length information is required + ## chr23 is chrX, chr24 is chrY and chrM is 25 + ####################################################################### + genome <- BSgenome.Hsapiens.UCSC.hg38::Hsapiens + chrInfo <- GenomeInfoDb::seqlengths(genome)[1:25] + ####################################################################### + ## The demo SNP VCF file of the DNA profile donor + ####################################################################### + fileDonorVCF <- file.path(dataDir, "example", "snpPileup", "ex1.vcf.gz") -### Sub-Step 2. Perform the PCA-KNN ancestry call for each synthetic profile - -The ancestry is inferred for each synthetic profile. As the ancestry of origin -of the 1KG profile used to generate the synthetic profile is known, the -accuracy of the calls will be assessed for different parameters. - -```{r PCA.KNN.Synthetic, collapse=TRUE, echo=TRUE, eval=TRUE, warning=FALSE, message=FALSE} -##################################################################### -## Load required packages -##################################################################### -library(RAIDS) -library(gdsfmt) - -#################################################################### -## The 1KG GDS file is required -##################################################################### -## Open the 1KG GDS file -gds <- openfn.gds(file1KG) - -##################################################################### -## The path to the directory where the PCA results will be saved -## in RDS files. -## The directory must exist. -##################################################################### -pathOut <- file.path(pathProfileGDS) - -if(! file.exists(pathOut)) { - dir.create(pathOut) + ####################################################################### + ## The ancestry inference call + ####################################################################### + resOut <- inferAncestry(profileFile=fileDonorVCF, + pathProfileGDS=pathProfileGDS, + fileReferenceGDS=refGenotype, + fileReferenceAnnotGDS=refAnnotation, + chrInfo=chrInfo, + syntheticRefDF=dataRef, + genoSource=c("VCF")) } -##################################################################### -## Get the super-population information (known ancestry) for the -## reference profiles. This is the ground truth for the 1KG profiles. -##################################################################### -refKnownSuperPop <- getRef1KGPop(gds, "superPop") - -##################################################################### -## Fix the RNG seed as in step 6 to ensure the same results -##################################################################### -set.seed(3043) - -## Select the 1KG samples used to generate the synthetic dataset -## Already done in step 6, no need to repeat if the results have been saved -dataRef <- select1KGPop(gds, nbProfiles=2L) - -##################################################################### -## The function splitSelectByPop() generates a matrix with the -## reference samples split by sub-continental population -##################################################################### -sampleRM <- splitSelectByPop(dataRef) - -## Loop for all cancer samples with associated synthetic data -for(i in seq_len(length(listSamples))) { - - ## The Profile GDS file associated to the cancer profile - fileProfile <- file.path(pathProfileGDS, - paste0(listSamples[i], ".gds")) - - ## A sub-directory is created for the cancer sample - ## Beware that the number of files created will correspond to the - ## number of rows in the sampleRM matrix - pathOutProfile <- file.path(pathOut, listSamples[i]) - if(! file.exists(pathOutProfile)) { - dir.create(pathOutProfile) - } - - ## Open the Profile GDS file - gdsProfile <- snpgdsOpen(fileProfile) - - ## For each row of the sampleRM matrix - for(j in seq_len(nrow(sampleRM))) { - ## Run a PCA analysis using 1 synthetic profile from each - ## sub-continental ancestry - ## The synthetic profiles are projected on the 1KG PCA space - ## (the 1KG reference profiles used to generate the synthetic profiles - ## are removed from this PCA) - ## The K-nearest neighbors analysis is done using - ## a range of K and D values - syntKNN <- computePoolSyntheticAncestryGr(gdsProfile=gdsProfile, - sampleRM=sampleRM[j,], - studyIDSyn=syntheticStudyDF$study.id, - np=4L, - spRef=refKnownSuperPop, - eigenCount=15L) - - ## Results are saved - saveRDS(syntKNN$matKNN, file.path(pathOutProfile, - paste0("KNN.synt.", listSamples[i], ".", j, ".rds"))) - } - - ## Close Sample GDS file (important) - closefn.gds(gdsProfile) -} - -## Close 1KG GDS file (important) -closefn.gds(gds) ``` -
- -## Step 4 - Run the ancestry inference in the input data set - -The ancestry inference is done with the optimized _K_ and _D_ parameters. More -specifically, a PCA is generated using the 1KG reference samples and the -cancer sample. The _D_ parameter specifies the number of dimension for the -PCA. Then, the ancestry of the cancer sample is inferred using -a k-nearest neighbors classification method. The _K_ parameter specifies the -number of neighbors used for the classification. +The temporary files created in this example are deleted as follows. +```{r removeTmp, echo=TRUE, eval=TRUE, collapse=TRUE, warning=FALSE, message=FALSE} -```{r graphStep4, echo=FALSE, fig.align="center", fig.cap="Step 4 - Run the ancestry inference on the external study", out.width='120%', results='asis', warning=FALSE, message=FALSE} -knitr::include_graphics("MainSteps_Step4_v01.png") +####################################################################### +## Remove temporary files created for this demo +####################################################################### +unlink(pathWorkingDirectory, recursive=TRUE, force=TRUE) + ```
- -The PCA of the sample and KNN sample and Call the ancestry with the optimal -_K_ and _D_ parameters. - -Note: The formal selection of _K_ and _D_ parameters is done at this step but -all the synthetic data are prepared in the step 3. +
-```{r PCA.KNN.Sample, warning=FALSE, message=FALSE, collapse=TRUE, echo=TRUE, eval=TRUE} -#################################################################### -## Load required packages -#################################################################### -library(RAIDS) -library(gdsfmt) +## Step 3. Examine the value of the inference call -#################################################################### -## The reference 1KG GDS file is required -#################################################################### +The inferred ancestry and the optimal parameters are present in the *list* +object generated by the *inferAncestry()* and *inferAncestryGeneAware()* +functions. -## Open the 1KG GDS file -gdsReference <- openfn.gds(file1KG) - -## A directory where result files are going to be saved, -## where a sub-directory will be set up for each input profile -pathOut <- file.path(pathProfileGDS) -if(! file.exists(pathOut)) { - dir.create(pathOut) -} +```{r printRes, echo=TRUE, eval=TRUE, collapse=TRUE, warning=FALSE, message=FALSE} -#################################################################### -## The getRef1KGPop() function extract the known super-population -## of the reference samples. -## We expect the ancestry call from a synthetic profile to -## correspond to the known ancestry of the reference sample used to -## synthesize it. -#################################################################### -refKnownSuperPop <- getRef1KGPop(gdsReference, "superPop") - -## Loop on each profile -## Can also be run in parallel or on different clusters... -for(i in seq_len(length(listSamples))){ - - ## Extract the GDS file name and path for the current profile - fileProfile <- file.path(pathProfileGDS, paste0(listSamples[i], ".gds")) - - ## Directory where the KNN results of the synthetic profiles have been saved - pathKNN <- file.path(pathOut, listSamples[i]) - listFilesName <- dir(file.path(pathKNN), ".rds") - - ## List of the KNN result files from PCA run on synthetic data - listFiles <- file.path(file.path(pathKNN) , listFilesName) - - ## Open the Profile GDS file - gdsProfile <- snpgdsOpen(fileProfile) - - ## Select the optimal K and D parameters from the synthetic data results - ## Use those parameter to infer the ancestry of the specific profile - resCall <- computeAncestryFromSyntheticFile(gdsReference=gdsReference, - gdsProfile=gdsProfile, - listFiles=listFiles, - currentProfile=listSamples[i], - spRef=refKnownSuperPop, - studyIDSyn=syntheticStudyDF$study.id, - np=1L) - - saveRDS(resCall, file.path(pathOut, - paste0(listSamples[i], ".infoCall", ".rds"))) - - write.csv(resCall$Ancestry, - file.path(pathOut, paste0(listSamples[i], ".Ancestry",".csv")), - quote=FALSE, row.names=FALSE) - - ## Close the Profile GDS file (important) - closefn.gds(gdsProfile) -} +########################################################################### +## The output is a list object with multiple entries +########################################################################### +class(resOut) +names(resOut) -## Close the 1KG GDS file (important) -closefn.gds(gdsReference) - -#################################################################### -## Show the ancestry inference (SuperPop) and -## optimal number of PCA components D -## optimal number of neighbours K -#################################################################### -resAncestry <- read.csv(file.path(pathOut, - paste0(ped$Name.ID[1], ".Ancestry.csv"))) -resAncestry - -## Clean-up demo files -unlink(fileProfile, force=TRUE) -unlink(pathOut, recursive=TRUE, force=TRUE) ``` -The *computeAncestryFromSyntheticFile()* function generates 3 types of files -in the *OUTPUT* directory. +
-* The ancestry inference CSV file (".Ancestry.csv" file) -* The inference information RDS file (".infoCall.rds" file) -* The parameter information RDS files from the synthetic inference ("KNN.synt.*.rds" files in a sub-directory) +### 3.1 Inspect the inference and the optimal parameters -In addition, a sub-directory (named using the *profile ID*) is -also created. +Global ancestry is inferred using principal-component decomposition +followed by nearest neighbor classification. Two parameters are defined and optimized: +*D*, the number of the top principal directions retained and *k*, the number of nearest +neighbors. - +The results of the inference are provided as the *Ancestry* item in the *resOut* list. +It is a *data.frame* with the following columns: -# Wrapper function to run ancestry inference in one command +* _sample.id_: The unique identifier of the sample +* _D_: The optimal *D* inference parameter +* _k_: The optimal *k* inference parameter +* _SuperPop_: The inferred ancestry -The Steps 1 to Step 2 Sub-step 3 ( [Main Step](#mains) ) are required before -this section. -The wrapper function requires 4 files as input: +```{r print, echo=TRUE, eval=TRUE, collapse=TRUE, warning=FALSE, message=FALSE} -- The **1KG GDS file** -- The **1KG SNV Annotation GDS file** -- The **Sample SNP pileup file** (one per sample present in the study) -- The **Sample PED RDS file** (one file with information for all samples in the study) +########################################################################### +## The ancestry information is stored in the 'Ancestry' entry +########################################################################### +print(resOut$Ancestry) -A *data.frame* containing the general information about the study is -also required. The *data.frame* must contain those 3 columns: +``` -- **study.id**: The study identifier (example: TCGA-BRCA). -- **study.desc**: The description of the study. -- **study.platform**: The type of sequencing (example: RNA-seq). +
+### 3.2 Visualize the RAIDS performance for the synthetic data -```{r runExomeAncestry, echo=TRUE, eval=TRUE, collapse=TRUE, warning=FALSE, message=FALSE} -#################################################################### -## Load required packages -#################################################################### -library(RAIDS) -library(gdsfmt) -## Path to the demo 1KG GDS file is located in this package -dataDir <- system.file("extdata", package="RAIDS") +The *createAUROCGraph()* function enable the visualization of RAIDS +performance for the synthetic data, as a function of *D* and *k*. -################################################################# -## The path and file name for the PED RDS file -## will the information about the analyzed samples -################################################################# -filePED <- file.path(dataDir, "example", "pedEx.rds") -ped <- readRDS(filePED) -head(ped) - -################################################################# -## The 1KG GDS file and the 1KG SNV Annotation GDS file -## need to be located in the same directory -## Note that the 1KG GDS file used for this example is a -## simplified version and CANNOT be used for any real analysis -################################################################# -path1KG <- file.path(dataDir, "example", "gdsRef") - -fileGDS <- file.path(path1KG, "ex1kg.gds") -fileAnnotGDS <- file.path(path1KG, "exAnnot1kg.gds") - -################################################################# -## The Sample SNP pileup files (one per sample) need -## to be located in the same directory. -################################################################# -pathGeno <- file.path(dataDir, "example", "snpPileup") - -################################################################# -## The path where the Sample GDS files (one per sample) -## will be created need to be specified. -################################################################# -pathProfileGDS <- file.path(dataDir, "example", "out.tmp") - -pathOut <- file.path(dataDir, "example", "res.out") - -################################################################# -## A data frame containing general information about the study -## is also required. The data frame must have -## those 3 columns: "study.id", "study.desc", "study.platform" -################################################################# -studyDF <- data.frame(study.id="MYDATA", - study.desc="Description", - study.platform="PLATFORM", - stringsAsFactors=FALSE) - -#################################################################### -## Fix RNG seed to ensure reproducible results -#################################################################### -set.seed(3043) +```{r visualize, echo=TRUE, eval=TRUE, fig.align="center", fig.cap="RAIDS performance for the synthtic data.", results='asis', collapse=FALSE, warning=FALSE, message=FALSE} -#################################################################### -## Select the profiles from 1KG for the synthetic data. -## Here we select 2 profiles from 1KG for each subcontinental-level -## Normally, we use 30 profiles from 1KG for each -## subcontinental-level but it is too big for the example. -## The 1KG files in this example only have 6 profiles for each -## subcontinental-level (for demo purpose only) -#################################################################### -gds1KG <- snpgdsOpen(fileGDS) -dataRef <- select1KGPop(gds1KG, nbProfiles=2L) -closefn.gds(gds1KG) - -## Chromosome length information -## chrInfo[23] is chrX, chrInfo[24] is chrY and chrM is chrInfo[25] -chrInfo <- c(248956422L, 242193529L, 198295559L, 190214555L, - 181538259L, 170805979L, 159345973L, 145138636L, 138394717L, 133797422L, - 135086622L, 133275309L, 114364328L, 107043718L, 101991189L, 90338345L, - 83257441L, 80373285L, 58617616L, 64444167L, 46709983L, 50818468L, - 156040895L, 57227415L, 16569L) - -## A formal way to get the chormosome length information -## library(BSgenome.Hsapiens.UCSC.hg38) -## chrInfo <- integer(25L) -## for(i in seq_len(22L)){ chrInfo[i] <- -## length(Hsapiens[[paste0("chr", i)]])} -## chrInfo[23] <- length(Hsapiens[["chrX"]]) -## chrInfo[24] <- length(Hsapiens[["chrY"]]) -## chrInfo[25] <- length(Hsapiens[["chrM"]]) - -runExomeAncestry(pedStudy=ped, studyDF=studyDF, - pathProfileGDS=pathProfileGDS, - pathGeno=pathGeno, - pathOut=pathOut, - fileReferenceGDS=fileGDS, - fileReferenceAnnotGDS=fileAnnotGDS, - chrInfo=chrInfo, - syntheticRefDF=dataRef, - genoSource="snp-pileup") -list.files(pathOut) -list.files(file.path(pathOut, ped$Name.ID[1])) - -#################################################################### -## Show the ancestry inference (SuperPop) and -## optimal number of PCA component D -## optimal number neighbour K -#################################################################### -resAncestry <- read.csv(file.path(pathOut, - paste0(ped$Name.ID[1], ".Ancestry.csv"))) -resAncestry +########################################################################### +## Create a graph showing the perfomance for the synthetic data +## The output is a ggplot object +########################################################################### +createAUROCGraph(dfAUROC=resOut$paraSample$dfAUROC, title="Example ex1") -## Remove temporary files created for this demo -unlink(pathProfileGDS, recursive=TRUE, force=TRUE) -unlink(pathOut, recursive=TRUE, force=TRUE) ``` +In this illustrative example, the performance estimates are lower than expected +with a realistic sequence profile and a complete reference population file. +

-The *runExomeAncestry()* function generates 3 types of files -in the *OUTPUT* directory. +# Format population reference dataset (optional) -* The ancestry inference CSV file (".Ancestry.csv" file) -* The inference information RDS file (".infoCall.rds" file) -* The parameter information RDS files from the synthetic inference ("KNN.synt.*.rds" files in a sub-directory) -In addition, a sub-directory (named using the *profile ID*) is -also created. +```{r graphStep1, echo=FALSE, fig.align="center", fig.cap="Step 1 - Provide population reference data", out.width='120%', results='asis', warning=FALSE, message=FALSE} +knitr::include_graphics("Step1_population_file_v01.png") +``` -
-
+A population reference dataset with known ancestry is required to infer +ancestry. + +Three important reference files, containing formatted information about +the reference dataset, are required: +- The population reference GDS File +- The population reference SNV Annotation GDS file +- The population reference SNV Retained VCF file (optional) -# Pre-processed files are available -Pre-processed files, such as the 1KG GDS file, are available at this address: +The formats of those files are described in +the [Population reference dataset GDS files](Create_Reference_GDS_File.html) +vignette. +The reference files associated to +the Cancer Research associated paper are available. Note that these +pre-processed files are for 1000 Genomes (1KG), in hg38. The files are +available here: + + [https://labshare.cshl.edu/shares/krasnitzlab/aicsPaper](https://labshare.cshl.edu/shares/krasnitzlab/aicsPaper) + + +The size of the 1KG GDS file +is 15GB. -Beware that some of these files are large. +The 1KG GDS file is mapped on +hg38 [@Lowy-Gallego2019a]. + +This section can be skipped if +you choose to use the pre-processed files.

+ # Session info -Here is the output of `sessionInfo()` in the enviroment in which this document was -compiled: +Here is the output of `sessionInfo()` in the environment in which this +document was compiled: ```{r sessionInfo, echo=FALSE} sessionInfo() diff --git a/vignettes/Step1_population_file_v01.png b/vignettes/Step1_population_file_v01.png new file mode 100644 index 000000000..926954f47 Binary files /dev/null and b/vignettes/Step1_population_file_v01.png differ diff --git a/vignettes/Wrappers.Rmd b/vignettes/Wrappers.Rmd new file mode 100644 index 000000000..664b92656 --- /dev/null +++ b/vignettes/Wrappers.Rmd @@ -0,0 +1,685 @@ +--- +title: "Using wrappper functions" +author: Pascal Belleau, Astrid Deschênes and Alexander Krasnitz +output: + BiocStyle::html_document: + number_sections: yes + toc: true + pkgdown: + number_sections: yes + as_is: true +urlcolor: darkred +linkcolor: darkred +bibliography: aicsBiblio.bibtex +vignette: > + %\VignetteIndexEntry{Using wrappper functionss} + %\VignettePackage{RAIDS} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r style, echo=FALSE, results='hide', warning=FALSE, message=FALSE} +BiocStyle::markdown() + +suppressPackageStartupMessages({ + library(knitr) + library(RAIDS) + library(gdsfmt) +}) + +set.seed(121444) +``` + +
+**Package**: `r Rpackage("RAIDS")`
+**Authors**: `r packageDescription("RAIDS")[["Author"]]`
+**Version**: `r packageDescription("RAIDS")$Version`
+**Compiled date**: `r Sys.Date()`
+**License**: `r packageDescription("RAIDS")[["License"]]`
+ + +
+
+ + + + +This vignette explains, in further details, the used of the wrapper functions +that were developed for a previous release of RAIDS. + +While those functions are still +working, we recommend using the new functions as described in the main +vignette. + +
+
+ +# Main Steps + + +This is an overview of genetic ancestry inference from cancer-derived +molecular data: + +```{r graphMainSteps, echo=FALSE, fig.align="center", fig.cap="An overview of the genetic ancestry inference process.", out.width='130%', results='asis', warning=FALSE, message=FALSE} +knitr::include_graphics("MainSteps_v04.png") +``` + +The main steps are: + +**Step 1.** Format reference data from the population reference dataset (optional) + +**Step 2.1** Optimize ancestry inference parameters + +**Step 2.2** Infer ancestry for the subjects of the external study + +These steps are described in detail in the following. Steps 2.1 and 2.2 can be +run together using one wrapper function. + +
+
+ + +## Main Step - Ancestry Inference + +A wrapper function encapsulates multiple steps of the workflow. + +```{r graphWrapper, echo=FALSE, fig.align="center", fig.cap="Final step - The wrapper function encapsulates multiple steps of the workflow.", out.width='120%', results='asis', warning=FALSE, message=FALSE} +knitr::include_graphics("MainSteps_Wrapper_v04.png") +``` + +In summary, the wrapper function generates the synthetic dataset and uses +it to selected the optimal parameters before calling the genetic ancestry +on the current profiles. + +According to the type of input data (RNA or DNA), a specific wrapper function +is available. + +
+ +### DNA Data - Wrapper function to run ancestry inference on DNA data + +The wrapper function, called _runExomeAncestry()_, requires 4 files as input: + +- The **population reference GDS file** +- The **population reference SNV Annotation GDS file** +- The **Profile SNP file** (one per sample present in the study) +- The **Profile PED RDS file** (one file with information for all +profiles in the study) + +In addition, a *data.frame* containing the general information about the +study is also required. The *data.frame* must contain those 3 columns: + +- _study.id_: The study identifier (example: TCGA-BRCA). +- _study.desc_: The description of the study. +- _study.platform_: The type of sequencing (example: RNA-seq). + +
+ +#### **Population reference files** + +For demonstration purpose, a small +**population reference GDS file** (called _ex1_good_small_1KG.gds_) and a small +**population reference SNV Annotation GDS file** (called +_ex1_good_small_1KG_Annot.gds_) are +included in this package. Beware that those two files should not be used to +run a real ancestry inference.The results obtained with those files won't be +reliable. + +The required **population reference GDS file** and +**population reference SNV Annotation GDS file** should be stored in the same +directory. In the example below, this directory is referred to +as **pathReference**. + +
+ +#### **Profile SNP file** + +The **Profile SNP file** can be either in a VCF format or in a generic format. + +The **Profile SNP VCF file** follows the VCF standard with at least +those genotype fields: _GT_, _AD_ and _DP_. The identifier of the genotype +in the VCF file must correspond to the profile identifier _Name.ID_. +The SNVs must be germline variants and should include the genotype of the +wild-type homozygous at the selected positions in the reference. One file per +profile is need and the VCF file must be gzipped. + +Note that the name assigned to the **Profile SNP VCF file** has to +correspond to the profile identifier _Name.ID_ in the following analysis. +For example, a SNP file called "Sample.01.vcf.gz" would be +associated to the "Sample.01" profile. + +A generic SNP file can replace the VCF file. The **Profile SNP Generic file** +format is coma separated and the mandatory columns are: + +* _Chromosome_: The name of the chromosome +* _Position_: The position on the chromosome +* _Ref_: The reference nucleotide +* _Alt_: The aternative nucleotide +* _Count_: The total count +* _File1R_: The count for the reference nucleotide +* _File1A_: The count for the alternative nucleotide + +Beware that the starting position in the **population reference GDS File** is +zero (like BED files). The **Profile SNP Generic file** should also start +at position zero. + +Note that the name assigned to the **Profile SNP Generic file** has to +correspond to the profile identifier _Name.ID_ in the following analysis. +For example, a SNP file called "Sample.01.generic.txt.gz" would be +associated to the "Sample.01" profile. + +
+ +#### **Profile PED RDS file** + +The **Profile PED RDS file** must contain a *data.frame* describing all +the profiles to be analyzed. These 5 mandatory columns: + +- _Name.ID_: The unique sample identifier. The associated **profile SNP file** +should be called "Name.ID.txt.gz". +- _Case.ID_: The patient identifier associated to the sample. +- _Sample.Type_: The information about the profile tissue source +(primary tumor, metastatic tumor, normal, etc..). +- _Diagnosis_: The donor's diagnosis. +- _Source_: The source of the profile sequence data (example: dbGAP_XYZ). + +Important: The row names of the *data.frame* must be the profiles *Name.ID*. + +This file is referred to as the **Profile PED RDS file** (PED for pedigree). +Alternatively, the PED information can be saved in another type of +file (CVS, etc..) as long as the *data.frame* information can be regenerated +in R (with _read.csv()_ or else). + +
+ +#### **Example** + +This example run an ancestry inference on an exome sample. Both population +reference files are demonstration files and should not be +used for a real ancestry inference. Beware that running an ancestry inference +on real data will take longer to run. + +```{r runExomeAncestry, echo=TRUE, eval=TRUE, collapse=FALSE, warning=FALSE, message=FALSE} +############################################################################# +## Load required packages +############################################################################# +library(RAIDS) +library(gdsfmt) + +## Path to the demo 1KG GDS file is located in this package +dataDir <- system.file("extdata", package="RAIDS") + +############################################################################# +## Load the information about the profile +############################################################################# +data(demoPedigreeEx1) +head(demoPedigreeEx1) + +############################################################################# +## The population reference GDS file and SNV Annotation GDS file +## need to be located in the same directory. +## Note that the population reference GDS file used for this example is a +## simplified version and CANNOT be used for any real analysis +############################################################################# +pathReference <- file.path(dataDir, "tests") + +fileGDS <- file.path(pathReference, "ex1_good_small_1KG.gds") +fileAnnotGDS <- file.path(pathReference, "ex1_good_small_1KG_Annot.gds") + +############################################################################# +## A data frame containing general information about the study +## is also required. The data frame must have +## those 3 columns: "study.id", "study.desc", "study.platform" +############################################################################# +studyDF <- data.frame(study.id="MYDATA", + study.desc="Description", + study.platform="PLATFORM", + stringsAsFactors=FALSE) + +############################################################################# +## The Sample SNP VCF files (one per sample) need +## to be all located in the same directory. +############################################################################# +pathGeno <- file.path(dataDir, "example", "snpPileup") + +############################################################################# +## Fix RNG seed to ensure reproducible results +############################################################################# +set.seed(3043) + +############################################################################# +## Select the profiles from the population reference GDS file for +## the synthetic data. +## Here we select 2 profiles from the simplified 1KG GDS for each +## subcontinental-level. +## Normally, we use 30 profile for each +## subcontinental-level but it is too big for the example. +## The 1KG files in this example only have 6 profiles for each +## subcontinental-level (for demo purpose only). +############################################################################# +gds1KG <- snpgdsOpen(fileGDS) +dataRef <- select1KGPop(gds1KG, nbProfiles=2L) +closefn.gds(gds1KG) + +## GenomeInfoDb and BSgenome are required libraries to run this example +if (requireNamespace("GenomeInfoDb", quietly=TRUE) && + requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { + + ## Chromosome length information + ## chr23 is chrX, chr24 is chrY and chrM is 25 + chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] + + ########################################################################### + ## The path where the Sample GDS files (one per sample) + ## will be created needs to be specified. + ########################################################################### + pathProfileGDS <- file.path(tempdir(), "exampleDNA", "out.tmp") + + ########################################################################### + ## The path where the result files will be created needs to + ## be specified + ########################################################################### + pathOut <- file.path(tempdir(), "exampleDNA", "res.out") + + ## Example can only be run if the current directory is in writing mode + if (!dir.exists(file.path(tempdir(), "exampleDNA"))) { + + dir.create(file.path(tempdir(), "exampleDNA")) + dir.create(pathProfileGDS) + dir.create(pathOut) + + ######################################################################### + ## The wrapper function generates the synthetic dataset and uses it + ## to selected the optimal parameters before calling the genetic + ## ancestry on the current profiles. + ## All important information, for each step, are saved in + ## multiple output files. + ## The 'genoSource' parameter has 2 options depending on how the + ## SNP files have been generated: + ## SNP VCF files have been generated: + ## "VCF" or "generic" (other software) + ## + ######################################################################### + runExomeAncestry(pedStudy=demoPedigreeEx1, studyDF=studyDF, + pathProfileGDS=pathProfileGDS, + pathGeno=pathGeno, + pathOut=pathOut, + fileReferenceGDS=fileGDS, + fileReferenceAnnotGDS=fileAnnotGDS, + chrInfo=chrInfo, + syntheticRefDF=dataRef, + genoSource="VCF") + list.files(pathOut) + list.files(file.path(pathOut, demoPedigreeEx1$Name.ID[1])) + + ####################################################################### + ## The file containing the ancestry inference (SuperPop column) and + ## optimal number of PCA component (D column) + ## optimal number of neighbours (K column) + ####################################################################### + resAncestry <- read.csv(file.path(pathOut, + paste0(demoPedigreeEx1$Name.ID[1], ".Ancestry.csv"))) + print(resAncestry) + + ## Remove temporary files created for this demo + unlink(pathProfileGDS, recursive=TRUE, force=TRUE) + unlink(pathOut, recursive=TRUE, force=TRUE) + unlink(file.path(tempdir(), "exampleDNA"), recursive=TRUE, force=TRUE) + } +} + + +``` + +
+
+ +The *runExomeAncestry()* function generates 3 types of files +in the *pathOut* directory. + +* The ancestry inference CSV file (**".Ancestry.csv"** file) +* The inference information RDS file (**".infoCall.rds"** file) +* The parameter information RDS files from the synthetic inference +(__"KNN.synt.__*__.rds"__ files in a sub-directory) + +In addition, a sub-directory (named using the *profile ID*) is +also created. + +The inferred ancestry is stored in the ancestry inference CSV +file (**".Ancestry.csv"** file) which also contains those columns: + +* _sample.id_: The unique identifier of the sample +* _D_: The optimal PCA dimension value used to infer the ancestry +* _k_: The optimal number of neighbors value used to infer the ancestry +* _SuperPop_: The inferred ancestry + +
+
+ + + +### RNA data - Wrapper function to run ancestry inference on RNA data + +The process is the same as for the DNA but use the wrapper function +called _runRNAAncestry()_. Internally the data is process differently. +It requires 4 files as input: + +- The **population reference GDS file** +- The **population reference SNV Annotation GDS file** +- The **Profile SNP file** (one per sample present in the study) +- The **Profile PED RDS file** (one file with information for all +profiles in the study) + +A *data.frame* containing the general information about the study is +also required. The *data.frame* must contain those 3 columns: + +- _study.id_: The study identifier (example: TCGA-BRCA). +- _study.desc_: The description of the study. +- _study.platform_: The type of sequencing (example: RNA-seq). + +
+ +#### **Population reference files** + +For demonstration purpose, a small +**population reference GDS file** (called _ex1_good_small_1KG.gds_) and a small +**population reference SNV Annotation GDS file** (called +_ex1_good_small_1KG_Annot.gds_) are +included in this package. Beware that those two files should not be used to +run a real ancestry inference.The results obtained with those files won't be +reliable. + +The required **population reference GDS file** and +**population reference SNV Annotation GDS file** should be stored in the same +directory. In the example below, this directory is referred to +as **pathReference**. + +
+ +#### **Profile SNP file** + +The **Profile SNP file** can be either in a VCF format or in a generic format. + +The **Profile SNP VCF file** follows the VCF standard with at least +those genotype fields: _GT_, _AD_ and _DP_. The identifier of the genotype +in the VCF file must correspond to the profile identifier _Name.ID_. +The SNVs must be germline variants and should include the genotype of the +wild-type homozygous at the selected positions in the reference. One file per +profile is need and the VCF file must be gzipped. + +Note that the name assigned to the **Profile SNP VCF file** has to +correspond to the profile identifier _Name.ID_ in the following analysis. +For example, a SNP file called "Sample.01.vcf.gz" would be +associated to the "Sample.01" profile. + +A generic SNP file can replace the VCF file. The **Profile SNP Generic file** +format is coma separated and the mandatory columns are: + +* _Chromosome_: The name of the chromosome +* _Position_: The position on the chromosome +* _Ref_: The reference nucleotide +* _Alt_: The aternative nucleotide +* _Count_: The total count +* _File1R_: The count for the reference nucleotide +* _File1A_: The count for the alternative nucleotide + +Beware that the starting position in the **population reference GDS File** is +zero (like BED files). The **Profile SNP Generic file** should also start +at position zero. + +Note that the name assigned to the **Profile SNP Generic file** has to +correspond to the profile identifier _Name.ID_ in the following analysis. +For example, a SNP file called "Sample.01.generic.txt.gz" would be +associated to the "Sample.01" profile. + +
+ +#### **Profile PED RDS file** + +The **Profile PED RDS file** must contain a *data.frame* describing all +the profiles to be analyzed. These 5 mandatory columns: + +- _Name.ID_: The unique sample identifier. The associated **profile SNP file** +should be called "Name.ID.txt.gz". +- _Case.ID_: The patient identifier associated to the sample. +- _Sample.Type_: The information about the profile tissue source +(primary tumor, metastatic tumor, normal, etc..). +- _Diagnosis_: The donor's diagnosis. +- _Source_: The source of the profile sequence data (example: dbGAP_XYZ). + +Important: The row names of the *data.frame* must be the profiles _Name.ID_. + +This file is referred to as the **Profile PED RDS file** (PED for pedigree). +Alternatively, the PED information can be saved in another type of +file (CVS, etc..) as long as the *data.frame* information can be regenerated +in R (with _read.csv()_ or else). + +
+ +#### **Example** + +This example run an ancestry inference on an RNA sample. Both population +reference files are demonstration files and should not be +used for a real ancestry inference. Beware that running an ancestry inference +on real data will take longer to run. + +```{r runRNAAncestry, echo=TRUE, eval=TRUE, collapse=FALSE, warning=FALSE, message=FALSE} +############################################################################# +## Load required packages +############################################################################# +library(RAIDS) +library(gdsfmt) + +## Path to the demo 1KG GDS file is located in this package +dataDir <- system.file("extdata", package="RAIDS") + +############################################################################# +## Load the information about the profile +############################################################################# +data(demoPedigreeEx1) +head(demoPedigreeEx1) + +############################################################################# +## The population reference GDS file and SNV Annotation GDS file +## need to be located in the same directory. +## Note that the population reference GDS file used for this example is a +## simplified version and CANNOT be used for any real analysis +############################################################################# +pathReference <- file.path(dataDir, "tests") + +fileGDS <- file.path(pathReference, "ex1_good_small_1KG.gds") +fileAnnotGDS <- file.path(pathReference, "ex1_good_small_1KG_Annot.gds") + +############################################################################# +## A data frame containing general information about the study +## is also required. The data frame must have +## those 3 columns: "study.id", "study.desc", "study.platform" +############################################################################# +studyDF <- data.frame(study.id="MYDATA", + study.desc="Description", + study.platform="PLATFORM", + stringsAsFactors=FALSE) + +############################################################################# +## The Sample SNP VCF files (one per sample) need +## to be all located in the same directory. +############################################################################# +pathGeno <- file.path(dataDir, "example", "snpPileupRNA") + +############################################################################# +## Fix RNG seed to ensure reproducible results +############################################################################# +set.seed(3043) + +############################################################################# +## Select the profiles from the population reference GDS file for +## the synthetic data. +## Here we select 2 profiles from the simplified 1KG GDS for each +## subcontinental-level. +## Normally, we use 30 profile for each +## subcontinental-level but it is too big for the example. +## The 1KG files in this example only have 6 profiles for each +## subcontinental-level (for demo purpose only). +############################################################################# +gds1KG <- snpgdsOpen(fileGDS) +dataRef <- select1KGPop(gds1KG, nbProfiles=2L) +closefn.gds(gds1KG) + +## GenomeInfoDb and BSgenome are required libraries to run this example +if (requireNamespace("GenomeInfoDb", quietly=TRUE) && + requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly=TRUE)) { + + ## Chromosome length information + ## chr23 is chrX, chr24 is chrY and chrM is 25 + chrInfo <- GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg38::Hsapiens)[1:25] + + ############################################################################# + ## The path where the Sample GDS files (one per sample) + ## will be created needs to be specified. + ############################################################################# + pathProfileGDS <- file.path(tempdir(), "exampleRNA", "outRNA.tmp") + + ############################################################################# + ## The path where the result files will be created needs to + ## be specified + ############################################################################# + pathOut <- file.path(tempdir(), "exampleRNA", "resRNA.out") + + ## Example can only be run if the current directory is in writing mode + if (!dir.exists(file.path(tempdir(), "exampleRNA"))) { + + dir.create(file.path(tempdir(), "exampleRNA")) + dir.create(pathProfileGDS) + dir.create(pathOut) + + ######################################################################### + ## The wrapper function generates the synthetic dataset and uses it + ## to selected the optimal parameters before calling the genetic + ## ancestry on the current profiles. + ## All important information, for each step, are saved in + ## multiple output files. + ## The 'genoSource' parameter has 2 options depending on how the + ## SNP files have been generated: + ## SNP VCF files have been generated: + ## "VCF" or "generic" (other software) + ######################################################################### + runRNAAncestry(pedStudy=demoPedigreeEx1, studyDF=studyDF, + pathProfileGDS=pathProfileGDS, + pathGeno=pathGeno, + pathOut=pathOut, + fileReferenceGDS=fileGDS, + fileReferenceAnnotGDS=fileAnnotGDS, + chrInfo=chrInfo, + syntheticRefDF=dataRef, + blockTypeID="GeneS.Ensembl.Hsapiens.v86", + genoSource="VCF") + + list.files(pathOut) + list.files(file.path(pathOut, demoPedigreeEx1$Name.ID[1])) + + ######################################################################### + ## The file containing the ancestry inference (SuperPop column) and + ## optimal number of PCA component (D column) + ## optimal number of neighbours (K column) + ######################################################################### + resAncestry <- read.csv(file.path(pathOut, + paste0(demoPedigreeEx1$Name.ID[1], ".Ancestry.csv"))) + print(resAncestry) + + ## Remove temporary files created for this demo + unlink(pathProfileGDS, recursive=TRUE, force=TRUE) + unlink(pathOut, recursive=TRUE, force=TRUE) + unlink(file.path(tempdir(), "example"), recursive=TRUE, force=TRUE) + } +} + +``` + +
+
+ +The *runRNAAncestry()* function generates 3 types of files +in the *pathOut* directory. + +* The ancestry inference CSV file (**".Ancestry.csv"** file) +* The inference information RDS file (**".infoCall.rds"** file) +* The parameter information RDS files from the synthetic inference +(__"KNN.synt.__*__.rds"__ files in a sub-directory) + +In addition, a sub-directory (named using the *profile ID*) is +also created. + +The inferred ancestry is stored in the ancestry inference CSV +file (**".Ancestry.csv"** file) which also contains those columns: + +* _sample.id_: The unique identifier of the sample +* _D_: The optimal PCA dimension value used to infer the ancestry +* _k_: The optimal number of neighbors value used to infer the ancestry +* _SuperPop_: The inferred ancestry + + +
+
+ + +## Format population reference dataset (optional) + + +```{r graphStep1, echo=FALSE, fig.align="center", fig.cap="Step 1 - Formatting the information from the population reference dataset (optional)", out.width='120%', results='asis', warning=FALSE, message=FALSE} +knitr::include_graphics("MainSteps_Step1_v04.png") +``` + + +A population reference dataset with known ancestry is required to infer +ancestry. + +Three important reference files, containing formatted information about +the reference dataset, are required: + +- The population reference GDS File +- The population reference SNV Annotation GDS file +- The population reference SNV Retained VCF file + + +The format of those files are described +the [Population reference dataset GDS files](Create_Reference_GDS_File.html) +vignette. + +The reference files associated to +the Cancer Research associated paper are available. Note that these +pre-processed files are for 1000 Genomes (1KG), in hg38. The files are +available here: + + +[https://labshare.cshl.edu/shares/krasnitzlab/aicsPaper](https://labshare.cshl.edu/shares/krasnitzlab/aicsPaper) + + +The size of the 1KG GDS file +is 15GB. + +The 1KG GDS file is mapped on +hg38 [@Lowy-Gallego2019a]. + +This section can be skipped if +you choose to use the pre-processed files. + +
+
+ + + +# Session info + +Here is the output of `sessionInfo()` in the environment in which this +document was compiled: + +```{r sessionInfo, echo=FALSE} +sessionInfo() +``` + +
+
+ + +# References + diff --git a/vignettes/aicsBiblio.bibtex b/vignettes/aicsBiblio.bibtex index fdbb5a2ac..90f8d9be9 100644 --- a/vignettes/aicsBiblio.bibtex +++ b/vignettes/aicsBiblio.bibtex @@ -77,3 +77,30 @@ year = {2016} } +@article{Morgan2023, + author = {Morgan, M and Pagès H and Obenchain V and Hayden N}, + doi = {10.18129/B9.bioc.Rsamtools}, + edition = {2023}, + journal = {Bioconductor}, + title = {{Rsamtools: Binary alignment (BAM), FASTA, variant call (BCF), and tabix file import. R package version 2.16.0}}, + url = {https://bioconductor.org/packages/release/bioc/html/Rsamtools.html}, + year = {2023} +} + +@article{Zheng2012, + author = {Zheng, Xiuwen and Levine, David and Shen, Jess and Gogarten, Stephanie M. and Laurie, Cathy and Weir, Bruce S.}, + title = "{A high-performance computing toolset for relatedness and principal component analysis of SNP data}", + journal = {Bioinformatics}, + volume = {28}, + number = {24}, + pages = {3326-3328}, + year = {2012}, + month = {10}, + issn = {1367-4803}, + doi = {10.1093/bioinformatics/bts606}, + url = {https://doi.org/10.1093/bioinformatics/bts606}, + eprint = {https://academic.oup.com/bioinformatics/article-pdf/28/24/3326/48879518/bioinformatics\_28\_24\_3326.pdf}, +} + + +