diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..931662b --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,3 @@ +^.*\.Rproj$ +^\.Rproj\.user$ +^README\.Rmd$ diff --git a/.gitignore b/.gitignore index 7487699..a219344 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,6 @@ .Ruserdata .DS_Store inst/doc +.RDataTmp +docs/_build/html/.buildinfo +.github diff --git a/DESCRIPTION b/DESCRIPTION index dd87e87..fe5aca2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,16 +1,16 @@ Package: mixder Type: Package Title: A workflow for performing SNP mixture deconvolution -Version: 0.7.5 +Version: 1.0 Author: Rebecca Mitchell Maintainer: Rebecca Mitchell Description: A workflow for performing SNP mixture deconvolution of ForenSeq Kintelligence SNPs. Mixture deconvolution is performed using EuroForMix (https://github.com/oyvble/euroformix/). After mixture deconvolution, the user can choose to calculate metrics such as genotype accuracy and heterozygosity for a range of allele probability thresholds (useful for validation work) or create GEDmatch PRO reports utilizing specified allele probability thresholds. License: file LICENSE -Imports: dplyr, euroformix, ggplot2, glue, methods, prompter, readxl, rlang, shiny, shinyFiles, shinyjs, tibble, tidyr +Imports: dplyr, euroformix, ggplot2, glue, htmlwidgets, kgp, methods, plotly, prompter, readxl, rlang, shiny, shinyFiles, shinyjs, tibble, tidyr RoxygenNote: 7.3.2 Encoding: UTF-8 Depends: - R (>= 2.10) + R (>= 3.5) LazyData: true Suggests: knitr, diff --git a/NAMESPACE b/NAMESPACE index b9bdf38..5bf588c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,8 +1,10 @@ # Generated by roxygen2: do not edit by hand +export(ancestry_prediction) export(assigned_A2) export(calc_metrics) export(calculate_at) +export(centroids) export(check_3_or_4_col) export(check_allele_calls) export(check_allele_probabilities) @@ -37,8 +39,11 @@ export(run_indiv_efm_set) export(run_workflow) export(write_tables) import(dplyr) -import(ggplot2) +import(ggplot2, except = last_plot) import(glue) +import(kgp) +import(parallel) +import(plotly) import(prompter) import(shiny) import(shinyFiles) diff --git a/NEWS.md b/NEWS.md index 1988279..21b5328 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ + +## Version 1.0: October 3, 2025 +- Implemented ancestry prediction tool +- Additional population-specific allele frequency data included in the package. + ## Version 0.7.5: September 24, 2025 - Updated documentation
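For reviewers, a minimal usage sketch of the new ancestry_prediction() export announced above; the file name, output path, and sample ID below are hypothetical, and the argument list follows the function definition introduced in this changeset:

library(mixder)

## Inferred genotypes as written by run_workflow(): a table with at least
## rsid, Allele1, and Allele2 columns (hypothetical file name)
genos = read.table("Sample1_uncond_minor_Single_Inferred_Genotypes.txt",
                   header = TRUE, sep = "\t")

## PCA against the bundled 1000G genotypes; 3D PCA plots and centroid-distance
## tables are written under output/ancestry/PCA_plots
ancestry_prediction(report = genos, path = "output/ancestry", id = "Sample1",
                    analysis_type = "unconditioned", contrib_status = "minor",
                    testsnps = "Ancestry SNPs Only",
                    groups = "Superpopulations (AFR/AMR/EAS/EUR/SAS Only)")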
diff --git a/R/ancestry_prediction.R b/R/ancestry_prediction.R new file mode 100644 index 0000000..f9baaf6 --- /dev/null +++ b/R/ancestry_prediction.R @@ -0,0 +1,106 @@ +# ------------------------------------------------------------------------------------------------- +# Copyright (c) 2024, DHS. +# +# This file is part of MixDeR and is licensed under the BSD license: see LICENSE. +# +# This software was prepared for the Department of Homeland Security (DHS) by the Battelle National +# Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the +# National Biodefense Analysis and Countermeasures Center (NBACC), a Federally Funded Research and +# Development Center. +# ------------------------------------------------------------------------------------------------- + +#' Ancestry prediction using PCA +#' +#' @param report inferred genotypes +#' @param path write path +#' @param id sample ID +#' @param analysis_type mixture deconvolution type (conditioned vs. unconditioned) +#' @param contrib_status contributor status ("major" or "minor"), used in output file names +#' @param testsnps SNPs to use for the PCA ("All Autosomal SNPs" or "Ancestry SNPs Only") +#' @param groups How to color PCA plots (superpopulations and/or subpopulations) +#' +#' @import kgp +#' @import plotly +#' +#' @return NA +#' @export +#' +ancestry_prediction = function(report, path, id, analysis_type, contrib_status, testsnps, groups) { + if (testsnps == "All Autosomal SNPs") { + plotid="AllSNPs" + geno=mixder::ancestry_1000G_allsamples + } else { + plotid="AncestrySNPsOnly" + geno=mixder::ancestrysnps_1000G_allsamples + } + ncols=ncol(geno) + geno_filt=geno[,c(7:ncols)] + snps = data.frame("snp_id"=colnames(geno_filt)) + snps = snps %>% + separate(.data$snp_id, c("rsid", "ref_allele"), remove=F) + snps$order = seq_len(nrow(snps)) + merged_alleles = merge(snps, report, by="rsid", all.x=T) %>% + arrange(order) + ## count alleles matching the reference allele parsed from the 1000G column names + merged_alleles$num_alt = ifelse(merged_alleles$Allele1==merged_alleles$ref_allele & merged_alleles$Allele2==merged_alleles$ref_allele, 2, ifelse(merged_alleles$Allele1==merged_alleles$ref_allele | merged_alleles$Allele2==merged_alleles$ref_allele, 1, 0)) + + ## re-format to match 1000G samples + formatted_sample = merged_alleles %>% + select(.data$snp_id, .data$num_alt) %>% + pivot_wider(names_from=.data$snp_id, values_from=.data$num_alt) + + ## add unknown to 1000G genotypes + geno_filt_unk = rbind(geno_filt, formatted_sample) + + message("Running PCA
") + ## remove any SNPs with NA values (in unknown sample) + betaRedNAOmit <- geno_filt_unk %>% + select_if(~ !any(is.na(.))) + + ##perform PCA + pcaRed <- stats::prcomp(betaRedNAOmit, center=TRUE, scale=FALSE) + + ## create data table of PCs + PCs = data.frame(pcaRed$x) + + ## add unknown to ancestry and genotype IDs + geno_unk = geno %>% + add_row(IID="Unk") + ## merge genotypes with ancestry info; need to preserve order to match to PCA data + geno_ancestry=merge(geno_unk, mixder::ancestry_colors, by.x="IID", by.y="id") + + ## add ancestry info to PC data + newcol=ncols+1 + newcol2=ncols+4 + PCs_anc = cbind(geno_ancestry[,c(newcol:newcol2)], data.frame(PCs[,c(1:10)])) + + + centroids(groups, PCs_anc, glue("{path}/PCA_plots"), glue("{id}_{contrib_status}_{analysis_type}_{plotid}")) + + dir.create(file.path(path, "PCA_plots"), showWarnings = FALSE, recursive=TRUE) + + if ("Superpopulations (AFR/AMR/EAS/EUR/SAS Only)" %in% groups) { + pal = unique(geno_ancestry$superpop_color) + pal = setNames(pal, unique(geno_ancestry$reg)) + + fig = plot_ly(PCs_anc, x = ~PC1, y = ~PC2, z = ~PC3, color = ~reg, colors=pal, size=10) + fig = fig %>% add_markers() + fig = fig %>% layout(scene = list(xaxis = list(title = 'PC1'), + yaxis = list(title = 'PC2'), + zaxis = list(title = 'PC3')), + title=list(text=glue("{ncol(betaRedNAOmit)} SNPs; {id} {contrib_status} {analysis_type} Superpopulations"))) + + htmlwidgets::saveWidget(as_widget(fig), glue("{path}/PCA_plots/{id}_{contrib_status}_{analysis_type}_{plotid}_superpop_3D_PCAPlot.html")) + } + if ("Subpopulations" %in% groups) { + pal_sub = unique(geno_ancestry$color) + pal_sub = setNames(pal_sub, unique(geno_ancestry$population)) + + fig_sub = plot_ly(PCs_anc, x = ~PC1, y = ~PC2, z = ~PC3, color = ~population, colors=pal_sub, size=10) + fig_sub = fig_sub %>% add_markers() + fig_sub = fig_sub %>% layout(scene = list(xaxis = list(title = 'PC1'), + yaxis = list(title = 'PC2'), + zaxis = list(title = 'PC3')), + title=list(text=glue("{ncol(betaRedNAOmit)} SNPs; {id} {contrib_status} {analysis_type} Subpopulations"))) + + htmlwidgets::saveWidget(as_widget(fig_sub), glue("{path}/PCA_plots/{id}_{contrib_status}_{analysis_type}_{plotid}_subpopulations_3D_PCAPlot.html")) + } +} diff --git a/R/app.R b/R/app.R index 21fa428..4c6aecb 100644 --- a/R/app.R +++ b/R/app.R @@ -18,80 +18,77 @@ #' mixder = function() { ui = fluidPage( + use_prompt(), + tags$head( + tags$style( + HTML("[class*=hint--][aria-label]:after { + white-space: pre; + }") + ) + ), # App title - titlePanel("Running Mixture Deconvolution using EFM"), + titlePanel("MixDeR: A Mixture Deconvolution Workflow for FGG"), sidebarPanel(width=6, use_prompt(), - checkboxInput("run_mixdeconv", tags$span("Run EFM Mixture Deconvolution", tags$span( + checkboxInput("skip_ancestry", tags$span("Skip Ancestry Prediction Step", tags$span( icon( name = "question-circle", ) ) |> - add_prompt(message = "Check box to run mixture deconvolution using EuroForMix. 
Not required if run previously using MixDeR.", position = "right") - ), value = TRUE), + add_prompt(message = "Check box to skip ancestry prediction\nstep and move to mixture deconvolution.", position = "right") + ), value = FALSE), + conditionalPanel(condition = "input.skip_ancestry == 0", uiOutput("ancestry_text"), uiOutput("pcagroups"), uiOutput("ancestry_snps")), checkboxInput("uncond", tags$span("Unconditioned Analysis", tags$span(icon( name = "question-circle", ) ) |> - add_prompt(message = "An unconditioned analysis assumes no known contributor to the mixture and therefore does not require known genotypes to be provided.", position = "right") + add_prompt(message = "An unconditioned analysis assumes no known contributor to the mixture and therefore\ndoes not require known genotypes to be provided.", position = "right") ), value = FALSE), checkboxInput("cond", tags$span("Conditioned Analysis", tags$span(icon( name = "question-circle", ) ) |> - add_prompt(message = "A conditioned analysis assumes a single known contributor to the mixture. User will select which reference sample to condition on after providing the Reference Sample Report Folder.", position = "right") + add_prompt(message = "A conditioned analysis assumes a single known contributor to the mixture.\nUser will select which reference sample to condition on after providing\nthe Reference Sample Report Folder.", position = "right") ), value = FALSE), - selectInput("method", tags$span("Method to run after mixture deconvolution?", tags$span(icon( - name = "question-circle", - ) - ) |> - add_prompt(message = "Optional- if selecting Calculate Metrics, must include reference genotypes.", position = "right") - ), c("", "Calculate Metrics", "Create GEDmatch PRO Report")), - conditionalPanel(condition = "input.twofreqs == 0", uiOutput("freqselect")), - conditionalPanel(condition = "input.twofreqs == 1", uiOutput("freqselect_major"), uiOutput("freqselect_minor")), - checkboxInput("twofreqs", tags$span("Use Different Allele Frequency Files For Each Contributor?", tags$span(icon( - name = "question-circle", - ) - ) |> - add_prompt(message = "Will allow user to select or upload different allele frequency data for each contributor.", position = "right") - ), value = FALSE), - conditionalPanel(condition = "input.twofreqs == 0 & input.uploadfreq == 'Upload Custom'", uiOutput("freq_GetFile"), uiOutput("freq_text")), - conditionalPanel(condition = "input.twofreqs == 1 & input.uploadfreq_major == 'Upload Custom'", uiOutput("freq_GetFile_major"), uiOutput("freq_text_major")), - conditionalPanel(condition = "input.twofreqs == 1 & input.uploadfreq_minor == 'Upload Custom'", uiOutput("freq_GetFile_minor"), uiOutput("freq_text_minor")), shinyFilesButton("sample_GetFile", "Select a Sample Manifest File", "Select a sample manifest", multiple = FALSE, buttonType = "default", class = NULL), tags$span(icon( name = "question-circle", ) ) |> - add_prompt(message = "A tab-delimited file containing the list of samples to run MixDeR. 
It must contain two columns (SampleID and ReplicateID). Each row contains the ID of a single sample or the IDs of both the sample and replicate.", position = "right"), + add_prompt(message = "A tab-delimited file containing the list of samples to run MixDeR.\nIt must contain two columns (SampleID and ReplicateID).\nEach row contains the ID of a single sample or the IDs of\nboth the sample and replicate.", position = "right"), textOutput("sample_file"), shinyDirButton("kin_prefix", "Select Folder containing Mixture Sample Reports", "Please select a folder containing Mixture Sample Reports", buttonType = "default", class = NULL), tags$span(icon( name = "question-circle", - ) - ) |> - add_prompt(message = "Select a folder containing the Mixture Sample Reports.", position = "right"), + ) + ) |> + add_prompt(message = "Select a folder containing the Mixture Sample Reports.", position = "right"), textOutput("kin_inpath"), conditionalPanel(condition = "input.method == 'Calculate Metrics' | input.cond == 1", uiOutput("ref_GetFile"), uiOutput("ref_text")), conditionalPanel(condition = "input.cond == 1", uiOutput("ref_selector")), + conditionalPanel(condition = "input.skip_ancestry == 1", uiOutput("runmd"), uiOutput("twofreqs"), uiOutput("method")), + conditionalPanel(condition = "input.skip_ancestry == 1 & input.twofreqs == 0", uiOutput("freqselect")), + conditionalPanel(condition = "input.twofreqs == 1", uiOutput("freqselect_major"), uiOutput("freqselect_minor")), + conditionalPanel(condition = "input.skip_ancestry == 1 & input.twofreqs == 0 & input.uploadfreq == 'Upload Custom'", uiOutput("freq_GetFile"), uiOutput("freq_text")), + conditionalPanel(condition = "input.twofreqs == 1 & input.uploadfreq_major == 'Upload Custom'", uiOutput("freq_GetFile_major"), uiOutput("freq_text_major")), + conditionalPanel(condition = "input.twofreqs == 1 & input.uploadfreq_minor == 'Upload Custom'", uiOutput("freq_GetFile_minor"), uiOutput("freq_text_minor")), conditionalPanel(condition = "input.method == 'Calculate Metrics' | input.method == 'Create GEDmatch PRO Report'", uiOutput("filter_missing")), conditionalPanel(condition = "input.method == 'Calculate Metrics'", uiOutput("major_selector"), uiOutput("minor_selector"), uiOutput("metrics_A1min"), uiOutput("metrics_A1max"), uiOutput("metrics_A2min"), uiOutput("metrics_A2max")), - conditionalPanel(condition = "input.method == 'Create GEDmatch PRO Report'", uiOutput("min_cont_prob"), uiOutput("report_A1"), uiOutput("report_A2")), - conditionalPanel(condition = "input.run_mixdeconv == 1", uiOutput("sets"), uiOutput("keep_bins"), uiOutput("staticAT"), uiOutput("dynamicAT")), - textInput("output", tags$span("Output Folder Name", tags$span(icon( - name = "question-circle", - ) - ) |> - add_prompt(message = "This folder will be created in the specified SNP files folder to store generated output. 
- If not running EFM, it is required to specify the name of the folder containing previously generated EFM output.", position = "right") - ), "output"), + conditionalPanel(condition = "input.skip_ancestry == 0 | input.method == 'Create GEDmatch PRO Report'", uiOutput("min_cont_prob"), uiOutput("report_A1"), uiOutput("report_A2")), + conditionalPanel(condition = "input.skip_ancestry == 0 | input.run_mixdeconv == 1", uiOutput("sets"), uiOutput("keep_bins"), uiOutput("staticAT"), uiOutput("dynamicAT")), numericInput("minimum_snps", tags$span("Minimum Number of SNPs", tags$span( icon( name = "question-circle", ) ) |> - add_prompt(message = "The minimum number of SNPs to retain either for calculating metrics or for creating the GEDmatch PRO report.", position = "right") + add_prompt(message = "The minimum number of SNPs to retain either for\ncalculating metrics or for creating the GEDmatch PRO report.", position = "right") ), value=6000), + textInput("output", tags$span("Output Folder Name", tags$span(icon( + name = "question-circle", + ) + ) |> + add_prompt(message = "This folder will be created in the specified SNP files folder to store generated output.\nIf not running EFM, it is required to specify the name of the folder containing previously generated EFM output.", position = "right") + ), "output"), shinyjs::useShinyjs(), actionButton("Submit", "Run MixDeR") ), @@ -102,6 +99,63 @@ mixder = function() { # Define server server = function(input, output, session) { + output$ref_GetFile = renderUI({ + fluidRow( + column(10, + shinyDirButton("ref_GetFile", "Select Folder containing References" , + title = "Please select a folder containing reference genotypes:", multiple = FALSE, + buttonType = "default", class = NULL), + tags$span( + icon( + name = "question-circle", + ) + ) |> + add_prompt(message = "A folder containing the reference genotypes in either\nthe UAS Sample Report format or in a CSV file (see README for specific formatting).\nIf both are present, MixDeR will use the CSV file.", position = "right") + )) + }) + output$ref_text = renderUI({ + textOutput("refs_file") + }) + output$ref_selector = renderUI({ + if (isTruthy(refs())) { + sampleids = get_ids(refs()) + selectInput("ref_selector", + label = tags$span("Select Reference(s) to Condition on:", tags$span( + icon( + name = "question-circle", + ) + ) |> + add_prompt(message = "Select one or more references to condition on.\nIf more than one selected, the mixture(s) will be\nconditioned on each reference separately.", position = "right") + ), + choices = sampleids, multiple = TRUE) + } + }) + + output$runmd = renderUI({ + checkboxInput("run_mixdeconv", tags$span("Run EFM Mixture Deconvolution", tags$span( + icon( + name = "question-circle", + ) + ) |> + add_prompt(message = "Check box to run mixture deconvolution using EuroForMix.\nNot required if run previously using MixDeR.", position = "right") + ), value = TRUE) + }) + output$method = renderUI({ + selectInput("method", tags$span("Method to run after mixture deconvolution?", tags$span(icon( + name = "question-circle", + ) + ) |> + add_prompt(message = "Optional- if selecting Calculate Metrics,\nmust include reference genotypes.", position = "right") + ), c("", "Calculate Metrics", "Create GEDmatch PRO Report")) + }) + output$twofreqs = renderUI({ + checkboxInput("twofreqs", tags$span("Use Different Allele Frequency Files For Each Contributor?", tags$span(icon( + name = "question-circle", + ) + ) |> + add_prompt(message = "Will allow user to select or upload different\nallele 
frequency data for each contributor.", position = "right") + ), value = FALSE) + }) output$freq_GetFile = renderUI({ fluidRow(column(10,shinyFilesButton("freq_GetFile", "Select an Allele Frequency file" , title = "Please select an allele frequency file:", multiple = FALSE, @@ -135,37 +189,27 @@ server = function(input, output, session) { output$freq_text_minor = renderUI({ textOutput("freq_file_minor") }) + output$sample_file_text = renderUI({ - output$ref_GetFile = renderUI({ - fluidRow( - column(10, - shinyDirButton("ref_GetFile", "Select Folder containing References" , - title = "Please select a folder containing reference genotypes:", multiple = FALSE, - buttonType = "default", class = NULL), - tags$span( - icon( - name = "question-circle", - ) - ) |> - add_prompt(message = "A folder containing the reference genotypes in either the UAS Sample Report format or in a CSV file (see README for specific formatting). If both are present, MixDeR will use the CSV file.", position = "right") - )) }) - output$ref_text = renderUI({ - textOutput("refs_file") + output$ancestry_text = renderUI({ + HTML("Optional: Ancestry Prediction Tool using PCA
Use this tool to assist in predicting the ancestry of each contributor. The population-specific allele frequency file can then be used in the next mixture deconvolution step. Select the above box to skip this step and move forward to mixture deconvolution.
See the README for more information.

") }) - output$ref_selector = renderUI({ - if (isTruthy(refs())) { - sampleids = get_ids(refs()) - selectInput("ref_selector", - label = tags$span("Select Reference(s) to Condition on:", tags$span( - icon( - name = "question-circle", - ) - ) |> - add_prompt(message = "Select one or more references to condition on. If more than one selected, the mixture(s) will be conditioned on each reference separately.", position = "right") - ), - choices = sampleids, multiple = TRUE) - } + output$ancestry_snps = renderUI({ + selectInput("ancestry_snps", tags$span("SNPs to Use for Ancestry Prediction:", tags$span(icon( + name = "question-circle") + ) |> + prompter::add_prompt(message = "Select whether to use all autosomal SNPs or only ancestry SNPs for ancestry prediction.\nWarning: using all SNPs may result in longer analysis time!", position = "right") + ), c("Ancestry SNPs Only", "All Autosomal SNPs")) + }) + output$pcagroups = renderUI({ + checkboxGroupInput("pcagroups", tags$span("Select Groups for Ancestry Prediction:", tags$span( + icon( + name = "question-circle", + ) + ) |> + add_prompt(message = "PCA plots are colored by the selected grouping to identify ancestry. Both may be selected.\nSubpopulations include 26 subgroups making up the stated superpopulations.", position = "right") + ), choices = list("Superpopulations (AFR/AMR/EAS/EUR/SAS Only)", "Subpopulations"), selected = "Superpopulations (AFR/AMR/EAS/EUR/SAS Only)") }) output$major_selector = renderUI({ if (isTruthy(refs())) { @@ -176,7 +220,7 @@ server = function(input, output, session) { name = "question-circle", ) ) |> - add_prompt(message = "When calculating metrics for an unconditioned analysis, the major contributor sample ID is required for calculating the genotyping accuracy. Please select the correct sample from the Dropdown menu.", position = "right") + add_prompt(message = "When calculating metrics for an unconditioned analysis, the major contributor sample ID is required for calculating the genotyping accuracy.\nPlease select the correct sample from the Dropdown menu.", position = "right") ), choices = sampleids) } @@ -190,7 +234,7 @@ server = function(input, output, session) { name = "question-circle", ) ) |> - add_prompt(message = "When calculating metrics for an unconditioned analysis, the minor contributor sample ID is required for calculating the genotyping accuracy. Please select the correct sample from the Dropdown menu.", position = "right") + add_prompt(message = "When calculating metrics for an unconditioned analysis,\nthe minor contributor sample ID is required for calculating the genotyping accuracy.\nPlease select the correct sample from the Dropdown menu.", position = "right") ), choices = sampleids) } @@ -201,7 +245,7 @@ server = function(input, output, session) { name = "question-circle", ) ) |> - add_prompt(message = "The number of SNP bins a mixture SNP profile is divided into. This tells MixDeR how many SNP files to process for each mixture. The default is 10.", position = "right") + add_prompt(message = "The number of SNP bins a mixture SNP profile is divided into.\nThis tells MixDeR how many SNP files to process for each mixture.\nThe default is 10.", position = "right") ), value=10) }) output$keep_bins = renderUI({ @@ -210,7 +254,7 @@ server = function(input, output, session) { name = "question-circle", ) ) |> - add_prompt(message = "Check to use previously created bins, if detected by MixDeR. 
Otherwise, will create new SNP bins (and files) and write over any existing files.", position = "right") + add_prompt(message = "Check to use previously created bins, if detected by MixDeR.\nOtherwise, will create new SNP bins (and files) and write over any existing files.", position = "right") ), value=TRUE) }) output$staticAT = renderUI({ @@ -219,7 +263,7 @@ name = "question-circle", ) ) |> - add_prompt(message = "The static analytical threshold indicates the minimum number of reads required to include a called allele it in the deconvolution at a particular SNP. The default is 10 reads.", position = "right") + add_prompt(message = "The static analytical threshold indicates the minimum number of reads\nrequired to include a called allele in the deconvolution at a particular SNP.\nThe default is 10 reads.", position = "right") ), value=10) }) output$dynamicAT = renderUI({ @@ -228,7 +272,7 @@ name = "question-circle", ) ) |> - add_prompt(message = "The dynamic analytical thresholds indicates the percentage of number of total reads to set the minimum number of reads required to include a called allele it in the deconvolution at a particular SNP. For example, if a SNP has 100 total reads, a 10% dynamic AT would require an allele to have at least 10 reads to be included.", position = "right") + add_prompt(message = "The dynamic analytical threshold indicates the percentage of total reads\nused to set the minimum number of reads required to include a called allele in\nthe deconvolution at a particular SNP.\nFor example, if a SNP has 100 total reads, a 10% dynamic AT would require an allele\nto have at least 10 reads to be included.", position = "right") ), value=0.015) }) output$min_cont_prob = renderUI({ @@ -237,7 +281,7 @@ name = "question-circle", ) ) |> - add_prompt(message = "By default, the SNP profile for the minor contributor contains the specified minimum number of SNPs (i.e. the top 6,000 SNPs ordered by allele 1 probability). This will instead apply the allele 1 probability threshold for the created SNP profile for the minor contributor, assuming a SNP profile can be generated that meets the specified minimum number of SNPs.", position = "right") + add_prompt(message = "By default, the SNP profile for the minor contributor contains the\nspecified minimum number of SNPs (i.e. the top 6,000 SNPs ordered by allele 1 probability).\nThis will instead apply the allele 1 probability threshold for the\ncreated SNP profile for the minor contributor, assuming a SNP profile can be generated\nthat meets the specified minimum number of SNPs.", position = "right") )) })
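+ ## Note: the three frequency selectors below must stay in sync with load_freq(),
+ ## which matches these choice strings verbatim when mapping to the bundled
+ ## popFreq_* datasets.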
output$freqselect = renderUI({ @@ -245,41 +289,41 @@ name = "question-circle", ) ) |> - add_prompt(message = "Select allele frequency data. Options are global population datasets (1000G Phase 3 or gnomAD v4) or upload your own file. See README for more details.", position = "right") - ), choices = c("Global - 1000G", "Global - gnomAD", "Upload Custom")) + add_prompt(message = "Select allele frequency data.\nOptions are global population datasets (1000G Phase 3 or gnomAD v4),\nindividual superpopulation 1000G (AFR, AMR, EAS, EUR, SAS) datasets,\nor upload your own file.\nSee README for more details.", position = "right") + ), choices = c("Global - 1000G", "Global - gnomAD", "AFR - 1000G", "AMR - 1000G", "EAS - 1000G", "EUR - 1000G", "SAS - 1000G", "Upload Custom")) }) output$freqselect_major = renderUI({ selectInput("uploadfreq_major", tags$span("Select Allele Frequency Data for the Major Contributor", tags$span(icon( name = "question-circle", ) ) |> - add_prompt(message = "Select allele frequency data for the major contributor. Options are global population datasets (1000G Phase 3 or gnomAD v4) or upload your own file. See README for more details.", position = "right") - ), choices = c("Global - 1000G", "Global - gnomAD", "Upload Custom")) + add_prompt(message = "Select allele frequency data for the major contributor.\nOptions are global population datasets (1000G Phase 3 or gnomAD v4),\nindividual superpopulation 1000G (AFR, AMR, EAS, EUR, SAS) datasets,\nor upload your own file.\nSee README for more details.", position = "right") + ), choices = c("Global - 1000G", "Global - gnomAD", "AFR - 1000G", "AMR - 1000G", "EAS - 1000G", "EUR - 1000G", "SAS - 1000G", "Upload Custom")) }) output$freqselect_minor = renderUI({ selectInput("uploadfreq_minor", tags$span("Select Allele Frequency Data for the Minor Contributor", tags$span(icon( name = "question-circle", ) ) |> - add_prompt(message = "Select allele frequency data for the minor contributor. Options are global population datasets (1000G Phase 3 or gnomAD v4) or upload your own file. See README for more details.", position = "right") - ), choices = c("Global - 1000G", "Global - gnomAD", "Upload Custom")) + add_prompt(message = "Select allele frequency data for the minor contributor.\nOptions are global population datasets (1000G Phase 3 or gnomAD v4),\nindividual superpopulation 1000G (AFR, AMR, EAS, EUR, SAS) datasets,\nor upload your own file.\nSee README for more details.", position = "right") + ), choices = c("Global - 1000G", "Global - gnomAD", "AFR - 1000G", "AMR - 1000G", "EAS - 1000G", "EUR - 1000G", "SAS - 1000G", "Upload Custom")) }) output$report_A1 = renderUI({ - numericInput("A1_threshold", tags$span("Allele 1 Probability Threshold to create GEDmatch PRO Report", tags$span( + numericInput("A1_threshold", tags$span("Allele 1 Probability Threshold", tags$span( icon( name = "question-circle", ) ) |> - add_prompt(message = "This sets the allele 1 probability threshold for creating the GEDmatch PRO report. Any SNP with an allele 1 probability below the threshold will be removed from the report.", position = "right") + add_prompt(message = "This sets the allele 1 probability threshold for creating the GEDmatch PRO report.\nAny SNP with an allele 1 probability below the threshold will be removed from the report.", position = "right") ), value=0.99, min = 0, max = 1) }) output$report_A2 = renderUI({ - numericInput("A2_threshold", tags$span("Allele 2 Probability Threshold to create GEDmatch PRO Report", tags$span( + numericInput("A2_threshold", tags$span("Allele 2 Probability Threshold", tags$span( icon( name = "question-circle", ) ) |> - add_prompt(message = "This sets the allele 2 probability threshold for creating the GEDmatch PRO report. 
Any SNP with an allele 2 probability below the threshold will be removed from the report.", position = "right") + add_prompt(message = "This sets the allele 2 probability threshold for creating the GEDmatch PRO report.\nAny SNP with an allele 2 probability below the threshold will be removed from the report.", position = "right") ), value=0.60, min = 0, max = 1) }) output$filter_missing = renderUI({ @@ -288,7 +332,7 @@ server = function(input, output, session) { name = "question-circle", ) ) |> - add_prompt(message = "This will remove a SNP if either of its alleles are missing. If not checked, a SNP will be removed if allele 1 is missing or will report the SNP as homozygous for allele 1 if allele 2 is missing.", position = "right") + add_prompt(message = "This will remove a SNP if either of its alleles are missing.\nIf not checked, a SNP will be removed if allele 1 is missing\nor will report the SNP as homozygous for allele 1 if allele 2 is missing.", position = "right") )) }) output$metrics_A1min = renderUI({ @@ -297,7 +341,7 @@ server = function(input, output, session) { name = "question-circle", ) ) |> - add_prompt(message = "When calculating metrics, a range of allele 1 probability thresholds can be used to calculate the metrics at each combination of allele 1 and allele 2 probability thresholds. This sets the minimum allele 1 probability threshold. The threshold increases in increments of 0.01.", position = "right") + add_prompt(message = "When calculating metrics, a range of allele 1 probability thresholds\ncan be used to calculate the metrics at each combination of\nallele 1 and allele 2 probability thresholds.\nThis sets the minimum allele 1 probability threshold.\nThe threshold increases in increments of 0.01.", position = "right") ), value=0, min = 0, max = 1) }) output$metrics_A1max = renderUI({ @@ -306,7 +350,7 @@ server = function(input, output, session) { name = "question-circle", ) ) |> - add_prompt(message = "When calculating metrics, a range of allele 1 probability thresholds can be used to calculate the metrics at each combination of allele 1 and allele 2 probability thresholds. This sets the maximum allele 1 probability threshold. The threshold increases in increments of 0.01.", position = "right") + add_prompt(message = "When calculating metrics, a range of allele 1 probability thresholds\ncan be used to calculate the metrics at each combination of\nallele 1 and allele 2 probability thresholds.\nThis sets the maximum allele 1 probability threshold.\nThe threshold increases in increments of 0.01.", position = "right") ), value=1, min = 0, max = 1) }) output$metrics_A2min = renderUI({ @@ -315,7 +359,7 @@ server = function(input, output, session) { name = "question-circle", ) ) |> - add_prompt(message = "When calculating metrics, a range of allele 2 probability thresholds can be used to calculate the metrics at each combination of allele 1 and allele 2 probability thresholds. This sets the minimum allele 2 probability threshold. 
The threshold increases in increments of 0.01.", position = "right") + add_prompt(message = "When calculating metrics, a range of allele 2 probability thresholds\ncan be used to calculate the metrics at each combination of allele 1 and allele 2 probability thresholds.\nThis sets the minimum allele 2 probability threshold.\nThe threshold increases in increments of 0.01.", position = "right") ), value=0, min = 0, max = 1) }) output$metrics_A2max = renderUI({ @@ -324,7 +368,7 @@ server = function(input, output, session) { name = "question-circle", ) ) |> - add_prompt(message = "When calculating metrics, a range of allele 2 probability thresholds can be used to calculate the metrics at each combination of allele 1 and allele 2 probability thresholds. This sets the maximum allele 2 probability threshold. The threshold increases in increments of 0.01.", position = "right") + add_prompt(message = "When calculating metrics, a range of allele 2 probability thresholds\ncan be used to calculate the metrics at each combination of allele 1 and allele 2 probability thresholds.\nThis sets the maximum allele 2 probability threshold.\nThe threshold increases in increments of 0.01.", position = "right") ), value=1, min = 0, max = 1) }) volumes = getVolumes() @@ -384,7 +428,7 @@ server = function(input, output, session) { observeEvent(input$Submit, { sample_list = read.table(samplefile()$datapath, sep="\t", header=T) date = glue("{Sys.Date()}_{format(Sys.time(), '%H_%M_%S')}") - create_config(date, input$twofreqs, ifelse(!isTruthy(freq()$datapath), input$uploadfreq, freq()$datapath), ifelse(!isTruthy(freq_major()$datapath), input$uploadfreq_major, freq_major()$datapath), ifelse(!isTruthy(freq_minor()$datapath), input$uploadfreq_minor, freq_minor()$datapath), refs(), samplefile()$datapath, input$output, input$run_mixdeconv, input$uncond, input$ref_selector, input$method, input$sets, kin_inpath(), input$dynamicAT, input$staticAT, input$minimum_snps, input$A1_threshold, input$A2_threshold, input$A1_threshmin_metrics, input$A1_threshmax_metrics, input$A2_threshmin_metrics, input$A2_threshmax_metrics, input$major_selector, input$minor_selector, input$filter_missing) + create_config(date, input$twofreqs, ifelse(!isTruthy(freq()$datapath), input$uploadfreq, freq()$datapath), ifelse(!isTruthy(freq_major()$datapath), input$uploadfreq_major, freq_major()$datapath), ifelse(!isTruthy(freq_minor()$datapath), input$uploadfreq_minor, freq_minor()$datapath), refs(), samplefile()$datapath, input$output, input$run_mixdeconv, input$uncond, input$ref_selector, input$method, input$sets, kin_inpath(), input$dynamicAT, input$staticAT, input$minimum_snps, input$A1_threshold, input$A2_threshold, input$A1_threshmin_metrics, input$A1_threshmax_metrics, input$A2_threshmin_metrics, input$A2_threshmax_metrics, input$major_selector, input$minor_selector, input$filter_missing, input$skip_ancestry, input$ancestry_snps, input$pcagroups) if (isTruthy(refs())) { withProgress(message = "Loading References", value = 0, { if (!file.exists(glue("{refs()}/EFM_references.csv"))) { @@ -406,7 +450,7 @@ server = function(input, output, session) { incProgress((row-1)/n, detail = glue("On Sample {row} of {n}")) withCallingHandlers({ shinyjs::html(id = "text", html = "") - run_workflow(date, id, replicate_id, input$twofreqs, ifelse(!isTruthy(freq()$datapath), input$uploadfreq, freq()$datapath),ifelse(!isTruthy(freq_major()$datapath), input$uploadfreq_major, freq_major()$datapath), ifelse(!isTruthy(freq_minor()$datapath), input$uploadfreq_minor, 
freq_minor()$datapath), refData, refs(), samplefile()$datapath, input$output, input$run_mixdeconv, input$uncond, input$ref_selector, input$method, input$sets, kin_inpath(), input$dynamicAT, input$staticAT, input$minimum_snps, input$A1_threshold, input$A2_threshold, input$A1_threshmin_metrics, input$A1_threshmax_metrics, input$A2_threshmin_metrics, input$A2_threshmax_metrics, input$major_selector, input$minor_selector, input$min_cont_prob, input$keep_bins, input$filter_missing) + run_workflow(date, id, replicate_id, input$twofreqs, ifelse(!isTruthy(freq()$datapath), input$uploadfreq, freq()$datapath), ifelse(!isTruthy(freq_major()$datapath), input$uploadfreq_major, freq_major()$datapath), ifelse(!isTruthy(freq_minor()$datapath), input$uploadfreq_minor, freq_minor()$datapath), refData, refs(), samplefile()$datapath, input$output, input$run_mixdeconv, input$uncond, input$ref_selector, input$method, input$sets, kin_inpath(), input$dynamicAT, input$staticAT, input$minimum_snps, input$A1_threshold, input$A2_threshold, input$A1_threshmin_metrics, input$A1_threshmax_metrics, input$A2_threshmin_metrics, input$A2_threshmax_metrics, input$major_selector, input$minor_selector, input$min_cont_prob, input$keep_bins, input$filter_missing, input$skip_ancestry, input$ancestry_snps, input$pcagroups) }, message = function(m) { shinyjs::html(id = "text", html = m$message, add = TRUE) diff --git a/R/centroids.R b/R/centroids.R new file mode 100644 index 0000000..79296ff --- /dev/null +++ b/R/centroids.R @@ -0,0 +1,86 @@ + +#' Centroid Calculations and Plotting +#' +#' @param groups Groups used for PCA (Superpopulations and/or Subpopulations) +#' @param pca dataframe of PCs and ancestry +#' @param inpath input path +#' @param ID ID for creating file name(s) +#' +#' +#' @export +#' +centroids = function(groups, pca, inpath, ID) { + dir.create(file.path(inpath, "Centroids_Plots"), showWarnings = FALSE, recursive=TRUE) + + ancestry_colors = mixder::ancestry_colors %>% + add_row(id = "Unk", reg = "Unk", population = "Unk", color="red", superpop_color="red") %>% + add_row(id = "Centroid", reg = "Centroid", population = "Centroid", color = "black", superpop_color="black") + + if ("Superpopulations (AFR/AMR/EAS/EUR/SAS Only)" %in% groups) { + pca.centroids.pop = aggregate(pca[,5:14], list(Type = pca$reg), mean) + + superpop_dist=data.frame() + for (pop in unique(pca.centroids.pop$Type)){ + if (pop != "Unk"){ + superpop_dist_calc=stats::dist(rbind(pca.centroids.pop[pca.centroids.pop$Type == "Unk",2:4],pca.centroids.pop[pca.centroids.pop$Type == pop,2:4]), method = "euclidean") + superpop_dist=rbind(superpop_dist, data.frame(Superpopulation=pop, Distance=superpop_dist_calc)) + } + } + superpop_dist_final = superpop_dist %>% + arrange(.data$Distance) + + write.table(superpop_dist_final, glue("{inpath}/Centroids_Plots/{ID}_Superpopulations_centroids_Calculations.txt"), row.names=F, quote=F, col.names=T, sep="\t") + + plot_df_super = pca.centroids.pop %>% + filter(.data$Type!="Unk") + + pal = unique(mixder::ancestry_colors$superpop_color) + pal = setNames(pal, unique(mixder::ancestry_colors$reg)) + + fig_super = plot_ly() + + fig_super = fig_super %>% add_markers(data=pca, x = ~PC1, y = ~PC2, z = ~PC3, color = ~reg, colors=pal, size=10) + fig_super = fig_super %>% add_markers(data=plot_df_super, x=~PC1, y=~PC2, z=~PC3, color="Centroid", colors="black", size=10) + fig_super = fig_super %>% layout(scene = list(xaxis = list(title = 'PC1'), + yaxis = list(title = 'PC2'), + zaxis = list(title = 'PC3'))) + + htmlwidgets::saveWidget(as_widget(fig_super), glue("{inpath}/Centroids_Plots/{ID}_Superpopulations_centroids.html")) + + }
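+ ## The centroid distances above (and the subpopulation version below) are
+ ## Euclidean distances over PC1-PC3 only (columns 2:4 of the aggregated table),
+ ## sorted ascending, so the first row of each written table is the population
+ ## closest to the unknown sample.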
+ if ("Subpopulations" %in% groups) { + + pca.centroids.subpop = aggregate(pca[,5:14], list(Type = pca$population), mean) + + subpop_dist=data.frame() + for (pop in unique(pca.centroids.subpop$Type)){ + if (pop != "Unk"){ + subpop_dist_calc=stats::dist(rbind(pca.centroids.subpop[pca.centroids.subpop$Type == "Unk",2:4],pca.centroids.subpop[pca.centroids.subpop$Type == pop,2:4]), method = "euclidean") + subpop_dist=rbind(subpop_dist, data.frame(Subpopulation=pop, Distance=subpop_dist_calc)) + } + } + subpop_dist_final = subpop_dist %>% + arrange(.data$Distance) + + write.table(subpop_dist_final, glue("{inpath}/Centroids_Plots/{ID}_Subpopulations_centroids_Calculations.txt"), row.names=F, quote=F, col.names=T, sep="\t") + ## subpopulation plot + + plot_df_sub = pca.centroids.subpop %>% + filter(.data$Type!="Unk") + + pal = unique(mixder::ancestry_colors$color) + pal = setNames(pal, unique(mixder::ancestry_colors$population)) + + fig_sub = plot_ly() + + fig_sub = fig_sub %>% add_markers(data=pca, x = ~PC1, y = ~PC2, z = ~PC3, color = ~population, colors=pal, size=10) + fig_sub = fig_sub %>% add_markers(data=plot_df_sub, x=~PC1, y=~PC2, z=~PC3, color="Centroid", colors="black", size=10) + fig_sub = fig_sub %>% layout(scene = list(xaxis = list(title = 'PC1'), + yaxis = list(title = 'PC2'), + zaxis = list(title = 'PC3'))) + + htmlwidgets::saveWidget(as_widget(fig_sub), glue("{inpath}/Centroids_Plots/{ID}_Subpopulations_centroids.html")) + } +} diff --git a/R/create_config.R b/R/create_config.R index b211557..03e2326 100644 --- a/R/create_config.R +++ b/R/create_config.R @@ -37,10 +37,13 @@ #' @param major assumed major contributor of mixture #' @param minor assumed minor contributor of mixture #' @param filter_missing TRUE/FALSE whether to filter SNPs if second allele is missing (99) +#' @param skipancestry TRUE/FALSE whether to skip ancestry prediction step +#' @param pcasnps SNPs used for PCA (ancestry prediction) +#' @param pcagroups Groups used for PCA (ancestry prediction), either Superpopulations or Subpopulations #' #' @export #' -create_config = function(date, twofreqs, freq_all, freq_major, freq_minor, refs, sample_path, out_path, run_mixdeconv, unconditioned, cond, method, sets, kinpath, dynamicAT, staticAT, minimum_snps, A1_threshold, A2_threshold, A1min, A1max, A2min, A2max, major, minor, filter_missing){ +create_config = function(date, twofreqs, freq_all, freq_major, freq_minor, refs, sample_path, out_path, run_mixdeconv, unconditioned, cond, method, sets, kinpath, dynamicAT, staticAT, minimum_snps, A1_threshold, A2_threshold, A1min, A1max, A2min, A2max, major, minor, filter_missing, skipancestry, pcasnps, pcagroups){ config = setNames(data.frame(matrix(ncol=2, nrow=0)), c("Setting", "Value")) config = rbind(config, data.frame(Setting="MixDeR Version:", Value=getNamespaceVersion("mixder")[["version"]])) config = rbind(config, data.frame(Setting="EuroForMix Version:", Value=getNamespaceVersion("euroformix")[["version"]])) @@ -49,6 +52,11 @@ create_config = function(date, twofreqs, freq_all, freq_major, freq_minor, refs, config = rbind(config, data.frame(Setting="Path to references:", Value=refs)) } config = rbind(config, data.frame(Setting="Path to mixtures:", Value=kinpath)) + config = rbind(config, data.frame(Setting="Run Ancestry Prediction Step:", Value=!skipancestry))
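+ ## the PCA settings below are recorded only when the ancestry prediction step is run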
+ if (!isTruthy(skipancestry)) { + config = rbind(config, data.frame(Setting="SNPs Used for PCA (Ancestry Prediction):", Value=pcasnps)) + config = rbind(config, data.frame(Setting="Groups Used for PCA (Ancestry Prediction):", Value=pcagroups)) + } if (isTruthy(run_mixdeconv)) { if (isTruthy(twofreqs)) { config = rbind(config, data.frame(Setting="Frequency data Major Contributor:", Value=freq_major)) @@ -66,9 +74,9 @@ create_config = function(date, twofreqs, freq_all, freq_major, freq_minor, refs, config = rbind(config, data.frame(Setting="Running mixture deconvolution:", Value=run_mixdeconv)) config = rbind(config, data.frame(Setting="Running unconditioned deconvolution:", Value=unconditioned)) if (isTruthy(cond)) { - config = rbind(config, data.frame(Setting="Running conditioned deconvolution on references:", Value=cond)) + config = rbind(config, data.frame(Setting="Running conditioned deconvolution on reference:", Value=cond)) } else { - config = rbind(config, data.frame(Setting="Running conditioned deconvolution on references:", Value=FALSE)) + config = rbind(config, data.frame(Setting="Running conditioned deconvolution on reference:", Value=FALSE)) } if (method == "") { config = rbind(config, data.frame(Setting="Method run post deconvolution:", Value="None")) @@ -77,8 +85,8 @@ config = rbind(config, data.frame(Setting="Filtering SNPs if allele 2 is missing?", Value=filter_missing)) } if (method == "Create GEDmatch PRO Report"){ - config = rbind(config, data.frame(Setting="Allele 1 probability threshold (for GEDmatch PRO reports):", Value=A1_threshold)) - config = rbind(config, data.frame(Setting="Allele 2 probability threshold (for GEDmatch PRO reports:", Value=A2_threshold)) + config = rbind(config, data.frame(Setting="Allele 1 probability threshold (for GEDmatch PRO reports or ancestry prediction):", Value=A1_threshold)) + config = rbind(config, data.frame(Setting="Allele 2 probability threshold (for GEDmatch PRO reports or ancestry prediction):", Value=A2_threshold)) } else if (method == "Calculate Metrics") { config = rbind(config, data.frame(Setting="Allele 1 probability range for validation metric calculations:", Value=glue("{A1min}-{A1max}"))) config = rbind(config, data.frame(Setting="Allele 2 probability range for validation metric calculations:", Value=glue("{A2min}-{A2max}"))) diff --git a/R/data.R b/R/data.R index 10f62fb..8cc4ff9 100644 --- a/R/data.R +++ b/R/data.R @@ -50,10 +50,85 @@ #' \item{Probability}{Allele Probability} #' } #' -#' @source gnomAD v4 genomes (https://gnomad.broadinstitute.org/downloads#v4) +#' @source 1000 Genomes (https://www.internationalgenome.org/data) #' "popFreq_1000G" +#' Allele Frequency file using 1000G Phase 3 dataset for all AFR individuals +#' +#' Allele frequencies for 10,039 SNPs +#' +#' @format A list containing 10,039 elements (SNPs) with 4 rows: +#' \describe{ +#' \item{SNP}{SNP rsID} +#' \item{Allele}{Allele (A/C/G/T)} +#' \item{Probability}{Allele Probability} +#' } +#' +#' @source 1000 Genomes (https://www.internationalgenome.org/data) +#' +"popFreq_AFR" + +#' Allele Frequency file using 1000G Phase 3 dataset for all AMR individuals +#' +#' Allele frequencies for 10,039 SNPs +#' +#' @format A list containing 10,039 elements (SNPs) with 4 rows: +#' \describe{ +#' \item{SNP}{SNP rsID} +#' \item{Allele}{Allele (A/C/G/T)} +#' \item{Probability}{Allele Probability} +#' } +#' +#' @source 1000 Genomes (https://www.internationalgenome.org/data) +#' 
+"popFreq_AMR" + +#' Allele Frequency file using 1000G Phase 3 dataset for all EAS individuals +#' +#' Allele frequencies for 10,039 SNPs +#' +#' @format A list containing a 10039 elements (SNPs) with 4 rows: +#' \describe{ +#' \item{SNP}{SNP rsID} +#' \item{Allele}{Allele (A/C/G/T)} +#' \item{Probability}{Allele Probability} +#' } +#' +#' @source 1000 Genomes (https://www.internationalgenome.org/data) +#' +"popFreq_EAS" + +#' Allele Frequency file using 1000G Phase 3 dataset for all EUR individuals +#' +#' Allele frequencies for 10,039 SNPs +#' +#' @format A list containing a 10039 elements (SNPs) with 4 rows: +#' \describe{ +#' \item{SNP}{SNP rsID} +#' \item{Allele}{Allele (A/C/G/T)} +#' \item{Probability}{Allele Probability} +#' } +#' +#' @source 1000 Genomes (https://www.internationalgenome.org/data) +#' +"popFreq_EUR" + +#' Allele Frequency file using 1000G Phase 3 dataset for all SAS individuals +#' +#' Allele frequencies for 10,039 SNPs +#' +#' @format A list containing a 10039 elements (SNPs) with 4 rows: +#' \describe{ +#' \item{SNP}{SNP rsID} +#' \item{Allele}{Allele (A/C/G/T)} +#' \item{Probability}{Allele Probability} +#' } +#' +#' @source 1000 Genomes (https://www.internationalgenome.org/data) +#' +"popFreq_SAS" + #' SNP information json file #' #' Provides information on which SNPs need to be reverse complemented @@ -70,3 +145,33 @@ #' @source lusSTR Python package (https://github.com/bioforensics/lusSTR) #' "snpinfo" + + +#' 1000 Genomes genotypes for 10,030 autosomal SNPs for all 2,504 1000G samples +#' +#' Used to run PCA for ancestry prediction +#' +#' @format A dataframe containing 10,030 columns (each SNP) and 2,504 rows (each individual): +#' \describe{ +#' \item{SNP}{SNP rsID with reference allele} +#' \item{ID}{Sample ID of individual} +#' } +#' +#' @source 1000 Genomes (https://www.internationalgenome.org/data) +#' +"ancestry_1000G_allsamples" + + +#' 1000 Genomes genotypes for 54 ancestry SNPs for all 2,504 1000G samples +#' +#' Used to run PCA for ancestry prediction +#' +#' @format A dataframe containing 54 columns (each SNP) and 2,504 rows (each individual): +#' \describe{ +#' \item{SNP}{SNP rsID with reference allele} +#' \item{ID}{Sample ID of individual} +#' } +#' +#' @source 1000 Genomes (https://www.internationalgenome.org/data) +#' +"ancestrysnps_1000G_allsamples" diff --git a/R/gedmatch_metrics.R b/R/gedmatch_metrics.R index b978fa8..c8a6e90 100644 --- a/R/gedmatch_metrics.R +++ b/R/gedmatch_metrics.R @@ -20,7 +20,9 @@ #' @return data frame of compiled metrics #' @export #' -#'@importFrom stats sd +#' @importFrom stats sd +#' @rawNamespace import(ggplot2, except = last_plot) + gedmatch_metrics = function(report, A1_threshold, A2_threshold, min_num, path){ . 
diff --git a/R/gedmatch_metrics.R b/R/gedmatch_metrics.R index b978fa8..c8a6e90 100644 --- a/R/gedmatch_metrics.R +++ b/R/gedmatch_metrics.R @@ -20,7 +20,9 @@ #' @return data frame of compiled metrics #' @export #' -#'@importFrom stats sd +#' @importFrom stats sd +#' @rawNamespace import(ggplot2, except = last_plot) + gedmatch_metrics = function(report, A1_threshold, A2_threshold, min_num, path){ . = NULL report$A2_thresh = ifelse(report$A2_Prob >= A2_threshold & report$A2 != 99, report$A2, report$A1) diff --git a/R/load_freq.R b/R/load_freq.R index ca38a1e..b569283 100644 --- a/R/load_freq.R +++ b/R/load_freq.R @@ -26,24 +26,60 @@ load_freq = function(out_path, twofreqs, freq_both, freq_major, freq_minor) { } else if (freq_both == "Global - gnomAD"){ freq_minor = mixder::popFreq_gnomad freq_major = mixder::popFreq_gnomad - } else { + } else if (freq_both == "AFR - 1000G") { + freq_minor = mixder::popFreq_AFR + freq_major = mixder::popFreq_AFR + } else if (freq_both == "AMR - 1000G") { + freq_minor = mixder::popFreq_AMR + freq_major = mixder::popFreq_AMR + } else if (freq_both == "EAS - 1000G") { + freq_minor = mixder::popFreq_EAS + freq_major = mixder::popFreq_EAS + } else if (freq_both == "EUR - 1000G") { + freq_minor = mixder::popFreq_EUR + freq_major = mixder::popFreq_EUR + } else if (freq_both == "SAS - 1000G") { + freq_minor = mixder::popFreq_SAS + freq_major = mixder::popFreq_SAS + } else if (freq_both == "Upload Custom") { freq_minor = checking_af(freq_both, out_path) freq_major = freq_minor } } else { - if (freq_major != "Global - 1000G" & freq_major != "Global - gnomAD") { + if (freq_major == "Upload Custom") { freq_major = checking_af(freq_major, out_path) } else if (freq_major == "Global - 1000G") { freq_major = mixder::popFreq_1000G } else if (freq_major == "Global - gnomAD") { freq_major = mixder::popFreq_gnomad + } else if (freq_major == "AFR - 1000G") { + freq_major = mixder::popFreq_AFR + } else if (freq_major == "AMR - 1000G") { + freq_major = mixder::popFreq_AMR + } else if (freq_major == "EAS - 1000G") { + freq_major = mixder::popFreq_EAS + } else if (freq_major == "EUR - 1000G") { + freq_major = mixder::popFreq_EUR + } else if (freq_major == "SAS - 1000G") { + freq_major = mixder::popFreq_SAS } - if (freq_minor != "Global - 1000G" & freq_minor != "Global - gnomAD") { + + if (freq_minor == "Upload Custom") { freq_minor = checking_af(freq_minor, out_path) } else if (freq_minor == "Global - 1000G") { freq_minor = mixder::popFreq_1000G } else if (freq_minor == "Global - gnomAD") { freq_minor = mixder::popFreq_gnomad + } else if (freq_minor == "AFR - 1000G") { + freq_minor = mixder::popFreq_AFR + } else if (freq_minor == "AMR - 1000G") { + freq_minor = mixder::popFreq_AMR + } else if (freq_minor == "EAS - 1000G") { + freq_minor = mixder::popFreq_EAS + } else if (freq_minor == "EUR - 1000G") { + freq_minor = mixder::popFreq_EUR + } else if (freq_minor == "SAS - 1000G") { + freq_minor = mixder::popFreq_SAS } } return(list(freq_major, freq_minor))
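A usage sketch of the expanded frequency selection; the out_path is hypothetical, the choice strings are the ones registered in the GUI, and the single-file branch is assumed to be selected with twofreqs = FALSE:

## same population data for both contributors
freqs = load_freq("output/", twofreqs = FALSE, freq_both = "AFR - 1000G",
                  freq_major = NULL, freq_minor = NULL)
freq_major = freqs[[1]]
freq_minor = freqs[[2]]

## different population data per contributor
freqs = load_freq("output/", twofreqs = TRUE, freq_both = NULL,
                  freq_major = "EUR - 1000G", freq_minor = "AFR - 1000G")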
glue("{new_path}Replicates/{id}") log_name = glue("{id}_{replicate_id}") } snps_input = glue("{inpath}/snp_sets/") diff --git a/R/run_workflow.R b/R/run_workflow.R index e47c797..e63b520 100644 --- a/R/run_workflow.R +++ b/R/run_workflow.R @@ -43,16 +43,18 @@ #' @param minor_threshold If apply the allele 1 probability threshold to the minor contributor #' @param keep_bins To use existing SNP bins or create new bins (and files) #' @param filter_missing TRUE/FALSE whether to filter SNPs with either allele missing +#' @param skipancestry TRUE/FALSE whether to skip ancestry prediction +#' @param ancestrysnps SNPs to use for ancestry prediction (either ancestry only or all SNPs) +#' @param pcagroups How to color PCA plots (superpopulations and/or subpopulations) #' #' @export #' #'@import dplyr -#'@import ggplot2 #'@import glue #'@importFrom utils write.table write.csv read.table #'@importFrom grDevices dev.off png #'@importFrom methods show -run_workflow = function(date, id, replicate_id, twofreqs, freq_both, freq_major, freq_minor, refData, refs, sample_path, output, run_mixdeconv, unconditioned, cond, method, sets, kinpath, dynamicAT, staticAT, minimum_snps, A1_threshold, A2_threshold, A1min, A1max, A2min, A2max, major, minor, minor_threshold, keep_bins, filter_missing) { +run_workflow = function(date, id, replicate_id, twofreqs, freq_both, freq_major, freq_minor, refData, refs, sample_path, output, run_mixdeconv, unconditioned, cond, method, sets, kinpath, dynamicAT, staticAT, minimum_snps, A1_threshold, A2_threshold, A1min, A1max, A2min, A2max, major, minor, minor_threshold, keep_bins, filter_missing, skipancestry, ancestrysnps, pcagroups) { out_path = glue("{kinpath}/snp_sets/{output}/") if (replicate_id == "") { logfile = file(glue("{out_path}config_log_files/{date}/run_log_{id}_{date}.txt"), open = "wt") @@ -77,13 +79,16 @@ run_workflow = function(date, id, replicate_id, twofreqs, freq_both, freq_major, message(glue("Sample: {id}
")) message(glue("Replicate Sample: {replicate_id}
")) ## run EFM - if (run_mixdeconv) { + if (run_mixdeconv | !skipancestry) { attable = process_kinreport(id, replicate_id, kinpath, dynamicAT, staticAT) - if (twofreqs) { - efm_results_major = run_efm(date, popFreq[[1]], refData, id, replicate_id, kinpath, out_path, attable, sets, cond, uncond=unconditioned, keep_bins) - efm_results_minor = run_efm(date, popFreq[[2]], refData, id, replicate_id, kinpath, out_path, attable, sets, cond, uncond=unconditioned, keep_bins) + if (!skipancestry) { + efm_results_major = run_efm(date, popFreq[[1]], refData, id, replicate_id, kinpath, out_path, attable, sets, skipancestry, cond, uncond=unconditioned, keep_bins) + efm_results_minor = efm_results_major + } else if (twofreqs) { + efm_results_major = run_efm(date, popFreq[[1]], refData, id, replicate_id, kinpath, out_path, attable, sets, skipancestry, cond, uncond=unconditioned, keep_bins) + efm_results_minor = run_efm(date, popFreq[[2]], refData, id, replicate_id, kinpath, out_path, attable, sets, skipancestry, cond, uncond=unconditioned, keep_bins) } else { - efm_results_major = run_efm(date, popFreq[[1]], refData, id, replicate_id, kinpath, out_path, attable, sets, cond, uncond=unconditioned, keep_bins) + efm_results_major = run_efm(date, popFreq[[1]], refData, id, replicate_id, kinpath, out_path, attable, sets, skipancestry, cond, uncond=unconditioned, keep_bins) efm_results_minor = efm_results_major } } @@ -92,19 +97,25 @@ run_workflow = function(date, id, replicate_id, twofreqs, freq_both, freq_major, } else { type = "Replicates" } - write_path = paste0(out_path, type) + if (!skipancestry) { + write_path = paste0(out_path, "ancestry_prediction/", type) + } else { + write_path = paste0(out_path, type) + } ## create GEDmatch PRO report directory - if (method == "Create GEDmatch PRO Report") { + if (method == "Create GEDmatch PRO Report" | !skipancestry) { dir.create(file.path(write_path, "GEDMatchPROReports/Metrics"), showWarnings = FALSE, recursive=TRUE) } if (unconditioned) { - efm_v = getNamespaceVersion("euroformix")[["version"]] - if (substr(efm_v, 1,3)!="4.0" & substr(efm_v, 1,2) != "3.") { - major_c = "C2" - minor_c = "C1" - } else { - major_c = "C1" - minor_c = "C2" + if (run_mixdeconv | !skipancestry) { + efm_v = getNamespaceVersion("euroformix")[["version"]] + if (substr(efm_v, 1,3)!="4.0" & substr(efm_v, 1,2) != "3.") { + major_c = "C2" + minor_c = "C1" + } else { + major_c = "C1" + minor_c = "C2" + } } if (run_mixdeconv) { uncond_table_major = data.frame(efm_results_major[[1]]) %>% @@ -130,8 +141,28 @@ run_workflow = function(date, id, replicate_id, twofreqs, freq_both, freq_major, stop(glue("{uncond_filename_minor} does not exist. You may need to run EFM or check the correct SNP file input folder and Output folder are correct!")) } } - if (method == "Create GEDmatch PRO Report") { + if (method == "Create GEDmatch PRO Report" | !skipancestry) { message("Creating GEDmatch PRO report for major contributor in unconditioned analysis.
") + major_report = create_gedmatchpro_report(write_path, uncond_table_major, "C1", "major", minimum_snps, A1_threshold, A2_threshold, A1min, A1max, A2min, A2max, minor_threshold, filter_missing) + message("Creating GEDmatch PRO report for minor contributor in unconditioned analysis.
") + minor_report = create_gedmatchpro_report(write_path, uncond_table_minor, "C2", "minor", minimum_snps, A1_threshold, A2_threshold, A1min, A1max, A2min, A2max, minor_threshold, filter_missing) + if (method == "Create GEDmatch PRO Report") { + write.table(major_report[[1]], glue("{write_path}/GEDMatchPROReports/{id}_uncond_major_{type}_GEDmatchPROReport.txt"), col.names=T, sep="\t", row.names=F, quote=F) + write.csv(major_report[[2]], glue("{write_path}/GEDMatchPROReports/Metrics/{id}_uncond_major_{type}_GEDmatchPROReport_Metrics.csv"), row.names=F, quote=F) + png(glue("{write_path}/GEDMatchPROReports/Metrics/{id}_uncond_major_{type}_GEDmatchPROReport_Allele1_Probabilities_Density_Plot.png")) + show(major_report[[3]]) + dev.off() + write.table(minor_report[[1]], glue("{write_path}/GEDMatchPROReports/{id}_uncond_minor_{type}_GEDmatchPROReport.txt"), col.names=T, sep="\t", row.names=F, quote=F) + write.csv(minor_report[[2]], glue("{write_path}/GEDMatchPROReports/Metrics/{id}_uncond_minor_{type}_GEDmatchPROReport_metrics.csv"), row.names=F, quote=F) + png(glue("{write_path}/GEDMatchPROReports/Metrics/{id}_uncond_minor_{type}_GEDmatchPROReport_Allele1_Probabilities_Density_Plot.png")) + show(minor_report[[3]]) + dev.off() + } else if (!skipancestry) { + write.table(major_report[[1]], glue("{write_path}/{id}/unconditioned/{id}_uncond_major_{type}_Inferred_Genotypes.txt"), col.names=T, sep="\t", row.names=F, quote=F) + ancestry_prediction(major_report[[1]], glue("{write_path}/{id}/unconditioned/"), id, "unconditioned", "major", ancestrysnps, pcagroups) + write.table(minor_report[[1]], glue("{write_path}/{id}/unconditioned/{id}_uncond_minor_{type}_Inferred_Genotypes.txt"), col.names=T, sep="\t", row.names=F, quote=F) + ancestry_prediction(minor_report[[1]], glue("{write_path}/{id}/unconditioned/"), id, "unconditioned", "minor", ancestrysnps, pcagroups) + } major_report = create_gedmatchpro_report(write_path, uncond_table_major, major_c, "major", minimum_snps, A1_threshold, A2_threshold, A1min, A1max, A2min, A2max, minor_threshold, filter_missing) write.table(major_report[[1]], glue("{write_path}/GEDMatchPROReports/{id}_uncond_major_{type}_GEDmatchPROReport.txt"), col.names=T, sep="\t", row.names=F, quote=F) write.csv(major_report[[2]], glue("{write_path}/GEDMatchPROReports/Metrics/{id}_uncond_major_{type}_GEDmatchPROReport_Metrics.csv"), row.names=F, quote=F) @@ -157,7 +188,7 @@ run_workflow = function(date, id, replicate_id, twofreqs, freq_both, freq_major, if (isTruthy(cond)) { i = 3 for (cond_on in cond) { - if (run_mixdeconv) { + if (run_mixdeconv | !skipancestry) { mix_ratios = efm_results_major[[i+1]] write.table(mix_ratios, glue("{write_path}/{id}/conditioned/unk_conditioned_on_{cond_on}_mixture_ratios.tsv"), col.names=T, sep="\t", row.names=F, quote=F) contrib_status = ifelse(mean(mix_ratios$C1_Prob) > mean(mix_ratios$C2_Prob), "minor", "major") @@ -184,14 +215,19 @@ run_workflow = function(date, id, replicate_id, twofreqs, freq_both, freq_major, stop(glue("File {cond_filename} does not exist. You may need to run EFM or check the correct SNP file input folder and Output folder are correct!")) } } - if (method == "Create GEDmatch PRO Report") { + if (method == "Create GEDmatch PRO Report" | !skipancestry) { message(glue("Creating GEDmatch PRO report for {contrib_status} contributor conditioned on {cond_on} in conditioned analysis.
")) cond_report = create_gedmatchpro_report(write_path, get(glue("efm_table_{contrib_status}")), "C2", contrib_status, minimum_snps, A1_threshold, A2_threshold, A1min, A1max, A2min, A2max, minor_threshold, filter_missing) - write.table(cond_report[[1]], glue("{write_path}/GEDMatchPROReports/{id}_{contrib_status}_contrib_conditioned_on_{cond_on}_{type}_GEDmatchPROReport.txt"), col.names=T, sep="\t", row.names=F, quote=F) - write.csv(cond_report[[2]], glue("{write_path}/GEDMatchPROReports/Metrics/{id}_{contrib_status}_contrib_conditioned_on_{cond_on}_{type}_GEDmatchPROReport_Metrics.csv"), row.names=F, quote=F) - png(glue("{write_path}/GEDMatchPROReports/Metrics/{id}_{contrib_status}_contrib_{type}_GEDmatchPROReport_Allele1_Probabilities_Density_Plot.png")) - show(cond_report[[3]]) - dev.off() + if (method == "Create GEDmatch PRO Report") { + write.table(cond_report[[1]], glue("{write_path}/GEDMatchPROReports/{id}_{contrib_status}_contrib_conditioned_on_{cond_on}_{type}_GEDmatchPROReport.txt"), col.names=T, sep="\t", row.names=F, quote=F) + write.csv(cond_report[[2]], glue("{write_path}/GEDMatchPROReports/Metrics/{id}_{contrib_status}_contrib_conditioned_on_{cond_on}_{type}_GEDmatchPROReport_Metrics.csv"), row.names=F, quote=F) + png(glue("{write_path}/GEDMatchPROReports/Metrics/{id}_{contrib_status}_contrib_{type}_GEDmatchPROReport_Allele1_Probabilities_Density_Plot.png")) + show(cond_report[[3]]) + dev.off() + } else if (!skipancestry) { + write.table(cond_report[[1]], glue("{write_path}/{id}/conditioned/{id}_{contrib_status}_contrib_conditioned_on_{cond_on}_{type}_InferredGenotypes.txt"), col.names=T, sep="\t", row.names=F, quote=F) + ancestry_prediction(cond_report[[1]], glue("{write_path}/{id}/conditioned/"), id, paste0("conditioned_on_", cond_on), contrib_status, ancestrysnps, pcagroups) + } } else if (method == "Calculate Metrics") { unk = ifelse(contrib_status == "major", major, minor) ref = format_ref(refData, unk, refs) @@ -201,11 +237,13 @@ run_workflow = function(date, id, replicate_id, twofreqs, freq_both, freq_major, i = i + 2 } } - if (method == "") { + if (!skipancestry) { + message("Ancestry Prediction complete!") + } else if (run_mixdeconv) { message("Mixture Deconvolution complete!") } else if (method == "Calculate Metrics") { message("Calculating Metrics Complete!") - } else { + } else if (method == "Create GEDmatch PRO Report") { message("Creating GEDmatch PRO Reports Complete!") } #shiny::stopApp() diff --git a/README.Rmd b/README.Rmd index 5841dc5..ee89039 100644 --- a/README.Rmd +++ b/README.Rmd @@ -13,7 +13,8 @@ knitr::opts_chunk$set( ) ``` -# MixDeR - Current Version: 0.7.5 +# MixDeR - Current Version: 1.0 + @@ -24,6 +25,9 @@ This method requires extensive validation of the settings. MixDeR provides the o _Note: MixDeR (and EFM) assume the mixture samples are composed of two contributors. MixDeR is able to identify and alert the user to samples that may be potentially either single source or consist of a mixture with a large mixture ratio (i.e. > 1:100 ratio between contributors). In these scenarios, the user is warned to be cautious with the minor contributor inferred genotyping results._ + +MixDeR version 1.0 and later provides the option to perform ancestry prediction using principal component analysis (PCA). Additional information can be found below in the **Ancestry Prediction** section. + The MixDeR paper can be cited with the following: ``` Mitchell, R., Peck, M., Gorden, E., & Just, R. (2025). 
MixDeR: A SNP mixture @@ -40,5 +44,4 @@ library(mixder) mixder() ``` - - +Please see [MixDeR documentation](bioforensics.github.io/mixder/) for further information. diff --git a/README.md b/README.md index 4c54f17..dd3e3f1 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ -# MixDeR - Current Version: 0.7.5 +# MixDeR - Current Version: 1.0 @@ -25,6 +25,10 @@ with a large mixture ratio (i.e. \> 1:100 ratio between contributors). In these scenarios, the user is warned to be cautious with the minor contributor inferred genotyping results.* +MixDeR version 1.0 and later provides the option to perform ancestry +prediction using principal component analysis (PCA). Additional +information can be found below in the **Ancestry Prediction** section. + The MixDeR paper can be cited with the following: Mitchell, R., Peck, M., Gorden, E., & Just, R. (2025). MixDeR: A SNP mixture @@ -37,3 +41,6 @@ To launch the shiny app: library(mixder) mixder() + +Please see [MixDeR documentation](bioforensics.github.io/mixder/) for +further information. diff --git a/_pkgdown.yml b/_pkgdown.yml index 08041c7..fc6f340 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -10,5 +10,6 @@ articles: - Installation - Required_Files - Running_MixDecon + - Ancestry_Prediction_Tool - Valmetrics - gedmatch diff --git a/data/ancestry_1000G_all.rda b/data/ancestry_1000G_all.rda new file mode 100644 index 0000000..90c05af Binary files /dev/null and b/data/ancestry_1000G_all.rda differ diff --git a/data/ancestry_1000G_allsamples.rda b/data/ancestry_1000G_allsamples.rda new file mode 100644 index 0000000..9542adc Binary files /dev/null and b/data/ancestry_1000G_allsamples.rda differ diff --git a/data/ancestry_colors.rda b/data/ancestry_colors.rda new file mode 100644 index 0000000..2e950ca Binary files /dev/null and b/data/ancestry_colors.rda differ diff --git a/data/ancestrysnps_1000G_allsamples.rda b/data/ancestrysnps_1000G_allsamples.rda new file mode 100644 index 0000000..b73018a Binary files /dev/null and b/data/ancestrysnps_1000G_allsamples.rda differ diff --git a/data/popFreq_AFR.rda b/data/popFreq_AFR.rda new file mode 100644 index 0000000..9c9afe3 Binary files /dev/null and b/data/popFreq_AFR.rda differ diff --git a/data/popFreq_AMR.rda b/data/popFreq_AMR.rda new file mode 100644 index 0000000..f242920 Binary files /dev/null and b/data/popFreq_AMR.rda differ diff --git a/data/popFreq_EAS.rda b/data/popFreq_EAS.rda new file mode 100644 index 0000000..e9dfdd4 Binary files /dev/null and b/data/popFreq_EAS.rda differ diff --git a/data/popFreq_EUR.rda b/data/popFreq_EUR.rda new file mode 100644 index 0000000..f232665 Binary files /dev/null and b/data/popFreq_EUR.rda differ diff --git a/data/popFreq_SAS.rda b/data/popFreq_SAS.rda new file mode 100644 index 0000000..7cc4db6 Binary files /dev/null and b/data/popFreq_SAS.rda differ diff --git a/docs/404.html b/docs/404.html index 98ef47a..400bf96 100644 --- a/docs/404.html +++ b/docs/404.html @@ -20,7 +20,7 @@ mixder - 0.7.5 + 1.0 '),expand.on("click",(function(n){return e.toggleCurrent(t),n.stopPropagation(),!1})),t.prepend(expand)}))},reset:function(){var n=encodeURI(window.location.hash)||"#";try{var e=$(".wy-menu-vertical"),t=e.find('[href="'+n+'"]');if(0===t.length){var i=$('.document [id="'+n.substring(1)+'"]').closest("div.section");0===(t=e.find('[href="#'+i.attr("id")+'"]')).length&&(t=e.find('[href="#"]'))}if(t.length>0){$(".wy-menu-vertical 
.current").removeClass("current").attr("aria-expanded","false"),t.addClass("current").attr("aria-expanded","true"),t.closest("li.toctree-l1").parent().addClass("current").attr("aria-expanded","true");for(let n=1;n<=10;n++)t.closest("li.toctree-l"+n).addClass("current").attr("aria-expanded","true");t[0].scrollIntoView()}}catch(n){console.log("Error expanding nav for anchor",n)}},onScroll:function(){this.winScroll=!1;var n=this.win.scrollTop(),e=n+this.winHeight,t=this.navBar.scrollTop()+(n-this.winPosition);n<0||e>this.docHeight||(this.navBar.scrollTop(t),this.winPosition=n)},onResize:function(){this.winResize=!1,this.winHeight=this.win.height(),this.docHeight=$(document).height()},hashChange:function(){this.linkScroll=!0,this.win.one("hashchange",(function(){this.linkScroll=!1}))},toggleCurrent:function(n){var e=n.closest("li");e.siblings("li.current").removeClass("current").attr("aria-expanded","false"),e.siblings().find("li.current").removeClass("current").attr("aria-expanded","false");var t=e.find("> ul li");t.length&&(t.removeClass("current").attr("aria-expanded","false"),e.toggleClass("current").attr("aria-expanded",(function(n,e){return"true"==e?"false":"true"})))}},"undefined"!=typeof window&&(window.SphinxRtdTheme={Navigation:n.exports.ThemeNav,StickyNav:n.exports.ThemeNav}),function(){for(var n=0,e=["ms","moz","webkit","o"],t=0;t a.language.name.localeCompare(b.language.name)); + + const languagesHTML = ` +
+
Languages
+ ${languages + .map( + (translation) => ` +
+ ${translation.language.code} +
+ `, + ) + .join("\n")} +
+ `; + return languagesHTML; + } + + function renderVersions(config) { + if (!config.versions.active.length) { + return ""; + } + const versionsHTML = ` +
+
Versions
+ ${config.versions.active + .map( + (version) => ` +
+ ${version.slug} +
+ `, + ) + .join("\n")} +
+ `; + return versionsHTML; + } + + function renderDownloads(config) { + if (!Object.keys(config.versions.current.downloads).length) { + return ""; + } + const downloadsNameDisplay = { + pdf: "PDF", + epub: "Epub", + htmlzip: "HTML", + }; + + const downloadsHTML = ` +
+
Downloads
+ ${Object.entries(config.versions.current.downloads) + .map( + ([name, url]) => ` +
+ ${downloadsNameDisplay[name]} +
+ `, + ) + .join("\n")} +
+ `; + return downloadsHTML; + } + + document.addEventListener("readthedocs-addons-data-ready", function (event) { + const config = event.detail.data(); + + const flyout = ` +
+ + Read the Docs + v: ${config.versions.current.slug} + + +
+
+ ${renderLanguages(config)} + ${renderVersions(config)} + ${renderDownloads(config)} +
+
On Read the Docs
+
+ Project Home +
+
+ Builds +
+
+ Downloads +
+
+
+
Search
+
+
+ +
+
+
+
+ + Hosted by Read the Docs + +
+
+ `; + + // Inject the generated flyout into the body HTML element. + document.body.insertAdjacentHTML("beforeend", flyout); + + // Trigger the Read the Docs Addons Search modal when clicking on the "Search docs" input from inside the flyout. + document + .querySelector("#flyout-search-form") + .addEventListener("focusin", () => { + const event = new CustomEvent("readthedocs-search-show"); + document.dispatchEvent(event); + }); + }) +} + +if (themeLanguageSelector || themeVersionSelector) { + function onSelectorSwitch(event) { + const option = event.target.selectedIndex; + const item = event.target.options[option]; + window.location.href = item.dataset.url; + } + + document.addEventListener("readthedocs-addons-data-ready", function (event) { + const config = event.detail.data(); + + const versionSwitch = document.querySelector( + "div.switch-menus > div.version-switch", + ); + if (themeVersionSelector) { + let versions = config.versions.active; + if (config.versions.current.hidden || config.versions.current.type === "external") { + versions.unshift(config.versions.current); + } + const versionSelect = ` + + `; + + versionSwitch.innerHTML = versionSelect; + versionSwitch.firstElementChild.addEventListener("change", onSelectorSwitch); + } + + const languageSwitch = document.querySelector( + "div.switch-menus > div.language-switch", + ); + + if (themeLanguageSelector) { + if (config.projects.translations.length) { + // Add the current language to the options on the selector + let languages = config.projects.translations.concat( + config.projects.current, + ); + languages = languages.sort((a, b) => + a.language.name.localeCompare(b.language.name), + ); + + const languageSelect = ` + + `; + + languageSwitch.innerHTML = languageSelect; + languageSwitch.firstElementChild.addEventListener("change", onSelectorSwitch); + } + else { + languageSwitch.remove(); + } + } + }); +} + +document.addEventListener("readthedocs-addons-data-ready", function (event) { + // Trigger the Read the Docs Addons Search modal when clicking on "Search docs" input from the topnav. + document + .querySelector("[role='search'] input") + .addEventListener("focusin", () => { + const event = new CustomEvent("readthedocs-search-show"); + document.dispatchEvent(event); + }); +}); \ No newline at end of file diff --git a/docs/_build/html/_static/language_data.js b/docs/_build/html/_static/language_data.js new file mode 100644 index 0000000..c7fe6c6 --- /dev/null +++ b/docs/_build/html/_static/language_data.js @@ -0,0 +1,192 @@ +/* + * This script contains the language-specific data used by searchtools.js, + * namely the list of stopwords, stemmer, scorer and splitter. 
+ */ + +var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; + + +/* Non-minified version is copied as a separate JS file, if available */ + +/** + * Porter Stemmer + */ +var Stemmer = function() { + + var step2list = { + ational: 'ate', + tional: 'tion', + enci: 'ence', + anci: 'ance', + izer: 'ize', + bli: 'ble', + alli: 'al', + entli: 'ent', + eli: 'e', + ousli: 'ous', + ization: 'ize', + ation: 'ate', + ator: 'ate', + alism: 'al', + iveness: 'ive', + fulness: 'ful', + ousness: 'ous', + aliti: 'al', + iviti: 'ive', + biliti: 'ble', + logi: 'log' + }; + + var step3list = { + icate: 'ic', + ative: '', + alize: 'al', + iciti: 'ic', + ical: 'ic', + ful: '', + ness: '' + }; + + var c = "[^aeiou]"; // consonant + var v = "[aeiouy]"; // vowel + var C = c + "[^aeiouy]*"; // consonant sequence + var V = v + "[aeiou]*"; // vowel sequence + + var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0 + var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1 + var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1 + var s_v = "^(" + C + ")?" + v; // vowel in stem + + this.stemWord = function (w) { + var stem; + var suffix; + var firstch; + var origword = w; + + if (w.length < 3) + return w; + + var re; + var re2; + var re3; + var re4; + + firstch = w.substr(0,1); + if (firstch == "y") + w = firstch.toUpperCase() + w.substr(1); + + // Step 1a + re = /^(.+?)(ss|i)es$/; + re2 = /^(.+?)([^s])s$/; + + if (re.test(w)) + w = w.replace(re,"$1$2"); + else if (re2.test(w)) + w = w.replace(re2,"$1$2"); + + // Step 1b + re = /^(.+?)eed$/; + re2 = /^(.+?)(ed|ing)$/; + if (re.test(w)) { + var fp = re.exec(w); + re = new RegExp(mgr0); + if (re.test(fp[1])) { + re = /.$/; + w = w.replace(re,""); + } + } + else if (re2.test(w)) { + var fp = re2.exec(w); + stem = fp[1]; + re2 = new RegExp(s_v); + if (re2.test(stem)) { + w = stem; + re2 = /(at|bl|iz)$/; + re3 = new RegExp("([^aeiouylsz])\\1$"); + re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (re2.test(w)) + w = w + "e"; + else if (re3.test(w)) { + re = /.$/; + w = w.replace(re,""); + } + else if (re4.test(w)) + w = w + "e"; + } + } + + // Step 1c + re = /^(.+?)y$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(s_v); + if (re.test(stem)) + w = stem + "i"; + } + + // Step 2 + re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + suffix = fp[2]; + re = new RegExp(mgr0); + if (re.test(stem)) + w = stem + step2list[suffix]; + } + + // Step 3 + re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + suffix = fp[2]; + re = new RegExp(mgr0); + if (re.test(stem)) + w = stem + step3list[suffix]; + } + + // Step 4 + re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; + re2 = /^(.+?)(s|t)(ion)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(mgr1); + if (re.test(stem)) + w = stem; + } + else if (re2.test(w)) { + var fp = re2.exec(w); + stem = fp[1] + fp[2]; + re2 = new RegExp(mgr1); + if (re2.test(stem)) + w = stem; + } + + // Step 5 + re = /^(.+?)e$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(mgr1); + re2 = new 
RegExp(meq1); + re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) + w = stem; + } + re = /ll$/; + re2 = new RegExp(mgr1); + if (re.test(w) && re2.test(w)) { + re = /.$/; + w = w.replace(re,""); + } + + // and turn initial Y back to y + if (firstch == "y") + w = firstch.toLowerCase() + w.substr(1); + return w; + } +} + diff --git a/docs/_build/html/_static/minus.png b/docs/_build/html/_static/minus.png new file mode 100644 index 0000000..d96755f Binary files /dev/null and b/docs/_build/html/_static/minus.png differ diff --git a/docs/_build/html/_static/plus.png b/docs/_build/html/_static/plus.png new file mode 100644 index 0000000..7107cec Binary files /dev/null and b/docs/_build/html/_static/plus.png differ diff --git a/docs/_build/html/_static/pygments.css b/docs/_build/html/_static/pygments.css new file mode 100644 index 0000000..6f8b210 --- /dev/null +++ b/docs/_build/html/_static/pygments.css @@ -0,0 +1,75 @@ +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +.highlight .hll { background-color: #ffffcc } +.highlight { background: #f8f8f8; } +.highlight .c { color: #3D7B7B; font-style: italic } /* Comment */ +.highlight .err { border: 1px solid #F00 } /* Error */ +.highlight .k { color: #008000; font-weight: bold } /* Keyword */ +.highlight .o { color: #666 } /* Operator */ +.highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */ +.highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */ +.highlight .cp { color: #9C6500 } /* Comment.Preproc */ +.highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */ +.highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */ +.highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */ +.highlight .gd { color: #A00000 } /* Generic.Deleted */ +.highlight .ge { font-style: italic } /* Generic.Emph */ +.highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +.highlight .gr { color: #E40000 } /* Generic.Error */ +.highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +.highlight .gi { color: #008400 } /* Generic.Inserted */ +.highlight .go { color: #717171 } /* Generic.Output */ +.highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ +.highlight .gs { font-weight: bold } /* Generic.Strong */ +.highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +.highlight .gt { color: #04D } /* Generic.Traceback */ +.highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ +.highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ +.highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ +.highlight .kp { color: #008000 } /* Keyword.Pseudo */ +.highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ +.highlight .kt { color: #B00040 } /* Keyword.Type */ +.highlight .m { color: #666 } /* Literal.Number */ +.highlight .s { color: #BA2121 } /* Literal.String */ +.highlight .na { color: #687822 } /* Name.Attribute */ +.highlight .nb { color: #008000 } /* 
Name.Builtin */ +.highlight .nc { color: #00F; font-weight: bold } /* Name.Class */ +.highlight .no { color: #800 } /* Name.Constant */ +.highlight .nd { color: #A2F } /* Name.Decorator */ +.highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */ +.highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */ +.highlight .nf { color: #00F } /* Name.Function */ +.highlight .nl { color: #767600 } /* Name.Label */ +.highlight .nn { color: #00F; font-weight: bold } /* Name.Namespace */ +.highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */ +.highlight .nv { color: #19177C } /* Name.Variable */ +.highlight .ow { color: #A2F; font-weight: bold } /* Operator.Word */ +.highlight .w { color: #BBB } /* Text.Whitespace */ +.highlight .mb { color: #666 } /* Literal.Number.Bin */ +.highlight .mf { color: #666 } /* Literal.Number.Float */ +.highlight .mh { color: #666 } /* Literal.Number.Hex */ +.highlight .mi { color: #666 } /* Literal.Number.Integer */ +.highlight .mo { color: #666 } /* Literal.Number.Oct */ +.highlight .sa { color: #BA2121 } /* Literal.String.Affix */ +.highlight .sb { color: #BA2121 } /* Literal.String.Backtick */ +.highlight .sc { color: #BA2121 } /* Literal.String.Char */ +.highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */ +.highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */ +.highlight .s2 { color: #BA2121 } /* Literal.String.Double */ +.highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */ +.highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */ +.highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */ +.highlight .sx { color: #008000 } /* Literal.String.Other */ +.highlight .sr { color: #A45A77 } /* Literal.String.Regex */ +.highlight .s1 { color: #BA2121 } /* Literal.String.Single */ +.highlight .ss { color: #19177C } /* Literal.String.Symbol */ +.highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */ +.highlight .fm { color: #00F } /* Name.Function.Magic */ +.highlight .vc { color: #19177C } /* Name.Variable.Class */ +.highlight .vg { color: #19177C } /* Name.Variable.Global */ +.highlight .vi { color: #19177C } /* Name.Variable.Instance */ +.highlight .vm { color: #19177C } /* Name.Variable.Magic */ +.highlight .il { color: #666 } /* Literal.Number.Integer.Long */ \ No newline at end of file diff --git a/docs/_build/html/_static/searchtools.js b/docs/_build/html/_static/searchtools.js new file mode 100644 index 0000000..2c774d1 --- /dev/null +++ b/docs/_build/html/_static/searchtools.js @@ -0,0 +1,632 @@ +/* + * Sphinx JavaScript utilities for the full-text search. + */ +"use strict"; + +/** + * Simple result scoring code. + */ +if (typeof Scorer === "undefined") { + var Scorer = { + // Implement the following function to further tweak the score for each result + // The function takes a result array [docname, title, anchor, descr, score, filename] + // and returns the new score. + /* + score: result => { + const [docname, title, anchor, descr, score, filename, kind] = result + return score + }, + */ + + // query matches the full name of an object + objNameMatch: 11, + // or matches in the last dotted part of the object name + objPartialMatch: 6, + // Additive scores depending on the priority of the object + objPrio: { + 0: 15, // used to be importantResults + 1: 5, // used to be objectResults + 2: -5, // used to be unimportantResults + }, + // Used when the priority is not in the mapping. 
+ objPrioDefault: 0, + + // query found in title + title: 15, + partialTitle: 7, + // query found in terms + term: 5, + partialTerm: 2, + }; +} + +// Global search result kind enum, used by themes to style search results. +class SearchResultKind { + static get index() { return "index"; } + static get object() { return "object"; } + static get text() { return "text"; } + static get title() { return "title"; } +} + +const _removeChildren = (element) => { + while (element && element.lastChild) element.removeChild(element.lastChild); +}; + +/** + * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#escaping + */ +const _escapeRegExp = (string) => + string.replace(/[.*+\-?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string + +const _displayItem = (item, searchTerms, highlightTerms) => { + const docBuilder = DOCUMENTATION_OPTIONS.BUILDER; + const docFileSuffix = DOCUMENTATION_OPTIONS.FILE_SUFFIX; + const docLinkSuffix = DOCUMENTATION_OPTIONS.LINK_SUFFIX; + const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; + const contentRoot = document.documentElement.dataset.content_root; + + const [docName, title, anchor, descr, score, _filename, kind] = item; + + let listItem = document.createElement("li"); + // Add a class representing the item's type: + // can be used by a theme's CSS selector for styling + // See SearchResultKind for the class names. + listItem.classList.add(`kind-${kind}`); + let requestUrl; + let linkUrl; + if (docBuilder === "dirhtml") { + // dirhtml builder + let dirname = docName + "/"; + if (dirname.match(/\/index\/$/)) + dirname = dirname.substring(0, dirname.length - 6); + else if (dirname === "index/") dirname = ""; + requestUrl = contentRoot + dirname; + linkUrl = requestUrl; + } else { + // normal html builders + requestUrl = contentRoot + docName + docFileSuffix; + linkUrl = docName + docLinkSuffix; + } + let linkEl = listItem.appendChild(document.createElement("a")); + linkEl.href = linkUrl + anchor; + linkEl.dataset.score = score; + linkEl.innerHTML = title; + if (descr) { + listItem.appendChild(document.createElement("span")).innerHTML = + " (" + descr + ")"; + // highlight search terms in the description + if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js + highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted")); + } + else if (showSearchSummary) + fetch(requestUrl) + .then((responseData) => responseData.text()) + .then((data) => { + if (data) + listItem.appendChild( + Search.makeSearchSummary(data, searchTerms, anchor) + ); + // highlight search terms in the summary + if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js + highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted")); + }); + Search.output.appendChild(listItem); +}; +const _finishSearch = (resultCount) => { + Search.stopPulse(); + Search.title.innerText = _("Search Results"); + if (!resultCount) + Search.status.innerText = Documentation.gettext( + "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." 
+ ); + else + Search.status.innerText = Documentation.ngettext( + "Search finished, found one page matching the search query.", + "Search finished, found ${resultCount} pages matching the search query.", + resultCount, + ).replace('${resultCount}', resultCount); +}; +const _displayNextItem = ( + results, + resultCount, + searchTerms, + highlightTerms, +) => { + // results left, load the summary and display it + // this is intended to be dynamic (don't sub resultsCount) + if (results.length) { + _displayItem(results.pop(), searchTerms, highlightTerms); + setTimeout( + () => _displayNextItem(results, resultCount, searchTerms, highlightTerms), + 5 + ); + } + // search finished, update title and status message + else _finishSearch(resultCount); +}; +// Helper function used by query() to order search results. +// Each input is an array of [docname, title, anchor, descr, score, filename, kind]. +// Order the results by score (in opposite order of appearance, since the +// `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. +const _orderResultsByScoreThenName = (a, b) => { + const leftScore = a[4]; + const rightScore = b[4]; + if (leftScore === rightScore) { + // same score: sort alphabetically + const leftTitle = a[1].toLowerCase(); + const rightTitle = b[1].toLowerCase(); + if (leftTitle === rightTitle) return 0; + return leftTitle > rightTitle ? -1 : 1; // inverted is intentional + } + return leftScore > rightScore ? 1 : -1; +}; + +/** + * Default splitQuery function. Can be overridden in ``sphinx.search`` with a + * custom function per language. + * + * The regular expression works by splitting the string on consecutive characters + * that are not Unicode letters, numbers, underscores, or emoji characters. + * This is the same as ``\W+`` in Python, preserving the surrogate pair area. + */ +if (typeof splitQuery === "undefined") { + var splitQuery = (query) => query + .split(/[^\p{Letter}\p{Number}_\p{Emoji_Presentation}]+/gu) + .filter(term => term) // remove remaining empty strings +} + +/** + * Search Module + */ +const Search = { + _index: null, + _queued_query: null, + _pulse_status: -1, + + htmlToText: (htmlString, anchor) => { + const htmlElement = new DOMParser().parseFromString(htmlString, 'text/html'); + for (const removalQuery of [".headerlink", "script", "style"]) { + htmlElement.querySelectorAll(removalQuery).forEach((el) => { el.remove() }); + } + if (anchor) { + const anchorContent = htmlElement.querySelector(`[role="main"] ${anchor}`); + if (anchorContent) return anchorContent.textContent; + + console.warn( + `Anchored content block not found. Sphinx search tries to obtain it via DOM query '[role=main] ${anchor}'. Check your theme or template.` + ); + } + + // if anchor not specified or not found, fall back to main content + const docContent = htmlElement.querySelector('[role="main"]'); + if (docContent) return docContent.textContent; + + console.warn( + "Content block not found. Sphinx search tries to obtain it via DOM query '[role=main]'. Check your theme or template." 
+ ); + return ""; + }, + + init: () => { + const query = new URLSearchParams(window.location.search).get("q"); + document + .querySelectorAll('input[name="q"]') + .forEach((el) => (el.value = query)); + if (query) Search.performSearch(query); + }, + + loadIndex: (url) => + (document.body.appendChild(document.createElement("script")).src = url), + + setIndex: (index) => { + Search._index = index; + if (Search._queued_query !== null) { + const query = Search._queued_query; + Search._queued_query = null; + Search.query(query); + } + }, + + hasIndex: () => Search._index !== null, + + deferQuery: (query) => (Search._queued_query = query), + + stopPulse: () => (Search._pulse_status = -1), + + startPulse: () => { + if (Search._pulse_status >= 0) return; + + const pulse = () => { + Search._pulse_status = (Search._pulse_status + 1) % 4; + Search.dots.innerText = ".".repeat(Search._pulse_status); + if (Search._pulse_status >= 0) window.setTimeout(pulse, 500); + }; + pulse(); + }, + + /** + * perform a search for something (or wait until index is loaded) + */ + performSearch: (query) => { + // create the required interface elements + const searchText = document.createElement("h2"); + searchText.textContent = _("Searching"); + const searchSummary = document.createElement("p"); + searchSummary.classList.add("search-summary"); + searchSummary.innerText = ""; + const searchList = document.createElement("ul"); + searchList.setAttribute("role", "list"); + searchList.classList.add("search"); + + const out = document.getElementById("search-results"); + Search.title = out.appendChild(searchText); + Search.dots = Search.title.appendChild(document.createElement("span")); + Search.status = out.appendChild(searchSummary); + Search.output = out.appendChild(searchList); + + const searchProgress = document.getElementById("search-progress"); + // Some themes don't use the search progress node + if (searchProgress) { + searchProgress.innerText = _("Preparing search..."); + } + Search.startPulse(); + + // index already loaded, the browser was quick! 
+ if (Search.hasIndex()) Search.query(query); + else Search.deferQuery(query); + }, + + _parseQuery: (query) => { + // stem the search terms and add them to the correct list + const stemmer = new Stemmer(); + const searchTerms = new Set(); + const excludedTerms = new Set(); + const highlightTerms = new Set(); + const objectTerms = new Set(splitQuery(query.toLowerCase().trim())); + splitQuery(query.trim()).forEach((queryTerm) => { + const queryTermLower = queryTerm.toLowerCase(); + + // maybe skip this "word" + // stopwords array is from language_data.js + if ( + stopwords.indexOf(queryTermLower) !== -1 || + queryTerm.match(/^\d+$/) + ) + return; + + // stem the word + let word = stemmer.stemWord(queryTermLower); + // select the correct list + if (word[0] === "-") excludedTerms.add(word.substr(1)); + else { + searchTerms.add(word); + highlightTerms.add(queryTermLower); + } + }); + + if (SPHINX_HIGHLIGHT_ENABLED) { // set in sphinx_highlight.js + localStorage.setItem("sphinx_highlight_terms", [...highlightTerms].join(" ")) + } + + // console.debug("SEARCH: searching for:"); + // console.info("required: ", [...searchTerms]); + // console.info("excluded: ", [...excludedTerms]); + + return [query, searchTerms, excludedTerms, highlightTerms, objectTerms]; + }, + + /** + * execute search (requires search index to be loaded) + */ + _performSearch: (query, searchTerms, excludedTerms, highlightTerms, objectTerms) => { + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const titles = Search._index.titles; + const allTitles = Search._index.alltitles; + const indexEntries = Search._index.indexentries; + + // Collect multiple result groups to be sorted separately and then ordered. + // Each is an array of [docname, title, anchor, descr, score, filename, kind]. + const normalResults = []; + const nonMainIndexResults = []; + + _removeChildren(document.getElementById("search-progress")); + + const queryLower = query.toLowerCase().trim(); + for (const [title, foundTitles] of Object.entries(allTitles)) { + if (title.toLowerCase().trim().includes(queryLower) && (queryLower.length >= title.length/2)) { + for (const [file, id] of foundTitles) { + const score = Math.round(Scorer.title * queryLower.length / title.length); + const boost = titles[file] === title ? 1 : 0; // add a boost for document titles + normalResults.push([ + docNames[file], + titles[file] !== title ? `${titles[file]} > ${title}` : title, + id !== null ? "#" + id : "", + null, + score + boost, + filenames[file], + SearchResultKind.title, + ]); + } + } + } + + // search for explicit entries in index directives + for (const [entry, foundEntries] of Object.entries(indexEntries)) { + if (entry.includes(queryLower) && (queryLower.length >= entry.length/2)) { + for (const [file, id, isMain] of foundEntries) { + const score = Math.round(100 * queryLower.length / entry.length); + const result = [ + docNames[file], + titles[file], + id ? 
"#" + id : "", + null, + score, + filenames[file], + SearchResultKind.index, + ]; + if (isMain) { + normalResults.push(result); + } else { + nonMainIndexResults.push(result); + } + } + } + } + + // lookup as object + objectTerms.forEach((term) => + normalResults.push(...Search.performObjectSearch(term, objectTerms)) + ); + + // lookup as search terms in fulltext + normalResults.push(...Search.performTermsSearch(searchTerms, excludedTerms)); + + // let the scorer override scores with a custom scoring function + if (Scorer.score) { + normalResults.forEach((item) => (item[4] = Scorer.score(item))); + nonMainIndexResults.forEach((item) => (item[4] = Scorer.score(item))); + } + + // Sort each group of results by score and then alphabetically by name. + normalResults.sort(_orderResultsByScoreThenName); + nonMainIndexResults.sort(_orderResultsByScoreThenName); + + // Combine the result groups in (reverse) order. + // Non-main index entries are typically arbitrary cross-references, + // so display them after other results. + let results = [...nonMainIndexResults, ...normalResults]; + + // remove duplicate search results + // note the reversing of results, so that in the case of duplicates, the highest-scoring entry is kept + let seen = new Set(); + results = results.reverse().reduce((acc, result) => { + let resultStr = result.slice(0, 4).concat([result[5]]).map(v => String(v)).join(','); + if (!seen.has(resultStr)) { + acc.push(result); + seen.add(resultStr); + } + return acc; + }, []); + + return results.reverse(); + }, + + query: (query) => { + const [searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms] = Search._parseQuery(query); + const results = Search._performSearch(searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms); + + // for debugging + //Search.lastresults = results.slice(); // a copy + // console.info("search results:", Search.lastresults); + + // print the results + _displayNextItem(results, results.length, searchTerms, highlightTerms); + }, + + /** + * search for object names + */ + performObjectSearch: (object, objectTerms) => { + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const objects = Search._index.objects; + const objNames = Search._index.objnames; + const titles = Search._index.titles; + + const results = []; + + const objectSearchCallback = (prefix, match) => { + const name = match[4] + const fullname = (prefix ? prefix + "." : "") + name; + const fullnameLower = fullname.toLowerCase(); + if (fullnameLower.indexOf(object) < 0) return; + + let score = 0; + const parts = fullnameLower.split("."); + + // check for different match types: exact matches of full name or + // "last name" (i.e. 
last dotted part) + if (fullnameLower === object || parts.slice(-1)[0] === object) + score += Scorer.objNameMatch; + else if (parts.slice(-1)[0].indexOf(object) > -1) + score += Scorer.objPartialMatch; // matches in last name + + const objName = objNames[match[1]][2]; + const title = titles[match[0]]; + + // If more than one term searched for, we require other words to be + // found in the name/title/description + const otherTerms = new Set(objectTerms); + otherTerms.delete(object); + if (otherTerms.size > 0) { + const haystack = `${prefix} ${name} ${objName} ${title}`.toLowerCase(); + if ( + [...otherTerms].some((otherTerm) => haystack.indexOf(otherTerm) < 0) + ) + return; + } + + let anchor = match[3]; + if (anchor === "") anchor = fullname; + else if (anchor === "-") anchor = objNames[match[1]][1] + "-" + fullname; + + const descr = objName + _(", in ") + title; + + // add custom score for some objects according to scorer + if (Scorer.objPrio.hasOwnProperty(match[2])) + score += Scorer.objPrio[match[2]]; + else score += Scorer.objPrioDefault; + + results.push([ + docNames[match[0]], + fullname, + "#" + anchor, + descr, + score, + filenames[match[0]], + SearchResultKind.object, + ]); + }; + Object.keys(objects).forEach((prefix) => + objects[prefix].forEach((array) => + objectSearchCallback(prefix, array) + ) + ); + return results; + }, + + /** + * search for full-text terms in the index + */ + performTermsSearch: (searchTerms, excludedTerms) => { + // prepare search + const terms = Search._index.terms; + const titleTerms = Search._index.titleterms; + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const titles = Search._index.titles; + + const scoreMap = new Map(); + const fileMap = new Map(); + + // perform the search on the required terms + searchTerms.forEach((word) => { + const files = []; + const arr = [ + { files: terms[word], score: Scorer.term }, + { files: titleTerms[word], score: Scorer.title }, + ]; + // add support for partial matches + if (word.length > 2) { + const escapedWord = _escapeRegExp(word); + if (!terms.hasOwnProperty(word)) { + Object.keys(terms).forEach((term) => { + if (term.match(escapedWord)) + arr.push({ files: terms[term], score: Scorer.partialTerm }); + }); + } + if (!titleTerms.hasOwnProperty(word)) { + Object.keys(titleTerms).forEach((term) => { + if (term.match(escapedWord)) + arr.push({ files: titleTerms[term], score: Scorer.partialTitle }); + }); + } + } + + // no match but word was a required one + if (arr.every((record) => record.files === undefined)) return; + + // found search word in contents + arr.forEach((record) => { + if (record.files === undefined) return; + + let recordFiles = record.files; + if (recordFiles.length === undefined) recordFiles = [recordFiles]; + files.push(...recordFiles); + + // set score for the word in each file + recordFiles.forEach((file) => { + if (!scoreMap.has(file)) scoreMap.set(file, {}); + scoreMap.get(file)[word] = record.score; + }); + }); + + // create the mapping + files.forEach((file) => { + if (!fileMap.has(file)) fileMap.set(file, [word]); + else if (fileMap.get(file).indexOf(word) === -1) fileMap.get(file).push(word); + }); + }); + + // now check if the files don't contain excluded terms + const results = []; + for (const [file, wordList] of fileMap) { + // check if all requirements are matched + + // as search terms with length < 3 are discarded + const filteredTermCount = [...searchTerms].filter( + (term) => term.length > 2 + ).length; + if ( + wordList.length !== 
searchTerms.size && + wordList.length !== filteredTermCount + ) + continue; + + // ensure that none of the excluded terms is in the search result + if ( + [...excludedTerms].some( + (term) => + terms[term] === file || + titleTerms[term] === file || + (terms[term] || []).includes(file) || + (titleTerms[term] || []).includes(file) + ) + ) + break; + + // select one (max) score for the file. + const score = Math.max(...wordList.map((w) => scoreMap.get(file)[w])); + // add result to the result list + results.push([ + docNames[file], + titles[file], + "", + null, + score, + filenames[file], + SearchResultKind.text, + ]); + } + return results; + }, + + /** + * helper function to return a node containing the + * search summary for a given text. keywords is a list + * of stemmed words. + */ + makeSearchSummary: (htmlText, keywords, anchor) => { + const text = Search.htmlToText(htmlText, anchor); + if (text === "") return null; + + const textLower = text.toLowerCase(); + const actualStartPosition = [...keywords] + .map((k) => textLower.indexOf(k.toLowerCase())) + .filter((i) => i > -1) + .slice(-1)[0]; + const startWithContext = Math.max(actualStartPosition - 120, 0); + + const top = startWithContext === 0 ? "" : "..."; + const tail = startWithContext + 240 < text.length ? "..." : ""; + + let summary = document.createElement("p"); + summary.classList.add("context"); + summary.textContent = top + text.substr(startWithContext, 240).trim() + tail; + + return summary; + }, +}; + +_ready(Search.init); diff --git a/docs/_build/html/_static/sphinx_highlight.js b/docs/_build/html/_static/sphinx_highlight.js new file mode 100644 index 0000000..8a96c69 --- /dev/null +++ b/docs/_build/html/_static/sphinx_highlight.js @@ -0,0 +1,154 @@ +/* Highlighting utilities for Sphinx HTML documentation. */ +"use strict"; + +const SPHINX_HIGHLIGHT_ENABLED = true + +/** + * highlight a given string on a node by wrapping it in + * span elements with the given class name. + */ +const _highlight = (node, addItems, text, className) => { + if (node.nodeType === Node.TEXT_NODE) { + const val = node.nodeValue; + const parent = node.parentNode; + const pos = val.toLowerCase().indexOf(text); + if ( + pos >= 0 && + !parent.classList.contains(className) && + !parent.classList.contains("nohighlight") + ) { + let span; + + const closestNode = parent.closest("body, svg, foreignObject"); + const isInSVG = closestNode && closestNode.matches("svg"); + if (isInSVG) { + span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); + } else { + span = document.createElement("span"); + span.classList.add(className); + } + + span.appendChild(document.createTextNode(val.substr(pos, text.length))); + const rest = document.createTextNode(val.substr(pos + text.length)); + parent.insertBefore( + span, + parent.insertBefore( + rest, + node.nextSibling + ) + ); + node.nodeValue = val.substr(0, pos); + /* There may be more occurrences of search term in this node. So call this + * function recursively on the remaining fragment. 
+ */ + _highlight(rest, addItems, text, className); + + if (isInSVG) { + const rect = document.createElementNS( + "http://www.w3.org/2000/svg", + "rect" + ); + const bbox = parent.getBBox(); + rect.x.baseVal.value = bbox.x; + rect.y.baseVal.value = bbox.y; + rect.width.baseVal.value = bbox.width; + rect.height.baseVal.value = bbox.height; + rect.setAttribute("class", className); + addItems.push({ parent: parent, target: rect }); + } + } + } else if (node.matches && !node.matches("button, select, textarea")) { + node.childNodes.forEach((el) => _highlight(el, addItems, text, className)); + } +}; +const _highlightText = (thisNode, text, className) => { + let addItems = []; + _highlight(thisNode, addItems, text, className); + addItems.forEach((obj) => + obj.parent.insertAdjacentElement("beforebegin", obj.target) + ); +}; + +/** + * Small JavaScript module for the documentation. + */ +const SphinxHighlight = { + + /** + * highlight the search words provided in localstorage in the text + */ + highlightSearchWords: () => { + if (!SPHINX_HIGHLIGHT_ENABLED) return; // bail if no highlight + + // get and clear terms from localstorage + const url = new URL(window.location); + const highlight = + localStorage.getItem("sphinx_highlight_terms") + || url.searchParams.get("highlight") + || ""; + localStorage.removeItem("sphinx_highlight_terms") + url.searchParams.delete("highlight"); + window.history.replaceState({}, "", url); + + // get individual terms from highlight string + const terms = highlight.toLowerCase().split(/\s+/).filter(x => x); + if (terms.length === 0) return; // nothing to do + + // There should never be more than one element matching "div.body" + const divBody = document.querySelectorAll("div.body"); + const body = divBody.length ? divBody[0] : document.querySelector("body"); + window.setTimeout(() => { + terms.forEach((term) => _highlightText(body, term, "highlighted")); + }, 10); + + const searchBox = document.getElementById("searchbox"); + if (searchBox === null) return; + searchBox.appendChild( + document + .createRange() + .createContextualFragment( + '" + ) + ); + }, + + /** + * helper function to hide the search marks again + */ + hideSearchWords: () => { + document + .querySelectorAll("#searchbox .highlight-link") + .forEach((el) => el.remove()); + document + .querySelectorAll("span.highlighted") + .forEach((el) => el.classList.remove("highlighted")); + localStorage.removeItem("sphinx_highlight_terms") + }, + + initEscapeListener: () => { + // only install a listener if it is really needed + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) return; + + document.addEventListener("keydown", (event) => { + // bail for input elements + if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; + // bail with special keys + if (event.shiftKey || event.altKey || event.ctrlKey || event.metaKey) return; + if (DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS && (event.key === "Escape")) { + SphinxHighlight.hideSearchWords(); + event.preventDefault(); + } + }); + }, +}; + +_ready(() => { + /* Do not call highlightSearchWords() when we are on the search page. + * It will highlight words from the *previous* search query. 
+ */ + if (typeof Search === "undefined") SphinxHighlight.highlightSearchWords(); + SphinxHighlight.initEscapeListener(); +}); diff --git a/docs/_build/html/ancestry.html b/docs/_build/html/ancestry.html new file mode 100644 index 0000000..ec097ed --- /dev/null +++ b/docs/_build/html/ancestry.html @@ -0,0 +1,119 @@ + + + + + + + + + Ancestry Prediction — my-project 0.0.1 documentation + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Ancestry Prediction

+

Background:
+The mixture deconvolution algorithm requires allele frequency data. While we’ve found that using global allele frequencies generally performs quite well, using allele frequencies closely matched to the population group of the contributor can increase accuracy.
+Given that forensic samples are almost always from individuals of unknown ancestry, we’ve developed a method that uses the inferred genotypes from the mixture deconvolution step to predict the ancestry of each contributor using principal component analysis (PCA).
+PCA uses the genotypes of individuals of known ancestry, in this case the 2,504 1000 Genomes samples, to create a statistical framework for predicting the ancestry of the unknown sample. PCA transforms high-dimensionality data (here, the genomic data) into a lower-dimensionality space in the form of principal components, or PCs. PC1 explains the largest amount of variance in the data, PC2 the second largest, and so forth. With genetic data, biogeographical ancestry drives the top PCs because it accounts for most of the variation in the data.
+Plotting the PCs against each other (and coloring samples by population) allows one to see the separation and clustering of populations, often down to the subpopulation level. Where the unknown sample falls within the plot (along with some distance calculations) allows the user to assign a population to the sample, assuming it falls within a known cluster.
+Generally, ancestry can be visualized by plotting PC1 vs. PC2. However, we found that adding the third PC in a 3-dimensional space provides the best separation of the populations. MixDeR therefore creates 3-D ancestry plots and saves them as .html files; these interactive plots are more insightful than standard 2-D PCA plots.
+MixDeR calculates the Euclidean distance from the centroid of each superpopulation to the unknown sample and ranks the superpopulations by distance, smallest first. Note that the top-ranked superpopulation does NOT always match the actual ancestry of the unknown; it is simply the closest population distance-wise. The user must consider the actual value of that distance and examine the accompanying PCA plot to judge the likely accuracy of the prediction. If the unknown sample falls outside of the top-ranked population's cluster, it likely does not closely match that population. While this can reflect the quality of the sample and/or the PCA, it can also occur because the unknown's ancestry is not represented in the 1000 Genomes database. For example, Ashkenazi Jews are not included in the 1000 Genomes dataset and, given their genetic divergence from other European populations, would fall outside of the EUR superpopulation (at least that’s what we’ve found!). All of these factors must be considered when evaluating the ancestry prediction results.
+MixDeR does not make its own ancestry determination; it provides the data and plots for the user to make the determination themselves. Analyzing samples of known ancestry is essential to understanding the strengths and weaknesses of the method, and the developers strongly encourage users to perform validation studies before making any ancestry predictions.
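To make the centroid-distance ranking concrete, here is a minimal, self-contained R sketch of the general approach, assuming a numeric genotype matrix (samples in rows, SNPs coded as alternate-allele counts) with the unknown contributor in the last row; the function and object names are illustrative, not MixDeR's internal API.

rank_superpops = function(geno, superpop, n_pcs = 3) {
  pca = prcomp(geno, center = TRUE)                # PCA on the combined genotype matrix
  pcs = pca$x[, seq_len(n_pcs), drop = FALSE]      # keep the top PCs (here, PC1-PC3)
  unknown = pcs[nrow(pcs), ]                       # the unknown contributor (last row)
  known = pcs[-nrow(pcs), , drop = FALSE]          # the reference samples of known ancestry
  ## centroid of each superpopulation in PC space
  centroids = apply(known, 2, function(pc) tapply(pc, superpop, mean))
  ## Euclidean distance from each centroid to the unknown, smallest first
  sort(sqrt(rowSums(sweep(centroids, 2, unknown)^2)))
}

## Toy usage with simulated genotypes (0/1/2 alternate-allele counts)
set.seed(1)
geno = matrix(rbinom(101 * 50, 2, 0.3), nrow = 101)
superpop = sample(c("AFR", "AMR", "EAS", "EUR", "SAS"), 100, replace = TRUE)
rank_superpops(geno, superpop)

As in MixDeR's output, the smallest distance only identifies the nearest reference population; it is not by itself an ancestry determination.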

+
+
+

Running the Ancestry Prediction tool

+

When the Shiny app is launched, the ancestry prediction tool appears first. This tool can be skipped by checking the Skip Ancestry Prediction Step box.
+The user must select whether to use the Superpopulation groups (AFR, AMR, EAS, EUR, and SAS) and/or the subpopulation groups (e.g. Toscani in Italy, Esan in Nigeria, etc.). If both are selected, PCA plots (colored by either superpopulation or subpopulation) and centroid calculations are produced for both.
+The ancestry prediction tool first performs mixture deconvolution using the 1000 Genomes Global allele frequency data and then filters genotypes using the specified settings (allele 1/2 probability thresholds, # of SNP bins, the static/dynamic AT, minimum # of SNPs). A conditioned and/or unconditioned deconvolution can be performed. A sample manifest and a folder containing the mixture Sample Reports (as well as the reference Sample Reports, if performing a conditioned deconvolution) are required. Please see the section entitled “Running mixture deconvolution using EuroForMix” for further information and guidance on these settings.
+The user can select whether to use only the 94 ancestry SNPs or all autosomal SNPs for PCA (the deconvolution step is performed using all SNPs). While the 94 ancestry SNPs do an ample job with good-quality samples and are quite fast, using all SNPs provides better clustering but is significantly slower (several minutes per contributor). It is important that the user weigh the benefits and drawbacks of each SNP set to determine the best option for their data and system.
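To illustrate the interactive 3-D plots described above, here is a minimal sketch using plotly (the library MixDeR imports for these plots); the PC values and column names below are simulated stand-ins, not MixDeR's internal objects.

library(plotly)

set.seed(1)
pcs = data.frame(PC1 = rnorm(100), PC2 = rnorm(100), PC3 = rnorm(100),
                 pop = sample(c("AFR", "AMR", "EAS", "EUR", "SAS"), 100, replace = TRUE))

## Interactive 3-D scatter plot of the top three PCs, colored by (super)population
p = plot_ly(pcs, x = ~PC1, y = ~PC2, z = ~PC3, color = ~pop,
            type = "scatter3d", mode = "markers", marker = list(size = 3))

## Save as a standalone interactive .html file (htmlwidgets ships as a plotly dependency)
htmlwidgets::saveWidget(p, "ancestry_pca_3d.html", selfcontained = TRUE)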

+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/_build/html/genindex.html b/docs/_build/html/genindex.html new file mode 100644 index 0000000..3fddc5b --- /dev/null +++ b/docs/_build/html/genindex.html @@ -0,0 +1,105 @@ + + + + + + + + Index — MixDeR 1.0.0 documentation + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + +
  • +
  • +
+
+
+
+
+ + +

Index

+ +
+ +
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/_build/html/index.html b/docs/_build/html/index.html new file mode 100644 index 0000000..c06535d --- /dev/null +++ b/docs/_build/html/index.html @@ -0,0 +1,162 @@ + + + + + + + + + reStructuredText — MixDeR 1.0.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

reStructuredText

+

This main document is in ‘reStructuredText’ (“rst”) format, +which differs in many ways from standard markdown commonly used in R packages. +rst is richer and more powerful than markdown. The remainder of this main +document demonstrates some of the features, with links to additional rst +documentation to help you get started. The definitive argument for the benefits +of rst over markdown is the official language format documentation, which starts with a very clear +explanation of the benefits.

+
+

Examples

+

All of the following are defined within the docs/index.rst file. Here is +some coloured text which demonstrates how raw HTML commands can be +incorporated. The following are examples of rst “admonitions”:

+
+

Note

+

Here is a note

+
+

Warning

+

With a warning inside the note

+
+
+
+

See also

+

The full list of ‘restructuredtext’ directives or a similar list of admonitions.

+
+

+This is a line of centered text

    +
  • and here is

  • +
  • A list of

  • +
+
    +
  • short items

  • +
  • that are

  • +
+
    +
  • displayed

  • +
  • in 3 columns

  • +
+
+

The remainder of this document shows three tables of contents for the main +README (under “Introduction”), and the vignettes and R directories of +a package. These can be restructured any way you like by changing the main +docs/index.rst file. The contents of this file – and indeed the contents +of any readthedocs file – can be viewed by +clicking View page source at the top left of any page.

+
+

Introduction:

+ +
+
+
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/_build/html/mixder.html b/docs/_build/html/mixder.html new file mode 100644 index 0000000..25ffbfb --- /dev/null +++ b/docs/_build/html/mixder.html @@ -0,0 +1,607 @@ + + + + + + + + + MixDeR - Current Version: 1.0 — MixDeR 1.0.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ + +
+

MixDeR - Current Version: 1.0

+ + +

MixDeR (Mixture Deconvolution in R) is a workflow (with a +Shiny app) for performing mixture deconvolution of ForenSeq +Kintelligence SNP data for two-person mixtures using +EuroForMix and creating +GEDmatch PRO reports for the individual contributor SNP profiles.

+

This method requires extensive validation of the settings. MixDeR +provides the option of calculating various metrics for evaluating the +accuracy of the deduced SNP genotypes. This is extremely useful when +determining settings, specifically the allele 1 probability threshold, +the allele 2 probability threshold, and the minimum number of SNPs.

+

Note: MixDeR (and EFM) assume the mixture samples are composed of two +contributors. MixDeR is able to identify and alert the user to samples +that may be either single source or a mixture with a large mixture +ratio (i.e. > 1:100 between contributors). In these scenarios, the user +is warned to be cautious with the minor contributor’s inferred +genotyping results.

+

Please cite the following paper if utilizing MixDeR:

+
Mitchell, R., Peck, M., Gorden, E., & Just, R. (2025). MixDeR: A SNP mixture 
+deconvolution workflow for forensic genetic genealogy. Forensic Science 
+International: Genetics, 76, 103224, doi: 10.1016/j.fsigen.2025.103224
+
+
+
+

Installation

+

For any installation, EuroForMix must be installed. Please follow the +instructions from the EuroForMix GitHub +page. EuroForMix version 4.2.4 +and earlier have been tested and are compatible with MixDeR. If using a +newer version, please be aware it has not been tested and errors may +occur!
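The exact commands are given on the EuroForMix GitHub page and are authoritative; as a hedged illustration, a typical devtools-based install would look like the following (the repository name comes from the EFM GitHub URL):

install.packages("devtools")
# install EuroForMix from its GitHub repository
devtools::install_github("oyvble/euroformix")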

+

If installing from GitHub:
+The R package devtools is required to install from GitHub:

+
install.packages("devtools")
+devtools::install_github("bioforensics/mixder")
+
+
+

If installing from source, first install the following R packages:

+
install.packages(c("dplyr", "ggplot2", "glue", "kgp", "plotly", "prompter", "readxl", "rlang", "shiny", "shinyFiles", "shinyjs", "tibble", "tidyr"))
+
+
+

To install MixDeR from source (i.e. the mixder_1.0.tar.gz file):

+
install.packages("/path/to/mixder_0.1.0.tar.gz", repos = NULL, type="source")
+
+
+

For example, if the mixder_1.0.tar.gz file is located in your +Documents folder:

+
install.packages("~/Documents/mixder_0.1.0.tar.gz", repos = NULL, type="source")
+
+
+
+
+

Usage

+

To launch the shiny app:

+
library(mixder)
+mixder()
+
+
+
+
+

Required files

+
    +
  1. Mixture Kintelligence SNP profiles
    +This can be in the form of the UAS Sample Report or a TSV file with +the below format:

  2. +
+ + + + + + + + + + + + + + + + + + + + + + + + + +

Marker

Allele

Reads

rs12615742

T

0

rs12615742

C

134

rs16885694

G

43

rs16885694

A

63

+

The files should be tab delimited and named with a .tsv extension, +such as: SampleID.tsv.

+

If using the Shiny app, you must specify the folder containing these SNP +files. Multiple samples (with multiple files each) can be in the same +folder. Additional files may be present in the folder and will be +ignored by MixDeR.

+

MixDeR will divide the entire Kintelligence dataset into more manageable +sets (organized by total SNP read depth) to run through EFM, which is +ideal for performance. The user may specify how many sets the program +will use (see below); the default is 10 sets. The user must also specify +how many sets are provided so MixDeR knows how many files to process per +sample.
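As an illustration only (a sketch under assumed column names, not MixDeR’s internal code), organizing markers into read-depth sets could look like this, given a data frame snps with the Marker and Reads columns shown in the table above:

library(dplyr)

n_sets <- 10  # default number of SNP sets

snp_sets <- snps %>%
  group_by(Marker) %>%
  summarise(total_reads = sum(Reads)) %>%
  # ntile() ranks markers by total read depth and splits them into
  # n_sets roughly equal-sized bins
  mutate(set = ntile(total_reads, n_sets))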

+

The default is for MixDeR to use previously created SNP sets, if present +in the specified input folder. If this option is unselected, MixDeR will +create new SNP set files, overwriting any previously made files.

+
    +
  1. The sample manifest
    +This file lists the Sample IDs of the files to run. The SampleID +is extracted from the Sample Name field in the Sample Report. The +column names do not matter, only the order; both columns +must be present even if no replicates are included. If a single +sample is run, only that ID needs to be in the first column and +the second column should be left blank. If a sample is run in +replicate, the replicate ID should be listed in the second +column.
    +This file lists the Sample IDs of the files to run. The SampleID +is extracted from the Sample Name field in the Sample Report. The +columns names do not matter, just the order and both columns +must be present even if no replicates are included. If a single +sample is run, only the single ID needs to be in the first column, +the second column should be left blank. If a second sample is to be +run in replicate, the replicate ID should be listed in the second +column.

  2. +
+ + + + + + + + + + + + + + +

SampleID

ReplicateID

Sample01a

Sample01a

Sample01b

+
+
+

Other files which may be required

+

Allele frequency file

+

MixDeR provides general population allele frequencies for Kintelligence +SNPs from either the 1000 Genomes Phase 3 +dataset or the gnomAD v4 +dataset. However, it is +ideal to use allele frequencies derived from the population that closely +matches the contributor(s) of interest. Therefore, MixDeR provides the +user the opportunity to upload a different allele frequency file. +EuroForMix requires the format below for allele frequency files, with +each SNP as its own column:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Allele

rs6690515

rs424079

rs2055204

A

0.122837

0.64677

0.501441

C

NA

0.35323

NA

G

0.877163

NA

0.498559

T

NA

NA

NA

+

Given the difficulty of formatting the data as such, MixDeR will create +this format for the user if the user provides the frequency data in a +CSV file with the following format (NOTE: the column names MUST match +those below; the column order and any additional columns do not affect +processing).

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

SNP

Ref

Alt

Ref_AF

Alt_AF

rs6690515

A

C

0.35323

0.64677

rs424079

T

C

0.122837

0.877163

rs2055204

G

A

0.501441

0.498559

+
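For orientation, here is a minimal sketch of that conversion (this is not MixDeR’s internal code), assuming the CSV above has been read into a data frame af with the five columns named there:

library(dplyr)
library(tidyr)

af <- read.csv("custom_af.csv")  # hypothetical input file

efm_af <- af %>%
  # one row per (SNP, allele) pair, keeping the matching frequency
  pivot_longer(c(Ref, Alt), names_to = "role", values_to = "Allele") %>%
  mutate(freq = ifelse(role == "Ref", Ref_AF, Alt_AF)) %>%
  select(SNP, Allele, freq) %>%
  # one column per SNP; alleles absent for a SNP become NA
  pivot_wider(names_from = SNP, values_from = freq) %>%
  arrange(Allele)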

MixDeR provides the option to use the same allele frequency file for +both the major and minor contributor or to select different allele +frequency files for each. If two different +frequency files are selected, MixDeR will run EFM twice: once using the +allele frequency file for the major contributor (extracting the +inferred genotypes for the major contributor from those results) and +once using the allele frequency file for the minor contributor (extracting the +inferred genotypes for the minor contributor from those +results).

+

Reference Genotypes

+

If calculating validation metrics or performing a conditioned +deconvolution, the reference genotypes are required.

+

There are two options for providing reference genotypes. MixDeR accepts +the UAS Sample Reports, stored in a separate folder. A second option is +to provide a single CSV file, named +EFM_references.csv, containing all references in the following format:

+ + + + + + + + + + + + + + + + + + + + +

Sample.Name

Marker

Allele1

Allele2

Sample01

rs12615742

T

C

Sample01

rs16885694

G

G

+

The user must provide the folder containing either the Sample Reports or +the CSV reference file.
+NOTE: MixDeR first searches for the CSV file in the provided folder. If +there are additional wanted references not contained in this file, +please remove the CSV file and run MixDeR again. MixDeR creates the CSV +file containing genotypes from the Sample Reports within the provided +folder.

+
+
+

MixDeR Details

+

MixDeR has four modules:

+
    +
  1. Ancestry Prediction

  2. +
  3. EFM mixture deconvolution

  4. +
  5. Calculate validation metrics

  6. +
  7. Create GEDmatch PRO reports

  8. +
+

EFM mixture deconvolution must be run at least once. If it’s been run +previously, the other modules can be run using the existing +deconvolution data.

+

MixDeR can either calculate validation metrics OR create the GEDmatch +PRO reports during a single run, not both.

+

NOTE about the allele probability thresholds:
+This workflow utilizes individual probabilities for each allele call +from EFM. The reported allele 1 is the allele with the higher +probability; allele 2 is the allele with the lower probability. When +applying the allele 1 probability threshold, any SNP with a probability +below the threshold will be removed completely from the dataset. When +applying the allele 2 probability threshold to the remaining SNPs, if +the probability is below the threshold, allele 2 is reported as the same +allele as allele 1. If it is above, the allele will be reported as +called. For example, if the genotype for SNP rs12615742 is C,T but the +allele 2 probability is below the threshold, the SNP genotype will be +reported as C,C. If it is above the threshold, the SNP genotype will be +reported as C,T.

+
+
+

Ancestry Prediction - Background

+

The mixture deconvolution algorithm requires allele frequency data. While we’ve found that using global allele frequencies generally performs quite well, using allele frequencies closely matched to the population group of the contributor can increase accuracy.

+

Given that forensic samples are almost always from individuals of unknown ancestry, we’ve developed a method that uses the inferred genotypes from mixture deconvolution to predict the ancestry of each contributor via principal component analysis (PCA).

+

PCA uses the genotypes from individuals of known ancestry, in this case the 2,504 1000 Genomes samples, to create a statistical framework for predicting the ancestry of the unknown sample. PCA transforms high-dimensionality data (here, the genomic data) into a lower-dimensionality space in the form of principal components, or PCs. PC1 explains the highest amount of variance in the data, PC2 explains the second highest amount, and so forth. With genetic data, biogeographical ancestry drives the top PCs because it accounts for most of the variation in the data.

+

Plotting the PCs against each other (and coloring samples by population) allows one to see the separation and clustering of populations, oftentimes down to the subpopulation level. Where the unknown sample falls within the plot (along with some distance calculations) allows the user to assign a population to the sample, assuming it falls within a known cluster.

+

Generally, ancestry can be visualized by plotting PC1 vs. PC2. However, we found that adding the third PC in a 3-dimensional space provides the best separation of the populations. MixDeR creates 3-D ancestry plots and saves them as .html files. These interactive plots are more insightful than the standard 2-D PCA plots.
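As a rough sketch of what such an interactive 3-D plot involves (MixDeR generates its plots internally; the object and column names here are hypothetical), assuming pcs is a data frame with PC1–PC3 and a pop label per sample:

library(plotly)

fig <- plot_ly(
  pcs,
  x = ~PC1, y = ~PC2, z = ~PC3,
  color = ~pop,                      # superpopulation or subpopulation label
  type = "scatter3d", mode = "markers",
  marker = list(size = 3)
)

# save the interactive plot as a standalone .html file
htmlwidgets::saveWidget(fig, "ancestry_pca_3d.html")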

+

MixDeR calculates the Euclidean distance from the centroid of each superpopulation to the unknown sample and ranks the superpopulations by that distance, with the smallest distance on top. It should be noted that the top-ranked superpopulation does NOT always match the actual ancestry of the unknown; it simply lists the closest population distance-wise. The user must consider the actual value of that distance and examine the accompanying PCA plot to determine the potential accuracy of the prediction. If the unknown sample falls outside of the top-ranked population, it likely does not closely match that population. While this can reflect the quality of the sample and/or the PCA, it could also occur because the unknown’s ancestry does not match those in the 1000 Genomes database. For example, Ashkenazi Jews are not included in the 1000 Genomes dataset and, given their genetic divergence from other European populations, would fall outside of the EUR superpopulation (at least that’s what we’ve found!). All of these factors must be considered when evaluating the ancestry prediction results.
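The distance calculation itself is straightforward. A minimal sketch (hypothetical object names; not MixDeR’s internal code), assuming pcs holds PC1–PC3 plus a pop label for the 1000 Genomes samples, and unknown holds the unknown sample’s coordinates:

library(dplyr)

# centroid of each superpopulation in PC space
cents <- pcs %>%
  group_by(pop) %>%
  summarise(across(c(PC1, PC2, PC3), mean))

# Euclidean distance from each centroid to the unknown sample,
# ranked with the smallest distance on top
cents %>%
  mutate(distance = sqrt((PC1 - unknown$PC1)^2 +
                         (PC2 - unknown$PC2)^2 +
                         (PC3 - unknown$PC3)^2)) %>%
  arrange(distance)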

+

MixDeR does not make its own ancestry determination, but provides the data and plots for the user to make the determination themselves. Analyzing samples of known ancestry is pertinent to understanding the strengths and weaknesses of the method, and the developers strongly encourage users to perform validation studies before making any ancestry predictions.

+
+
+

Running the Ancestry Prediction tool

+

When the Shiny app is launched, the ancestry prediction tool appears first. This tool can be skipped by checking the Skip Ancestry Prediction Step box.

+

The user must select whether they want to use the Superpopulation groups (AFR, AMR, EAS, EUR, and SAS) and/or the subpopulation groups (e.g. Toscani in Italy, Esan in Nigeria, etc.). If both are selected, PCA plots (coloring by either superpopulation or subpopulation) and centroid calculations are performed for both.
+The ancestry prediction tool will first perform mixture deconvolution using the 1000 Genomes Global allele frequency data and perform genotype filtering using the specified settings (allele 1/2 probability thresholds, # of SNP bins, the static/dynamic AT, minimum # of SNPs). A conditioned and/or unconditioned deconvolution can be performed. A sample manifest and a folder containing the mixture Sample Reports (as well as the reference Sample Reports, if performing a conditioned deconvolution) are required. Please see the section entitled “Running mixture deconvolution using EuroForMix” for further information and guidance on these settings.

+

The user can select whether to use only the 94 ancestry SNPs or all autosomal SNPs for PCA (the deconvolution step is performed using all SNPs). While the 94 ancestry SNPs do an adequate job with good-quality samples and are quite fast, using all SNPs provides better clustering but is significantly slower (several minutes per contributor). It is important that the user weighs the benefits vs. drawbacks of each set of SNPs to determine the best option for their data and system.

+
+
+

Running mixture deconvolution using EuroForMix

+

There are several options/settings to run EFM mixture deconvolution:
+The type of mixture deconvolution analysis to perform (one or both can +be selected at once):

+
    +
  • Unconditioned analysis

  • +
  • Conditioned analysis

  • +
+

Allele Frequency Data: The user must choose which allele frequency +data to use: 1000 Genomes Phase 3 data, gnomAD v4 data, or upload a +custom file. See above for more details about the format for uploading a +custom AF file.
+References to Condition on: If running a conditioned analysis, once +the reference folder has been uploaded, this dropdown menu will +auto-populate with the reference sample IDs. The user must select +which references to condition on. MixDeR can only condition on a single +reference sample at a time; however, multiple references can be +selected for conditioning, and the conditioned analyses will be run +separately.
+Number of SNP Bins: The number of SNP sets for each sample. The +default is 10.
+Static Analytical Threshold: The minimum number of reads required +for a SNP to be included (default = 10).
+Dynamic Analytical Threshold: The percent of total SNP reads +required for a SNP to be included (default = 0.015 or 1.5%). (A quick +note on the ATs: both the static and dynamic ATs are applied, and the one +producing the higher AT will be used in the EFM software; see the short +sketch after this list.)
+Output Folder Name: The name of the folder created within the folder +containing the original SNP files to store the generated output for the +entire workflow.
+Minimum Number of SNPs: The minimum number of SNPs required in a +contributor’s inferred genotype profile (the default is 6000; see +“Creating GEDmatch PRO Reports” below).
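As referenced above, a small hedged illustration of how the two analytical thresholds combine (the variable names and the total read count are hypothetical):

static_at   <- 10        # Static Analytical Threshold (read count)
dynamic_pct <- 0.015     # Dynamic Analytical Threshold (fraction of reads)
total_reads <- 250000    # assumed total SNP reads for the sample

# both ATs are computed and the higher one is passed to EFM
effective_at <- max(static_at, dynamic_pct * total_reads)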

+
+
+

Calculating validation metrics

+

There are several options/settings to calculate the validation +metrics:
+Minimum Allele 1 Probability Threshold and Maximum Allele 1 +Probability Threshold: The range of allele 1 probability thresholds +for calculating the validation metrics, increasing in increments of 0.01 +(e.g. if the minimum is set to 0.5 and the maximum to 1, metrics will be +calculated using thresholds of 0.5, 0.51, 0.52, 0.53, up to 1).
+Minimum Allele 2 Probability Threshold and Maximum Allele 2 +Probability Threshold: The range of allele 2 probability thresholds +for calculating the validation metrics, increasing in increments of 0.01 +(e.g. if the minimum is set to 0.5 and the maximum to 1, metrics will be +calculated using thresholds of 0.5, 0.51, 0.52, 0.53, up to 1).
+Major Contributor Sample ID: The ID of the major contributor of the +mixture. Once the reference folder is uploaded, this dropdown menu will +auto-populate with the reference sample IDs.
+Minor Contributor Sample ID: The ID of the minor contributor of the +mixture. Once the reference folder is uploaded, this dropdown menu will +auto-populate with the reference sample IDs.
+Remove SNPs If Missing Either Allele?: If an allele 1 is inferred to +be missing (reported as 99), that SNP will be automatically dropped +from the final dataset. By default, if an allele 2 is inferred to be +missing and the allele 2 probability is above the allele 2 probability +threshold, the SNP is reported as homozygous for allele 1. However, +selecting this option will result in dropping the SNP if the allele 2 +probability of the missing allele 2 is above the allele 2 probability +threshold, instead of reporting the SNP as homozygous for allele 1.

+

If calculating validation metrics, reference genotypes are required to +calculate genotype accuracy. MixDeR will calculate the metrics for the +range of allele 1 probability thresholds and allele 2 probability +thresholds specified by the user. The final output file looks like this:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

A1 cutoff

A2 cutoff

Total SNPs

N No Ref

N SNPs tested

N Genotypes Correct

Genotype Accuracy

Heterozygosity

0.99

0.01

9735

8

9727

9549

0.9817

0.456

0.99

0.02

9735

8

9727

9548

0.9815

0.456

0.99

0.03

9735

8

9727

9548

0.9815

0.456

+
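For intuition, the Genotype Accuracy and Heterozygosity columns reduce to simple proportions. A hedged sketch (hypothetical column names; not MixDeR’s internal code), assuming a merged data frame geno with inferred alleles (Allele1, Allele2) and reference alleles (Ref1, Ref2) for the SNPs tested:

n_tested <- nrow(geno)  # "N SNPs tested": SNPs with a reference genotype

# a genotype is correct if the unordered allele pairs match
correct <- with(geno, (Allele1 == Ref1 & Allele2 == Ref2) |
                      (Allele1 == Ref2 & Allele2 == Ref1))

genotype_accuracy <- sum(correct) / n_tested
heterozygosity    <- mean(geno$Allele1 != geno$Allele2)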
+
+

Creating GEDmatch PRO Reports

+

Allele 1 Probability Threshold to create GEDmatch PRO Report and +Allele 2 Probability Threshold to create GEDmatch PRO Report: The +allele 1 and allele 2 probability thresholds. If the contributor is the +major contributor to the mixture, MixDeR will apply the allele 1 and +allele 2 probability thresholds UNLESS this results in a profile with +fewer SNPs than the specified minimum (6000 is the default). If the +profile does not meet this minimum, the top 6,000 SNPs (or whatever the +user specifies as the minimum) with the highest allele 1 probabilities +are used and then the allele 2 probability threshold is applied. For +minor contributors, the default setting is that the minimum number of +SNPs is automatically used and then the allele 2 probability threshold +is applied. The option to apply the allele 1 probability threshold (in +the same manner as for the major contributor) can also be selected; see +the sketch below.
+Remove SNPs If Missing Either Allele?: If an allele 1 is inferred to +be missing (reported as 99), that SNP will be automatically dropped +from the final dataset. By default, if an allele 2 is inferred to be +missing and the allele 2 probability is above the allele 2 probability +threshold, the SNP is reported as homozygous for allele 1. However, +selecting this option will result in dropping the SNP if the allele 2 +probability of the missing allele 2 is above the allele 2 probability +threshold, instead of reporting the SNP as homozygous for allele 1.
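A condensed sketch of the threshold-plus-minimum-SNP logic described above for a major contributor (hypothetical column names Prob1 and Prob2; not MixDeR’s internal code):

min_snps     <- 6000
a1_threshold <- 0.99
a2_threshold <- 0.5

# apply the allele 1 probability threshold first
kept <- geno[geno$Prob1 >= a1_threshold, ]

# if too few SNPs survive, fall back to the top min_snps SNPs
# with the highest allele 1 probabilities instead
if (nrow(kept) < min_snps) {
  kept <- head(geno[order(-geno$Prob1), ], min_snps)
}

# the allele 2 probability threshold is then applied to the retained SNPs
kept$Allele2 <- ifelse(kept$Prob2 >= a2_threshold, kept$Allele2, kept$Allele1)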

+

As a way to assist the analyst in evaluating the inferred genotypes of a +mixture of unknown contributors, several metrics are calculated in this +step for three different scenarios: (1) only the allele 2 probability +threshold applied; (2) the allele 1 and allele 2 probability thresholds +applied; and (3) the minimum number of SNPs used and the allele 2 +probability threshold applied. For each dataset, the following metrics are +calculated: number of SNPs, mean allele 1 probability, median allele 1 +probability, standard deviation (SD) of the allele 1 probabilities, and +heterozygosity. Below is an example of the table created by MixDeR in +this step. A density plot of allele 1 probabilities is also +created. In general, the more SNPs with higher allele 1 probabilities, +the higher the accuracy of the inferred genotypes. However, exactly what +qualifies as acceptable should be determined by individual labs.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Allele1 Threshold Applied

Allele2 Threshold Applied

Total SNPs

Mean A1 Prob

Median A1 Prob

SD A1 Prob

Heterozygosity

No

Yes

10024

0.9984

1

0.0096

0.4626

Yes

Yes

9718

0.9998

1

9.00E-04

0.4569

Minimum # of SNPs

Yes

6000

1

1

0

0.4495

+
+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/_build/html/objects.inv b/docs/_build/html/objects.inv new file mode 100644 index 0000000..94bc57f --- /dev/null +++ b/docs/_build/html/objects.inv @@ -0,0 +1,8 @@ +# Sphinx inventory version 2 +# Project: MixDeR +# Version: +# The remainder of this file is compressed using zlib. +xڅ +0D^{m/=ޭ;h &e@"T22=DѵO8**״tttYN8t !J4 +f![:'tƍ +:Exd*f ck.>$ptXm?Z=ޞJ7Ŝe}0okc \ No newline at end of file diff --git a/docs/_build/html/search.html b/docs/_build/html/search.html new file mode 100644 index 0000000..1713d11 --- /dev/null +++ b/docs/_build/html/search.html @@ -0,0 +1,120 @@ + + + + + + + + Search — MixDeR 1.0.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + +
  • +
  • +
+
+
+
+
+ + + + +
+ +
+ +
+
+ +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/docs/_build/html/searchindex.js b/docs/_build/html/searchindex.js new file mode 100644 index 0000000..71c819b --- /dev/null +++ b/docs/_build/html/searchindex.js @@ -0,0 +1 @@ +Search.setIndex({"alltitles": {"Ancestry Prediction - Background": [[1, "ancestry-prediction-background"]], "Calculating validation metrics": [[1, "calculating-validation-metrics"]], "Creating GEDmatch PRO Reports": [[1, "creating-gedmatch-pro-reports"]], "Examples": [[0, "examples"]], "Installation": [[1, "installation"]], "Introduction:": [[0, null]], "MixDeR - Current Version: 1.0": [[1, null]], "MixDeR Details": [[1, "mixder-details"]], "Other files which may be required": [[1, "other-files-which-may-be-required"]], "Required files": [[1, "required-files"]], "Running mixture deconvolution using EuroForMix": [[1, "running-mixture-deconvolution-using-euroformix"]], "Running the Ancestry Prediction tool": [[1, "running-the-ancestry-prediction-tool"]], "Usage": [[1, "usage"]], "reStructuredText": [[0, null]]}, "docnames": ["index", "mixder"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1}, "filenames": ["index.rst", "mixder.md"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"": 1, "0": 0, "000": 1, "0096": 1, "00e": 1, "01": 1, "015": 1, "02": 1, "03": 1, "04": 1, "1": 0, "10": 1, "100": 1, "1000": 1, "1000genom": 1, "10024": 1, "1016": 1, "103224": 1, "122837": 1, "134": 1, "2": 1, "2025": 1, "3": [0, 1], "35323": 1, "3rd": 1, "4": 1, "43": 1, "4495": 1, "456": 1, "4569": 1, "4626": 1, "498559": 1, "5": 1, "501441": 1, "504": 1, "51": 1, "52": 1, "53": 1, "6": 1, "6000": 1, "63": 1, "64677": 1, "76": 1, "8": 1, "877163": 1, "9": 1, "94": 1, "9548": 1, "9549": 1, "9718": 1, "9727": 1, "9735": 1, "9815": 1, "9817": 1, "99": 1, "9984": 1, "9998": 1, "A": [0, 1], "AT": 1, "ATs": 1, "As": 1, "By": 1, "For": 1, "IF": 1, "If": 1, "In": 1, "It": 1, "NOT": 1, "No": 1, "OR": 1, "The": [0, 1], "There": 1, "These": [0, 1], "To": 1, "With": 0, "________": 1, "a1": 1, "a2": 1, "abl": 1, "about": 1, "abov": 1, "accept": 1, "accompani": 1, "account": 1, "accuraci": 1, "actual": 1, "ad": 1, "addit": [0, 1], "admonit": 0, "af": 1, "affect": 1, "afr": 1, "again": 1, "against": 1, "alert": 1, "algorithm": 1, "all": [0, 1], "allel": 1, "allele1": 1, "allele2": 1, "allow": 1, "almost": 1, "along": 1, "also": 1, "alt": 1, "alt_af": 1, "alwai": 1, "amount": 1, "ampl": 1, "amr": 1, "an": 1, "analys": 1, "analysi": 1, "analyst": 1, "analyt": 1, "analyz": 1, "ani": [0, 1], "app": 1, "appear": 1, "appli": 1, "ar": [0, 1], "argument": 0, "ashkenazi": 1, "assign": 1, "assist": 1, "assum": 1, "auto": 1, "automat": 1, "autosom": 1, "avail": [], "awar": 1, "background": [], "base": 1, "becaus": 1, "been": 1, "befor": 1, "below": 1, "benefit": [0, 1], "best": 1, "better": 1, "between": 1, "bin": 1, "bioforens": 1, "biogeograph": 1, "blank": 1, "both": 1, "box": 1, "c": 1, "calcul": [], "call": 1, "can": [0, 1], "case": 1, "cautiou": 1, "center": 0, "centroid": 1, "chang": 0, "check": 1, "choos": 1, "cite": 1, "clear": 0, "click": 0, "close": 1, "closest": 1, "cluster": 1, "color": 1, "colour": 0, "column": [0, 1], "command": 0, "commonli": 0, "compat": 1, "complet": 1, "compon": 1, 
"compos": 1, "condit": 1, "consid": 1, "consist": 1, "contain": 1, "content": 0, "contributor": 1, "convolut": 1, "correct": 1, "could": 1, "creat": [], "csv": 1, "current": 0, "custom": 1, "cutoff": 1, "d": 1, "data": 1, "databas": 1, "dataset": 1, "de": 1, "deconvolut": [], "deduc": 1, "default": 1, "defin": 0, "definit": 0, "delimit": 1, "demonstr": 0, "densiti": 1, "depth": 1, "deriv": 1, "determin": 1, "develop": 1, "deviat": 1, "devtool": 1, "differ": [0, 1], "difficulti": 1, "dimension": 1, "direct": 0, "directori": 0, "displai": 0, "distanc": 1, "diverg": 1, "divid": 1, "do": 1, "doc": 0, "document": [0, 1], "doe": 1, "doi": 1, "down": 1, "dplyr": 1, "drawback": 1, "drive": 1, "drop": 1, "dropdown": 1, "dure": 1, "dynam": 1, "e": 1, "ea": 1, "each": 1, "earlier": 1, "efm": 1, "efm_refer": 1, "either": 1, "encourag": 1, "entir": 1, "entitl": 1, "error": 1, "esan": 1, "etc": 1, "euclidean": 1, "eur": 1, "euroformix": [], "european": 1, "evalu": 1, "even": 1, "exactli": 1, "examin": 1, "exampl": 1, "exist": 1, "explain": 1, "explan": 0, "extens": 1, "extract": 1, "extrem": 1, "factor": 1, "fall": 1, "fast": 1, "featur": 0, "fewer": 1, "field": 1, "file": 0, "filter": 1, "final": 1, "first": 1, "folder": 1, "follow": [0, 1], "forens": 1, "forenseq": 1, "form": 1, "format": [0, 1], "forth": 1, "found": 1, "four": 1, "framework": 1, "frequenc": 1, "from": [0, 1], "fsigen": 1, "full": 0, "further": 1, "g": 1, "genealogi": 1, "gener": 1, "genet": 1, "genom": 1, "genotyp": 1, "get": 0, "ggplot2": 1, "github": 1, "given": 1, "global": 1, "glue": 1, "gnomad": 1, "good": 1, "gorden": 1, "group": 1, "guidanc": 1, "gz": 1, "ha": 1, "have": 1, "help": 0, "here": [0, 1], "heterozygos": 1, "high": 1, "higher": 1, "highest": 1, "homozyg": 1, "how": [0, 1], "howev": 1, "html": [0, 1], "i": [0, 1], "id": 1, "ideal": 1, "identifi": 1, "ignor": 1, "import": 1, "includ": 1, "incorpor": 0, "increas": 1, "increment": 1, "inde": 0, "index": 0, "individu": 1, "infer": 1, "inform": 1, "input": 1, "insid": 0, "insight": 1, "install_github": 1, "instead": 1, "instruct": 1, "interact": 1, "interest": 1, "intern": 1, "itali": 1, "item": 0, "its": 1, "j": 1, "jew": 1, "job": 1, "just": 1, "kintellig": 1, "know": 1, "known": 1, "lab": 1, "languag": 0, "larg": 1, "launch": 1, "least": 1, "left": [0, 1], "level": 1, "librari": 1, "like": [0, 1], "line": 0, "link": 0, "list": [0, 1], "locat": 1, "look": 1, "lower": 1, "m": 1, "made": 1, "main": 0, "major": 1, "make": 1, "manag": 1, "mani": [0, 1], "manifest": 1, "manner": 1, "manuscript": [], "markdown": 0, "marker": 1, "match": 1, "matter": 1, "maximum": 1, "mean": 1, "median": 1, "meet": 1, "menu": 1, "method": 1, "minimum": 1, "minor": 1, "minut": 1, "miss": 1, "mitchel": 1, "mix": 1, "mixder": 0, "mixder_0": 1, "mixderr": 1, "mixtur": [], "modul": 1, "more": [0, 1], "most": 1, "multipl": 1, "must": 1, "n": 1, "na": 1, "name": 1, "need": 1, "new": 1, "newer": 1, "nigeria": 1, "note": [0, 1], "now": 1, "null": 1, "number": 1, "occur": 1, "offici": 0, "oftentim": 1, "onc": 1, "one": 1, "onli": 1, "opportun": 1, "option": 1, "order": 1, "organ": 1, "origin": 1, "other": [], "output": 1, "outsid": 1, "over": 0, "overwrit": 1, "own": 1, "packag": [0, 1], "page": [0, 1], "paper": 1, "path": 1, "pc": 1, "pc1": 1, "pc2": 1, "pca": 1, "peck": 1, "per": 1, "percent": 1, "perform": 1, "person": 1, "pertin": 1, "phase": 1, "pleas": 1, "plot": 1, "popul": 1, "potenti": 1, "power": 0, "preprint": [], "present": 1, "previous": 1, "princip": 1, "prob": 1, "probabl": 1, "process": 
1, "produc": 1, "profil": 1, "program": 1, "prompter": 1, "provid": 1, "qualifi": 1, "qualiti": 1, "quick": 1, "quit": 1, "r": [0, 1], "rang": 1, "rank": 1, "ratio": 1, "raw": 0, "read": 1, "readm": 0, "readthedoc": 0, "readxl": 1, "ref": 1, "ref_af": 1, "refer": 1, "reflect": 1, "remain": 1, "remaind": 0, "remov": 1, "replic": 1, "replicateid": 1, "repo": 1, "report": [], "requir": [], "restructur": 0, "result": 1, "richer": 0, "rlang": 1, "rs12615742": 1, "rs16885694": 1, "rs2055204": 1, "rs424079": 1, "rs6690515": 1, "rst": 0, "sa": 1, "same": 1, "sampl": 1, "sample01": 1, "sample01a": 1, "sample01b": 1, "sampleid": 1, "save": 1, "scenario": 1, "scienc": 1, "sd": 1, "search": 1, "second": 1, "section": 1, "see": 1, "select": 1, "separ": 1, "set": 1, "sever": 1, "shini": 1, "shinyfil": 1, "shinyj": 1, "short": 0, "should": 1, "show": 0, "significantli": 1, "similar": [0, 1], "singl": 1, "skip": 1, "slower": 1, "smallest": 1, "snp": 1, "so": 1, "softwar": 1, "some": [0, 1], "sourc": [0, 1], "space": 1, "specif": 1, "specifi": 1, "standard": [0, 1], "start": 0, "static": 1, "statist": 1, "step": 1, "store": 1, "strength": 1, "strongli": 1, "studi": 1, "subpopul": 1, "superpopul": 1, "system": 1, "t": 1, "tab": 1, "tabl": [0, 1], "tar": 1, "test": 1, "text": 0, "than": [0, 1], "thei": 1, "them": 1, "themselv": 1, "therefor": 1, "thi": [0, 1], "those": 1, "three": [0, 1], "threshold": 1, "through": 1, "tibbl": 1, "tidyr": 1, "top": [0, 1], "toscani": 1, "total": 1, "transform": 1, "tsv": 1, "ture": 1, "twice": 1, "two": 1, "type": 1, "ua": 1, "uncondit": 1, "under": 0, "understand": 1, "unknown": 1, "unless": 1, "unselect": 1, "up": 1, "upload": 1, "us": 0, "user": 1, "util": 1, "v": 1, "v4": 1, "valid": [], "valu": 1, "varianc": 1, "variat": 1, "variou": 1, "ve": 1, "veri": 0, "version": 0, "view": 0, "vignett": 0, "visual": 1, "wai": [0, 1], "want": 1, "warn": [0, 1], "we": 1, "weak": 1, "weight": 1, "well": 1, "what": 1, "whatev": 1, "when": 1, "where": 1, "whether": 1, "which": 0, "while": 1, "wise": 1, "within": [0, 1], "workflow": 1, "would": 1, "ye": 1, "you": [0, 1], "your": 1}, "titles": ["reStructuredText", "MixDeR - Current Version: 1.0"], "titleterms": {"0": 1, "1": 1, "ancestri": 1, "background": 1, "calcul": 1, "creat": 1, "current": 1, "deconvolut": 1, "detail": 1, "euroformix": 1, "exampl": 0, "file": 1, "gedmatch": 1, "instal": 1, "introduct": 0, "mai": 1, "metric": 1, "mixder": 1, "mixtur": 1, "other": 1, "predict": 1, "pro": 1, "report": 1, "requir": 1, "restructuredtext": 0, "run": 1, "tool": 1, "us": 1, "usag": 1, "valid": 1, "version": 1, "which": 1}}) \ No newline at end of file diff --git a/docs/articles/Ancestry_Prediction_Tool.html b/docs/articles/Ancestry_Prediction_Tool.html new file mode 100644 index 0000000..8d79273 --- /dev/null +++ b/docs/articles/Ancestry_Prediction_Tool.html @@ -0,0 +1,187 @@ + + + + + + + +Ancestry Prediction Tool • mixder + + + + + + + + Skip to contents + + +
+ + + + +
+
+ + + +
+

Ancestry Prediction +

+
+

Background: +

+

The mixture deconvolution algorithm requires allele frequency data. +While we’ve found that using global allele frequencies generally +performs quite well, using allele frequencies closely matched to the +population group of the contributor can increase accuracy.

+

Given that forensic samples are almost always from individuals of +unknown ancestry, we’ve developed a method that uses the inferred +genotypes from mixture deconvolution to predict the ancestry of each +contributor via principal component analysis (PCA).

+

PCA uses the genotypes from individuals of known ancestry, in this +case the 2,504 1000 Genomes samples, to create a statistical framework +for predicting the ancestry of the unknown sample. PCA transforms +high-dimensionality data (here, the genomic data) into a +lower-dimensionality space in the form of principal components, or PCs. +PC1 explains the highest amount of variance in the data, PC2 explains +the second highest amount, and so forth. With genetic data, +biogeographical ancestry drives the top PCs because it accounts for +most of the variation in the data.

+

Plotting the PCs against each other (and coloring samples by +population) allows one to see the separation and clustering of +populations, oftentimes down to the subpopulation level. Where the +unknown sample falls within the plot (along with some distance +calculations) allows the user to assign a population to the sample, +assuming it falls within a known cluster.

+

Generally, ancestry can be visualized by plotting PC1 vs. PC2. +However, we found that adding the third PC in a 3-dimensional space +provides the best separation of the populations. MixDeR creates 3-D +ancestry plots and saves them as .html files. These interactive plots +are more insightful than the standard 2-D PCA plots.

+

MixDeR calculates the Euclidean distance from the centroid of each +superpopulation to the unknown sample and ranks the superpopulations +by that distance, with the smallest distance on top. It should be +noted that the top-ranked superpopulation does NOT always match the +actual ancestry of the unknown; it simply lists the closest population +distance-wise. The user must consider the actual value of that distance +and examine the accompanying PCA plot to determine the potential +accuracy of the prediction. If the unknown sample falls outside of +the top-ranked population, it likely does not closely match that +population. While this can reflect the quality of the sample +and/or the PCA, it could also occur because the unknown’s ancestry does +not match those in the 1000 Genomes database. For example, Ashkenazi +Jews are not included in the 1000 Genomes dataset and, given their +genetic divergence from other European populations, would fall outside +of the EUR superpopulation (at least that’s what we’ve found!). All of +these factors must be considered when evaluating the ancestry prediction +results.

+

MixDeR does not make its own ancestry determination, but provides the +data and plots for the user to make the determination themselves. +Analyzing samples of known ancestry is pertinent to understanding the +strengths and weaknesses of the method, and the developers strongly +encourage users to perform validation studies before making any ancestry +predictions.

+
+
+
+

Running the Ancestry Prediction tool +

+

When the Shiny app is launched, the ancestry prediction tool appears +first. This tool can be skipped by checking the +Skip Ancestry Prediction Step box.

+

The user must select whether they want to use the +Superpopulation groups (AFR, AMR, EAS, EUR, and SAS) and/or +the subpopulation groups (e.g. Toscani in Italy, Esan in +Nigeria, etc.). If both are selected, PCA plots (coloring by either +superpopulation or subpopulation) and centroid calculations are +performed for both.

+

The ancestry prediction tool will first perform mixture deconvolution +using the 1000 Genomes global allele frequency data and perform genotype +filtering using the specified settings (allele 1/2 probability +thresholds, # of SNP bins, the static/dynamic AT, minimum # of +SNPs).

+

A conditioned and/or unconditioned deconvolution can be performed. A +sample manifest and a folder containing the mixture Sample Reports (as +well as the reference Sample Reports, if performing a conditioned +deconvolution) are required. Please see the section entitled “Running +Mixture Deconvolution” for further information and guidance on these +settings.

+

The user can select whether to use only the 94 ancestry SNPs or all +autosomal SNPs for PCA (the deconvolution step is performed using all +SNPs). While the 94 ancestry SNPs do a satisfactory job with good +quality samples and are quite fast, using all SNPs provides better +clustering but is significantly slower (several minutes per +contributor). It is important that the user weighs the benefits +vs. drawbacks of each set of SNPs to determine the best option for their +data and system.

+
+
+
+ + + +
+ + + +
+
+ + + + + + + diff --git a/docs/articles/Installation.html b/docs/articles/Installation.html index ea75c8c..a6f5d1b 100644 --- a/docs/articles/Installation.html +++ b/docs/articles/Installation.html @@ -20,7 +20,7 @@ mixder - 0.7.5 + 1.0
diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..94f19d9 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,69 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys +sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'MixDeR' +copyright = '2025, DHS' +author = 'NBFAC' + +# The full version, including alpha/beta/rc tags +release = '1.0.0' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'myst_parser', + 'sphinx.ext.duration', + 'sphinx.ext.doctest', + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.intersphinx', +] + +intersphinx_mapping = { + 'python': ('https://docs.python.org/3/', None), + 'sphinx': ('https://www.sphinx-doc.org/en/master/', None), +} +intersphinx_disabled_domains = ['std'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +#html_theme = 'alabaster' +html_theme = 'sphinx_rtd_theme' + + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] diff --git a/docs/index.html b/docs/index.html index 7c2be9e..e64cdb9 100644 --- a/docs/index.html +++ b/docs/index.html @@ -22,7 +22,7 @@ mixder - 0.7.5 + 1.0 + + + + + +
+
+
+ +
+

Used to run PCA for ancestry prediction

+
+ +
+

Usage

+
ancestry_1000G_allsamples
+
+ +
+

Format

+

A dataframe containing 10,030 columns (one per SNP) and 2,504 rows (one per individual):

SNP
+

SNP rsID with reference allele

+ +
ID
+

Sample ID of individual

+ + +
+
+

Source

+

1000 Genomes (https://www.internationalgenome.org/data)

+
+ +
+ + +
+ + + +
+ + + + + + + diff --git a/docs/reference/ancestry_prediction.html b/docs/reference/ancestry_prediction.html new file mode 100644 index 0000000..4d99dbe --- /dev/null +++ b/docs/reference/ancestry_prediction.html @@ -0,0 +1,110 @@ + +Title Ancestry prediction using PCA — ancestry_prediction • mixder + Skip to contents + + +
+
+
+ +
+

Ancestry prediction using PCA

+
+ +
+

Usage

+
ancestry_prediction(
+  report,
+  path,
+  id,
+  analysis_type,
+  contrib_status,
+  testsnps,
+  groups
+)
+
+ +
+

Arguments

+ + +
report
+

inferred genotypes

+ + +
path
+

write path

+ + +
id
+

sample ID

+ + +
analysis_type
+

mixture deconvolution type (conditioned vs. unconditioned)

+ + +
groups
+

How to color PCA plots (superpopulations and/or subpopulations)

+ +
+
+

Value

+

NA

+
+ +
+ + +
+ + + +
+ + + + + + + diff --git a/docs/reference/ancestrysnps_1000G_allsamples.html b/docs/reference/ancestrysnps_1000G_allsamples.html new file mode 100644 index 0000000..6fa8f84 --- /dev/null +++ b/docs/reference/ancestrysnps_1000G_allsamples.html @@ -0,0 +1,88 @@ + +1000 Genomes genotypes for 54 ancestry SNPs for all 2,504 1000G samples — ancestrysnps_1000G_allsamples • mixder + Skip to contents + + +
+
+
+ +
+

Used to run PCA for ancestry prediction

+
+ +
+

Usage

+
ancestrysnps_1000G_allsamples
+
+ +
+

Format

+

A dataframe containing 54 columns (one per SNP) and 2,504 rows (one per individual):

SNP
+

SNP rsID with reference allele

+ +
ID
+

Sample ID of individual

+ + +
+
+

Source

+

1000 Genomes (https://www.internationalgenome.org/data)

+
+ +
+ + +
+ + + +
+ + + + + + + diff --git a/docs/reference/assigned_A2.html b/docs/reference/assigned_A2.html index 157afda..8ec925f 100644 --- a/docs/reference/assigned_A2.html +++ b/docs/reference/assigned_A2.html @@ -7,7 +7,7 @@ mixder - 0.7.5 + 1.0 + + + + + +
+
+
+ +
+

Centroid Calculations and Plotting

+
+ +
+

Usage

+
centroids(groups, pca, inpath, ID)
+
+ +
+

Arguments

+ + +
groups
+

Groups used for PCA (Superpopulations and/or Subpopulations)

+ + +
pca
+

dataframe of PCs and ancestry

+ + +
inpath
+

input path

+ + +
ID
+

ID for creating file name(s)

+ +
+ +
+ + +
+ + + +
+ + + + + + + diff --git a/docs/reference/check_3_or_4_col.html b/docs/reference/check_3_or_4_col.html index adc4cf7..ee19355 100644 --- a/docs/reference/check_3_or_4_col.html +++ b/docs/reference/check_3_or_4_col.html @@ -9,7 +9,7 @@ mixder - 0.7.5 + 1.0
+ + + + + +
+
+
+ +
+

Allele frequencies for 10,039 SNPs

+
+ +
+

Usage

+
popFreq_AFR
+
+ +
+

Format

+

A list containing 10,039 elements (SNPs), each with 4 rows:

SNP
+

SNP rsID

+ +
Allele
+

Allele (A/C/G/T)

+ +
Probability
+

Allele Probability

+ + +
+
+

Source

+

1000 Genomes (https://www.internationalgenome.org/data)

+
+ +
+ + +
+ + + +
+ + + + + + + diff --git a/docs/reference/popFreq_AMR.html b/docs/reference/popFreq_AMR.html new file mode 100644 index 0000000..50ee546 --- /dev/null +++ b/docs/reference/popFreq_AMR.html @@ -0,0 +1,91 @@ + +Allele Frequency file using 1000G Phase 3 dataset for all AMR individuals — popFreq_AMR • mixder + Skip to contents + + +
+
+
+ +
+

Allele frequencies for 10,039 SNPs

+
+ +
+

Usage

+
popFreq_AMR
+
+ +
+

Format

+

A list containing 10,039 elements (SNPs), each with 4 rows:

SNP
+

SNP rsID

+ +
Allele
+

Allele (A/C/G/T)

+ +
Probability
+

Allele Probability

+ + +
+
+

Source

+

1000 Genomes (https://www.internationalgenome.org/data)

+
+ +
+ + +
+ + + +
+ + + + + + + diff --git a/docs/reference/popFreq_EAS.html b/docs/reference/popFreq_EAS.html new file mode 100644 index 0000000..367ddda --- /dev/null +++ b/docs/reference/popFreq_EAS.html @@ -0,0 +1,91 @@ + +Allele Frequency file using 1000G Phase 3 dataset for all EAS individuals — popFreq_EAS • mixder + Skip to contents + + +
+
+
+ +
+

Allele frequencies for 10,039 SNPs

+
+ +
+

Usage

+
popFreq_EAS
+
+ +
+

Format

+

A list containing 10,039 elements (SNPs), each with 4 rows:

SNP
+

SNP rsID

+ +
Allele
+

Allele (A/C/G/T)

+ +
Probability
+

Allele Probability

+ + +
+
+

Source

+

1000 Genomes (https://www.internationalgenome.org/data)

+
+ +
+ + +
+ + + +
+ + + + + + + diff --git a/docs/reference/popFreq_EUR.html b/docs/reference/popFreq_EUR.html new file mode 100644 index 0000000..0a39da9 --- /dev/null +++ b/docs/reference/popFreq_EUR.html @@ -0,0 +1,91 @@ + +Allele Frequency file using 1000G Phase 3 dataset for all EUR individuals — popFreq_EUR • mixder + Skip to contents + + +
+
+
+ +
+

Allele frequencies for 10,039 SNPs

+
+ +
+

Usage

+
popFreq_EUR
+
+ +
+

Format

+

A list containing 10,039 elements (SNPs), each with 4 rows:

SNP
+

SNP rsID

+ +
Allele
+

Allele (A/C/G/T)

+ +
Probability
+

Allele Probability

+ + +
+
+

Source

+

1000 Genomes (https://www.internationalgenome.org/data)

+
+ +
+ + +
+ + + +
+ + + + + + + diff --git a/docs/reference/popFreq_SAS.html b/docs/reference/popFreq_SAS.html new file mode 100644 index 0000000..9ec96c9 --- /dev/null +++ b/docs/reference/popFreq_SAS.html @@ -0,0 +1,91 @@ + +Allele Frequency file using 1000G Phase 3 dataset for all SAS individuals — popFreq_SAS • mixder + Skip to contents + + +
+
+
+ +
+

Allele frequencies for 10,039 SNPs

+
+ +
+

Usage

+
popFreq_SAS
+
+ +
+

Format

+

A list containing 10,039 elements (SNPs), each with 4 rows:

SNP
+

SNP rsID

+ +
Allele
+

Allele (A/C/G/T)

+ +
Probability
+

Allele Probability

+ + +
+
+

Source

+

1000 Genomes (https://www.internationalgenome.org/data)

+
+ +
+ + +
+ + + +
+ + + + + + + diff --git a/docs/reference/popFreq_gnomad.html b/docs/reference/popFreq_gnomad.html index 9160585..664d7fc 100644 --- a/docs/reference/popFreq_gnomad.html +++ b/docs/reference/popFreq_gnomad.html @@ -7,7 +7,7 @@ mixder - 0.7.5 + 1.0