Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
9a348c2
Add ancestry prediction options/text to shiny app interface
rnmitchell Jan 13, 2025
8caf6bf
Building in 1000 G genotype data for ancestry SNPs
rnmitchell Jan 13, 2025
57c1664
began adding ancestry prediction to run_workflow script
rnmitchell Jan 13, 2025
5dee69a
began integrating ancestry prediction
rnmitchell Jan 14, 2025
6aa6cf1
update run_workflow
rnmitchell Jan 14, 2025
db3847c
update .gitignore
rnmitchell Jan 14, 2025
15ebe59
ancestry prediction running correctly for unconditioned analyses
rnmitchell Jan 15, 2025
8f0e653
merge master
rnmitchell Jan 15, 2025
5508300
ancestry prediction for conditioned analyses
rnmitchell Jan 15, 2025
2e0bca8
fix PCA plot title, add ancestry prediction step to config file settings
rnmitchell Jan 15, 2025
5d16bfd
update test
rnmitchell Jan 15, 2025
f48c1bc
testing ancestry pred with all snps
rnmitchell Feb 5, 2025
7fdda0a
3D PCA plots
rnmitchell Mar 21, 2025
355debc
merge main
rnmitchell Mar 21, 2025
2f1e3e7
merge main
rnmitchell Mar 21, 2025
8be5bf6
updated with 3D plotting
rnmitchell Mar 25, 2025
418a8c4
merge main
rnmitchell Mar 25, 2025
3ed7429
option to use either ancestry SNPs or all SNPs for PCA
rnmitchell Apr 7, 2025
53d088f
updated shiny app for multiple features with PCA plots
rnmitchell Apr 11, 2025
a4ac85b
including necessary data
rnmitchell Apr 11, 2025
a1d6246
data.R updated with included data in package
rnmitchell Apr 11, 2025
625332e
updated description/news with new version #
rnmitchell Apr 11, 2025
705b0ec
centroid analysis
rnmitchell Apr 11, 2025
9c38c51
fixed bug with loading AF
rnmitchell Apr 16, 2025
2d170d9
added superpopulation AF datasets
rnmitchell Apr 16, 2025
41c2ca0
added line breaks to pop up messages
rnmitchell Apr 18, 2025
f367f6e
removed unnecessary data; cleaned up scripts
rnmitchell May 5, 2025
a845c4e
begin adding tests for ancestry
rnmitchell Jun 5, 2025
4f093b8
added tests
rnmitchell Jun 5, 2025
a8e1c58
updated config
rnmitchell Jun 5, 2025
0ff2530
updated with test
rnmitchell Jul 18, 2025
12998e7
readthedocs
rnmitchell Jul 30, 2025
b9cbe33
updated readme
rnmitchell Jul 30, 2025
3e61cf1
remove readthedocs
rnmitchell Oct 3, 2025
e953da1
merge main
rnmitchell Oct 3, 2025
d2fc98c
updated docs
rnmitchell Oct 3, 2025
d948b6f
removed hard coded path
rnmitchell Oct 3, 2025
b95d031
update scripts to pass checks
rnmitchell Oct 3, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
^.*\.Rproj$
^\.Rproj\.user$
^README\.Rmd$
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@
.Ruserdata
.DS_Store
inst/doc
.RDataTmp
docs/_build/html/.buildinfo
.github
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
Package: mixder
Type: Package
Title: A workflow for performing SNP mixture deconvolution
Version: 0.7.5
Version: 1.0
Author: Rebecca Mitchell <rebecca.mitchell@st.dhs.gov>
Maintainer: Rebecca Mitchell <rebecca.mitchell@st.dhs.gov>
Description: A workflow for performing SNP mixture deconvolution of ForenSeq Kintelligence SNPs. Mixture deconvolution is performed using EuroForMix (https://github.com/oyvble/euroformix/). After mixture deconvolution, the user can choose to calculate metrics such as genotype accuracy and heterozygosity for a range of allele probability thresholds (useful for validation work) or create GEDmatch PRO reports utilizing specified allele probability thresholds.
License: file LICENSE
Imports: dplyr, euroformix, ggplot2, glue, methods, prompter, readxl, rlang, shiny, shinyFiles, shinyjs, tibble, tidyr
Imports: dplyr, euroformix, ggplot2, glue, kgp, methods, plotly, prompter, readxl, rlang, shiny, shinyFiles, shinyjs, tibble, tidyr
RoxygenNote: 7.3.2
Encoding: UTF-8
Depends:
R (>= 2.10)
R (>= 3.5)
LazyData: true
Suggests:
knitr,
Expand Down
7 changes: 6 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
# Generated by roxygen2: do not edit by hand

export(ancestry_prediction)
export(assigned_A2)
export(calc_metrics)
export(calculate_at)
export(centroids)
export(check_3_or_4_col)
export(check_allele_calls)
export(check_allele_probabilities)
Expand Down Expand Up @@ -37,8 +39,11 @@ export(run_indiv_efm_set)
export(run_workflow)
export(write_tables)
import(dplyr)
import(ggplot2)
import(ggplot2, except = last_plot)
import(glue)
import(kgp)
import(parallel)
import(plotly)
import(prompter)
import(shiny)
import(shinyFiles)
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@

## Version 1.0: October 3, 2025
- Implemented ancestry prediction tool
- Included additional population-specific allele frequency data in the package.

## Version 0.7.5: September 24, 2025
- Updated documentation

Expand Down
106 changes: 106 additions & 0 deletions R/ancestry_prediction.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# -------------------------------------------------------------------------------------------------
# Copyright (c) 2024, DHS.
#
# This file is part of MixDeR and is licensed under the BSD license: see LICENSE.
#
# This software was prepared for the Department of Homeland Security (DHS) by the Battelle National
# Biodefense Institute, LLC (BNBI) as part of contract HSHQDC-15-C-00064 to manage and operate the
# National Biodefense Analysis and Countermeasures Center (NBACC), a Federally Funded Research and
# Development Center.
# -------------------------------------------------------------------------------------------------

#' Ancestry prediction using PCA
#'
#' Appends the inferred genotypes of an unknown sample to the 1000G reference
#' genotypes bundled with the package, runs a PCA on the SNPs that have no
#' missing values, passes the first ten principal components to
#' [centroids()], and writes interactive 3D PCA plots (PC1-PC3) as HTML files
#' into `<path>/PCA_plots`.
#'
#' @param report inferred genotypes; a data frame containing the columns
#'   `rsid`, `Allele1` and `Allele2`
#' @param path write path
#' @param id sample ID
#' @param analysis_type mixture deconvolution type (conditioned vs. unconditioned)
#' @param contrib_status contributor label; used in plot titles and output
#'   file names
#' @param testsnps which SNP set to use for the PCA: `"All Autosomal SNPs"`
#'   selects the all-SNP reference panel, any other value selects the
#'   ancestry-SNP-only reference panel
#' @param groups How to color PCA plots (superpopulations and/or subpopulations)
#'
#' @import kgp
#' @import plotly
#'
#' @return `NULL`, invisibly. Called for its side effects (files written to
#'   `<path>/PCA_plots`).
#' @export
#'
ancestry_prediction <- function(report, path, id, analysis_type, contrib_status, testsnps, groups) {
  missing_cols <- setdiff(c("rsid", "Allele1", "Allele2"), colnames(report))
  if (length(missing_cols) > 0) {
    stop(
      "`report` is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  if (testsnps == "All Autosomal SNPs") {
    plotid <- "AllSNPs"
    geno <- mixder::ancestry_1000G_allsamples
  } else {
    plotid <- "AncestrySNPsOnly"
    geno <- mixder::ancestrysnps_1000G_allsamples
  }

  ## genotype columns start at column 7 of the reference table
  ncols <- ncol(geno)
  geno_filt <- geno[, c(7:ncols)]

  ## reference column names are "<rsid>_<allele>"; split them so the report
  ## can be matched on rsid, and remember the reference column order
  snps <- data.frame("snp_id" = colnames(geno_filt))
  snps <- snps %>%
    separate("snp_id", c("rsid", "ref_allele"), remove = FALSE)
  snps$order <- seq_len(nrow(snps))

  merged_alleles <- merge(snps, report, by = "rsid", all.x = TRUE) %>%
    arrange(.data$order)

  ## count alleles: copies (0/1/2) of the allele named in the reference column.
  ## SNPs absent from the report give NA and are dropped before the PCA.
  merged_alleles$num_alt <-
    as.numeric(merged_alleles$Allele1 == merged_alleles$ref_allele) +
    as.numeric(merged_alleles$Allele2 == merged_alleles$ref_allele)

  ## re-format to match 1000G samples (one row, one column per SNP)
  formatted_sample <- merged_alleles %>%
    select("snp_id", "num_alt") %>%
    pivot_wider(names_from = "snp_id", values_from = "num_alt")

  ## add unknown to 1000G genotypes (unknown sample is the last row)
  geno_filt_unk <- rbind(geno_filt, formatted_sample)

  message("Running PCA<br/>")
  ## remove any SNPs with NA values (in unknown sample)
  complete_snps <- !vapply(geno_filt_unk, anyNA, logical(1))
  pca_input <- geno_filt_unk[, complete_snps, drop = FALSE]

  ## perform PCA
  pca_fit <- stats::prcomp(pca_input, center = TRUE, scale. = FALSE)

  ## create data table of PCs
  PCs <- data.frame(pca_fit$x)

  ## add unknown to ancestry and genotype IDs
  geno_unk <- geno %>%
    add_row(IID = "Unk")

  ## merge genotypes with ancestry info. merge() sorts its result by the key,
  ## so the rows are re-aligned to the pre-merge order afterwards; that order
  ## is the row order of the PCA input.
  geno_ancestry <- merge(geno_unk, mixder::ancestry_colors, by.x = "IID", by.y = "id")
  row_idx <- match(geno_unk$IID, geno_ancestry$IID)
  if (anyNA(row_idx) || nrow(geno_ancestry) != nrow(geno_unk)) {
    stop(
      "Each reference sample and the 'Unk' sample must have exactly one ",
      "entry in the ancestry color table.",
      call. = FALSE
    )
  }
  geno_ancestry <- geno_ancestry[row_idx, , drop = FALSE]
  rownames(geno_ancestry) <- NULL

  ## add ancestry info to PC data: the four columns that follow the genotype
  ## columns in the merged table, plus the first ten PCs
  newcol <- ncols + 1
  newcol2 <- ncols + 4
  PCs_anc <- cbind(geno_ancestry[, c(newcol:newcol2)], data.frame(PCs[, c(1:10)]))

  ## create the output directory before anything is asked to write into it
  dir.create(file.path(path, "PCA_plots"), showWarnings = FALSE, recursive = TRUE)

  centroids(groups, PCs_anc, glue("{path}/PCA_plots"), glue("{id}_{contrib_status}_{analysis_type}_{plotid}"))

  ## named color vector built from distinct (label, color) pairs so that each
  ## label keeps its own color even if two labels share one
  named_palette <- function(labels, colors) {
    pairs <- unique(data.frame(label = labels, color = colors, stringsAsFactors = FALSE))
    stats::setNames(pairs$color, pairs$label)
  }

  ## build one 3D scatter of PC1-PC3 and save it as a standalone HTML widget
  save_pca_plot <- function(color_by, palette, group_label, file_label) {
    fig <- plot_ly(PCs_anc, x = ~PC1, y = ~PC2, z = ~PC3, color = color_by, colors = palette, size = 10)
    fig <- fig %>% add_markers()
    fig <- fig %>% layout(
      scene = list(
        xaxis = list(title = "PC1"),
        yaxis = list(title = "PC2"),
        zaxis = list(title = "PC3")
      ),
      title = list(text = glue("{ncol(pca_input)} SNPs; {id} {contrib_status} {analysis_type} {group_label}"))
    )
    htmlwidgets::saveWidget(
      as_widget(fig),
      glue("{path}/PCA_plots/{id}_{contrib_status}_{analysis_type}_{plotid}_{file_label}_3D_PCAPlot.html")
    )
  }

  if ("Superpopulations (AFR/AMR/EAS/EUR/SAS Only)" %in% groups) {
    save_pca_plot(
      ~reg,
      named_palette(geno_ancestry$reg, geno_ancestry$superpop_color),
      "Superpopulations",
      "superpop"
    )
  }
  if ("Subpopulations" %in% groups) {
    save_pca_plot(
      ~population,
      named_palette(geno_ancestry$population, geno_ancestry$color),
      "Subpopulations",
      "subpopulations"
    )
  }

  invisible(NULL)
}
Loading