From ce7c9ddd80ae37d8bd9e1b2d778d83a4fe3e3b3d Mon Sep 17 00:00:00 2001 From: Ben Rangihuna Date: Thu, 4 Mar 2021 11:07:42 +1300 Subject: [PATCH 01/23] should run ready for test --- ...{whole_pop_model_residuals.R => model_residuals_whole_pop.R} | 2 +- scripts/run_gcta_whole_pop.sh | 2 +- scripts/run_ldak_whole_pop.sh | 2 +- scripts/{whole_pop_pipline.sh => whole_pop_pipeline.sh} | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename scripts/{whole_pop_model_residuals.R => model_residuals_whole_pop.R} (98%) rename scripts/{whole_pop_pipline.sh => whole_pop_pipeline.sh} (100%) diff --git a/scripts/whole_pop_model_residuals.R b/scripts/model_residuals_whole_pop.R similarity index 98% rename from scripts/whole_pop_model_residuals.R rename to scripts/model_residuals_whole_pop.R index c93b043..e7237d8 100644 --- a/scripts/whole_pop_model_residuals.R +++ b/scripts/model_residuals_whole_pop.R @@ -125,7 +125,7 @@ new_dat %>% select(SUBJECT, !!trait) %>% # pull out ids and trait column slice(-unlist(model_remove)) %>% # remove rows that didn't have residuals bind_cols(model_residuals) %>% - write_delim(path = here("tmp",paste0(pop,"_",trait,".residuals.txt")), + write_delim(file = here("tmp",paste0(pop,"_",trait,".residuals.txt")), delim = " ", col_names = FALSE) diff --git a/scripts/run_gcta_whole_pop.sh b/scripts/run_gcta_whole_pop.sh index f6ac54a..c40fbc6 100644 --- a/scripts/run_gcta_whole_pop.sh +++ b/scripts/run_gcta_whole_pop.sh @@ -59,6 +59,6 @@ software/gcta64 --bfile tmp/$BFILE --blup-snp ${POP}_results/GCTA/${TRAIT}_${MOD -cat ${POP}_results/GCTA/${TRAIT}_${MODEL}.hsq |tr -s " " | tr " " "\t" > ${DIR}/GCTA/${TRAIT}_${MODEL}.hsq.tsv +cat ${POP}_results/GCTA/${TRAIT}_${MODEL}.hsq |tr -s " " | tr " " "\t" > ${POP}_results/GCTA/${TRAIT}_${MODEL}.hsq.tsv diff --git a/scripts/run_ldak_whole_pop.sh b/scripts/run_ldak_whole_pop.sh index 78b3455..77d0442 100644 --- a/scripts/run_ldak_whole_pop.sh +++ b/scripts/run_ldak_whole_pop.sh @@ -28,7 +28,7 @@ software/LDAK/ldak5.linux --reml ${POP}_results/LDAK/${TRAIT}_${MODEL} --pheno t ### blups -software/LDAK/ldak5.linux --calc-blups ${POP}_results/LDAK/${TRAIT}_${MODEL} --remlfile ${POP}_results/LDAK/${TRAIT}_${CV}_${MODEL}.reml --grm results/${POP}_ldak_kinships --bfile tmp/${BFILE} --check-root NO +software/LDAK/ldak5.linux --calc-blups ${POP}_results/LDAK/${TRAIT}_${MODEL} --remlfile ${POP}_results/LDAK/${TRAIT}_${MODEL}.reml --grm results/${POP}_ldak_kinships --bfile tmp/${BFILE} --check-root NO diff --git a/scripts/whole_pop_pipline.sh b/scripts/whole_pop_pipeline.sh similarity index 100% rename from scripts/whole_pop_pipline.sh rename to scripts/whole_pop_pipeline.sh From bf9ca9463f9edd85b5108dc8ef2bbef718034c93 Mon Sep 17 00:00:00 2001 From: Ben Rangihuna Date: Thu, 4 Mar 2021 11:07:55 +1300 Subject: [PATCH 02/23] testing --- scripts/whole_pop_pipeline.sh | 49 ++++++++++++----------------------- 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/scripts/whole_pop_pipeline.sh b/scripts/whole_pop_pipeline.sh index e7826d7..60a4485 100644 --- a/scripts/whole_pop_pipeline.sh +++ b/scripts/whole_pop_pipeline.sh @@ -23,44 +23,31 @@ date #### Initial setup ##### -mkdir -p tmp ${RESULTS}/{GCTA,LDAK} +mkdir -p tmp ${RESULTS}/ -software/plink1.9b6.10 --bfile ${ORIG_DATA} --make-bed --out tmp/data_sorted + software/plink1.9b6.10 --bfile ${ORIG_DATA} --make-bed --out tmp/data_sorted # create lise of independent SNPs from data -software/plink1.9b6.10 --bfile tmp/data_sorted --indep-pairwise 50 5 0.2 --maf 0.1 --out tmp/data_pca_markers -## sort and keep our population - - -mkdir -p tmp + software/plink1.9b6.10 --bfile tmp/data_sorted --indep-pairwise 50 5 0.2 --maf 0.1 --out tmp/data_pca_markers +## sort and keep our population software/plink1.9b6.10 --bfile tmp/data_sorted --keep data/${POP}.keep --maf 0.01 --geno 0.05 --make-bed --out tmp/${POP}_sorteddata - - - - - ######################## GCTA #grm # only run this line once per population to save time - software/gcta64 --bfile tmp/${POP}_sorteddata --autosome --make-grm --out ${RESULTS}/${POP}_gcta_grm --threads 4 - - - ######################## PCAs ### calculate principal components for the population using the independent markers - -software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --pca 10 --out ${RESULTS}/${POP}_pcafile --extract data/data_pca_markers.prune.in + software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --pca 10 --out ${RESULTS}/${POP}_pcafile --extract data/data_pca_markers.prune.in @@ -68,29 +55,27 @@ software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --pca 10 --out ${RESULTS}/$ ####################### LDAK ### calculate the weights and kinships for LDAK prior to GRM - -software/LDAK/ldak5.linux --cut-weights $DIR/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata - + software/LDAK/ldak5.linux --cut-weights ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata + software/LDAK/ldak5.linux --calc-weights-all ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata -software/LDAK/ldak5.linux --calc-weights-all $DIR/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata - - - - -software/LDAK/ldak5.linux --calc-kins-direct $DIR/${POP}_ldak_kinships --bfile tmp/${POP}_sorteddata --weights $DIR/${POP}_ldak_sections/weights.short --power -0.25 +software/LDAK/ldak5.linux --calc-kins-direct ${RESULTS}/${POP}_ldak_kinships --bfile tmp/${POP}_sorteddata --weights ${RESULTS}/${POP}_ldak_sections/weights.short --power -0.25 +# example line to use for testing one iteration +#line="HEIGHT,linear", while read line do TRAIT=$(echo $line | cut -d',' -f1) REGRESSION_TYPE=$(echo $line | cut -d',' -f2) - + mkdir -p ${RESULTS}/{GCTA,LDAK} ##### Generate regression residuals for each model ##### # makes tmp/${POP}_${TRAIT}_residuals.txt - bash scripts/generate_models_whole_pop.sh ${POP} ${TRAIT} ${REGRESSION_TYPE} + + #bash scripts/generate_models_whole_pop.sh ${POP} ${TRAIT} ${REGRESSION_TYPE} + Rscript scripts/model_residuals_whole_pop.R --trait ${TRAIT} --pop ${POP} --regression ${REGRESSION_TYPE} --out_dir ${RESULTS}/${TRAIT} ##### Create phone files with the residuals ### prune and sort the data @@ -101,12 +86,12 @@ do # columns 8-12 are the FIVE residuals # height, egfr, serumurate, diabetes and gout in that order # need to join based on number fo models 2.2 - 2.8 was for 7 models - software/plink1.9b6.10 --bfile data/data --keep tmp/${POP}.keep --make-bed --out tmp/${POP}_${TRAIT} + software/plink1.9b6.10 --bfile data/data --keep data/${POP}.keep --make-bed --out tmp/${POP}_${TRAIT} # this should join using IID from fam and IID from the pheno (in pheno first and second cols are duplicated) includes entire fam in output so need to adjust accordingly for model numbers (-6 cols) - join -1 2 -2 1 -o auto -e "NA" <(sort -k1 tmp/${POP}.fam) <(sort -k1 tmp/${POP}_${TRAIT}_residuals.txt | cut -f2- ) > tmp/${POP}_${TRAIT}.pheno + join -1 2 -2 1 -o auto -e "NA" <(sort -k1 tmp/${POP}_${TRAIT}.fam) <(sort -k1 tmp/${POP}_${TRAIT}.residuals.txt | cut -f2- ) > tmp/${POP}_${TRAIT}.pheno # calculate the column indexes, offset by 2 (to account for FID and IID) from all of the residuals -> should be equal to $(seq 1 number_of_models) - pheno_cols=$( head -1 tmp/${POP}_${TRAIT}_residuals.txt | awk '{ for(i=1;i<=NF-2;i++){print i}}' | sort -u) + pheno_cols=$( head -1 tmp/${POP}_${TRAIT}.residuals.txt | awk '{ for(i=1;i<=NF-2;i++){print i}}' | sort -u) #### GCTA REML for each model of current trait From df7a669e33caa9687f36d21e08eed1dcf2b529d6 Mon Sep 17 00:00:00 2001 From: Ben Rangihuna Date: Tue, 9 Mar 2021 10:40:37 +1300 Subject: [PATCH 03/23] more timestamps --- scripts/whole_pop_pipeline.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/whole_pop_pipeline.sh b/scripts/whole_pop_pipeline.sh index 60a4485..3db87a8 100644 --- a/scripts/whole_pop_pipeline.sh +++ b/scripts/whole_pop_pipeline.sh @@ -54,6 +54,7 @@ software/gcta64 --bfile tmp/${POP}_sorteddata --autosome --make-grm --out ${RESU ####################### LDAK +echo "Start LDAK $(date)" ### calculate the weights and kinships for LDAK prior to GRM software/LDAK/ldak5.linux --cut-weights ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata @@ -63,13 +64,15 @@ software/gcta64 --bfile tmp/${POP}_sorteddata --autosome --make-grm --out ${RESU software/LDAK/ldak5.linux --calc-kins-direct ${RESULTS}/${POP}_ldak_kinships --bfile tmp/${POP}_sorteddata --weights ${RESULTS}/${POP}_ldak_sections/weights.short --power -0.25 - +echo "Start traits $(date)" # example line to use for testing one iteration #line="HEIGHT,linear", while read line do TRAIT=$(echo $line | cut -d',' -f1) REGRESSION_TYPE=$(echo $line | cut -d',' -f2) + + echo "Start ${TRAIT} $(date)" mkdir -p ${RESULTS}/{GCTA,LDAK} ##### Generate regression residuals for each model ##### # makes tmp/${POP}_${TRAIT}_residuals.txt From ec3ce136011adf4115ce16b220928c9429a16afd Mon Sep 17 00:00:00 2001 From: Ben Rangihuna Date: Tue, 9 Mar 2021 11:10:56 +1300 Subject: [PATCH 04/23] comment out testing code that was causing issues --- scripts/model_residuals_whole_pop.R | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/scripts/model_residuals_whole_pop.R b/scripts/model_residuals_whole_pop.R index e7237d8..67a4b01 100644 --- a/scripts/model_residuals_whole_pop.R +++ b/scripts/model_residuals_whole_pop.R @@ -58,10 +58,12 @@ if(is.null(opt$pop)){ pop <- opt$pop } -pop <- 'nph' -model_type <- "linear" -trait <- "HEIGHT" -opt <-list(out_dir = "nph_results/") + +## Used for testing inside of an interactive session +#pop <- 'nph' +#model_type <- "linear" +#trait <- "HEIGHT" +#opt <-list(out_dir = "nph_results/") all_dat <- read_csv(file = here("data/curated_data.csv"), col_names = TRUE) %>% mutate(GOUT01recode = GOUT -1, T2D01recode = T2D -1, logBMI= log(BMI)) newpca <- read_delim(file = here("results",paste0(pop,"_","pcafile.eigenvec")), delim = " ", col_names = c("FID","SUBJECT",paste0("PC",1:10)), col_types = paste0(c("c","c",rep("d", 10)),collapse = "")) @@ -130,7 +132,7 @@ new_dat %>% col_names = FALSE) - +message(paste("residuals file:", here("tmp",paste0(pop,"_",trait,".residuals.txt")))) From e557c32d4d4afc148c5839645e18ef672bec3806 Mon Sep 17 00:00:00 2001 From: Ben Rangihuna Date: Tue, 1 Jun 2021 11:33:39 +1200 Subject: [PATCH 05/23] calc pca for individual pops --- scripts/model_residuals_whole_pop.R | 6 +++--- scripts/run_gcta_whole_pop.sh | 8 ++++---- scripts/run_ldak_whole_pop.sh | 10 +++++----- scripts/whole_pop_pipeline.sh | 21 +++++++++++---------- 4 files changed, 23 insertions(+), 22 deletions(-) diff --git a/scripts/model_residuals_whole_pop.R b/scripts/model_residuals_whole_pop.R index 67a4b01..3c383ab 100644 --- a/scripts/model_residuals_whole_pop.R +++ b/scripts/model_residuals_whole_pop.R @@ -66,7 +66,7 @@ if(is.null(opt$pop)){ #opt <-list(out_dir = "nph_results/") all_dat <- read_csv(file = here("data/curated_data.csv"), col_names = TRUE) %>% mutate(GOUT01recode = GOUT -1, T2D01recode = T2D -1, logBMI= log(BMI)) -newpca <- read_delim(file = here("results",paste0(pop,"_","pcafile.eigenvec")), delim = " ", col_names = c("FID","SUBJECT",paste0("PC",1:10)), col_types = paste0(c("c","c",rep("d", 10)),collapse = "")) +newpca <- read_delim(file = here(paste0(pop,"_results"),paste0(pop,"_pcafile.eigenvec")), delim = " ", col_names = c("FID","SUBJECT",paste0("PC",1:10)), col_types = paste0(c("c","c",rep("d", 10)),collapse = "")) if (!trait %in% names(all_dat)){ stop("Trait argument supplied does not match a column in the phenotypes.", call.=FALSE) @@ -127,12 +127,12 @@ new_dat %>% select(SUBJECT, !!trait) %>% # pull out ids and trait column slice(-unlist(model_remove)) %>% # remove rows that didn't have residuals bind_cols(model_residuals) %>% - write_delim(file = here("tmp",paste0(pop,"_",trait,".residuals.txt")), + write_delim(file = here(opt$out_dir,paste0(pop,"_",trait,".residuals.txt")), delim = " ", col_names = FALSE) -message(paste("residuals file:", here("tmp",paste0(pop,"_",trait,".residuals.txt")))) +message(paste("residuals file:", here(opt$out_dir,paste0(pop,"_",trait,".residuals.txt")))) diff --git a/scripts/run_gcta_whole_pop.sh b/scripts/run_gcta_whole_pop.sh index c40fbc6..44e069e 100644 --- a/scripts/run_gcta_whole_pop.sh +++ b/scripts/run_gcta_whole_pop.sh @@ -6,7 +6,7 @@ BFILE=$(basename $1 .fam) POP=$(echo $BFILE | cut -d'_' -f1) - +RESULTS=${POP}_results TRAIT=$(echo $BFILE | cut -d'_' -f2) @@ -45,7 +45,7 @@ echo "****** $REGRESSION ******" # $(seq 1 to how many trait/residual columns -software/gcta64 --reml --reml-pred-rand --grm ${POP}_results/${POP}_gcta_grm --pheno tmp/${BFILE}.pheno --mpheno ${PHENOCOL} --out ${POP}_results/GCTA/${TRAIT}_${MODEL} --threads 4 +software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno ${RESULTS}/${BFILE}.pheno --mpheno ${PHENOCOL} --out ${RESULTS}/GCTA/${TRAIT}_${MODEL} --threads 4 @@ -55,10 +55,10 @@ software/gcta64 --reml --reml-pred-rand --grm ${POP}_results/${POP}_gcta_grm --p #software/plink1.9b6.10 --bfile data/data --keep tmp/$BFILE --maf 0.01 --geno 0.05 --make-bed --out tmp/$BFILE -software/gcta64 --bfile tmp/$BFILE --blup-snp ${POP}_results/GCTA/${TRAIT}_${MODEL}.indi.blp --autosome --out ${POP}_results/GCTA/${TRAIT}_${MODEL} --threads 4 +software/gcta64 --bfile tmp/$BFILE --blup-snp ${RESULTS}/GCTA/${TRAIT}_${MODEL}.indi.blp --autosome --out ${RESULTS}/GCTA/${TRAIT}_${MODEL} --threads 4 -cat ${POP}_results/GCTA/${TRAIT}_${MODEL}.hsq |tr -s " " | tr " " "\t" > ${POP}_results/GCTA/${TRAIT}_${MODEL}.hsq.tsv +cat ${RESULTS}/GCTA/${TRAIT}_${MODEL}.hsq |tr -s " " | tr " " "\t" > ${RESULTS}/GCTA/${TRAIT}_${MODEL}.hsq.tsv diff --git a/scripts/run_ldak_whole_pop.sh b/scripts/run_ldak_whole_pop.sh index 77d0442..799a487 100644 --- a/scripts/run_ldak_whole_pop.sh +++ b/scripts/run_ldak_whole_pop.sh @@ -6,7 +6,7 @@ BFILE=$(basename $1 .fam) POP=$(echo $BFILE | cut -d'_' -f1) - +RESULTS=${POP}_results TRAIT=$(echo $BFILE | cut -d'_' -f2) @@ -21,14 +21,14 @@ PHENOCOL=$(echo $MODEL + 4 | bc) # ignore the first 4 columns after FID and IID ### reml -software/LDAK/ldak5.linux --reml ${POP}_results/LDAK/${TRAIT}_${MODEL} --pheno tmp/${BFILE}.pheno --mpheno ${PHENOCOL} --grm ${POP}_results/${POP}_ldak_kinships +software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}_${MODEL} --pheno ${RESULTS}/${BFILE}.pheno --mpheno ${PHENOCOL} --grm ${RESULTS}/${POP}_ldak_kinships ### blups -software/LDAK/ldak5.linux --calc-blups ${POP}_results/LDAK/${TRAIT}_${MODEL} --remlfile ${POP}_results/LDAK/${TRAIT}_${MODEL}.reml --grm results/${POP}_ldak_kinships --bfile tmp/${BFILE} --check-root NO +software/LDAK/ldak5.linux --calc-blups ${RESULTS}/LDAK/${TRAIT}_${MODEL} --remlfile ${RESULTS}/LDAK/${TRAIT}_${MODEL}.reml --grm ${RESULTS}/${POP}_ldak_kinships --bfile tmp/${BFILE} --check-root NO @@ -38,8 +38,8 @@ software/LDAK/ldak5.linux --calc-blups ${POP}_results/LDAK/${TRAIT}_${MODEL} --r ## make better results files -grep "^Her\|^Com" ${POP}_results/LDAK/${TRAIT}_${MODEL}.reml > ${POP}_results/LDAK/${TRAIT}_${MODEL}.h2 +grep "^Her\|^Com" ${RESULTS}/LDAK/${TRAIT}_${MODEL}.reml > ${RESULTS}/LDAK/${TRAIT}_${MODEL}.h2 -grep "^LRT_P" ${POP}_results/LDAK/${TRAIT}_${MODEL}.reml > ${POP}_results/LDAK/${TRAIT}_${MODEL}.p +grep "^LRT_P" ${RESULTS}/LDAK/${TRAIT}_${MODEL}.reml > ${RESULTS}/LDAK/${TRAIT}_${MODEL}.p diff --git a/scripts/whole_pop_pipeline.sh b/scripts/whole_pop_pipeline.sh index 3db87a8..8a1f43b 100644 --- a/scripts/whole_pop_pipeline.sh +++ b/scripts/whole_pop_pipeline.sh @@ -25,18 +25,18 @@ date #### Initial setup ##### mkdir -p tmp ${RESULTS}/ - software/plink1.9b6.10 --bfile ${ORIG_DATA} --make-bed --out tmp/data_sorted -# create lise of independent SNPs from data +software/plink1.9b6.10 --bfile ${ORIG_DATA} --make-bed --out tmp/data_sorted - software/plink1.9b6.10 --bfile tmp/data_sorted --indep-pairwise 50 5 0.2 --maf 0.1 --out tmp/data_pca_markers ## sort and keep our population software/plink1.9b6.10 --bfile tmp/data_sorted --keep data/${POP}.keep --maf 0.01 --geno 0.05 --make-bed --out tmp/${POP}_sorteddata -######################## GCTA +# create list of population specific independent SNPs from data +software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --indep-pairwise 50 5 0.2 --maf 0.1 --out ${RESULTS}/${POP}_pca_markers +######################## GCTA #grm # only run this line once per population to save time @@ -47,7 +47,8 @@ software/gcta64 --bfile tmp/${POP}_sorteddata --autosome --make-grm --out ${RESU ######################## PCAs ### calculate principal components for the population using the independent markers - software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --pca 10 --out ${RESULTS}/${POP}_pcafile --extract data/data_pca_markers.prune.in +software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --pca 10 --out ${RESULTS}/${POP}_pcafile --extract ${RESULTS}/${POP}_pca_markers.prune.in --allow-no-sex + @@ -56,10 +57,10 @@ software/gcta64 --bfile tmp/${POP}_sorteddata --autosome --make-grm --out ${RESU echo "Start LDAK $(date)" ### calculate the weights and kinships for LDAK prior to GRM - software/LDAK/ldak5.linux --cut-weights ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata +software/LDAK/ldak5.linux --cut-weights ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata - software/LDAK/ldak5.linux --calc-weights-all ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata +software/LDAK/ldak5.linux --calc-weights-all ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata software/LDAK/ldak5.linux --calc-kins-direct ${RESULTS}/${POP}_ldak_kinships --bfile tmp/${POP}_sorteddata --weights ${RESULTS}/${POP}_ldak_sections/weights.short --power -0.25 @@ -78,7 +79,7 @@ do # makes tmp/${POP}_${TRAIT}_residuals.txt #bash scripts/generate_models_whole_pop.sh ${POP} ${TRAIT} ${REGRESSION_TYPE} - Rscript scripts/model_residuals_whole_pop.R --trait ${TRAIT} --pop ${POP} --regression ${REGRESSION_TYPE} --out_dir ${RESULTS}/${TRAIT} + Rscript scripts/model_residuals_whole_pop.R --trait ${TRAIT} --pop ${POP} --regression ${REGRESSION_TYPE} --out_dir ${RESULTS}/ ##### Create phone files with the residuals ### prune and sort the data @@ -91,10 +92,10 @@ do # need to join based on number fo models 2.2 - 2.8 was for 7 models software/plink1.9b6.10 --bfile data/data --keep data/${POP}.keep --make-bed --out tmp/${POP}_${TRAIT} # this should join using IID from fam and IID from the pheno (in pheno first and second cols are duplicated) includes entire fam in output so need to adjust accordingly for model numbers (-6 cols) - join -1 2 -2 1 -o auto -e "NA" <(sort -k1 tmp/${POP}_${TRAIT}.fam) <(sort -k1 tmp/${POP}_${TRAIT}.residuals.txt | cut -f2- ) > tmp/${POP}_${TRAIT}.pheno + join -1 2 -2 1 -o auto -e "NA" <(sort -k1 tmp/${POP}_${TRAIT}.fam) <(sort -k1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt | cut -f2- ) > ${RESULTS}/${POP}_${TRAIT}.pheno # calculate the column indexes, offset by 2 (to account for FID and IID) from all of the residuals -> should be equal to $(seq 1 number_of_models) - pheno_cols=$( head -1 tmp/${POP}_${TRAIT}.residuals.txt | awk '{ for(i=1;i<=NF-2;i++){print i}}' | sort -u) + pheno_cols=$( head -1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt | awk '{ for(i=1;i<=NF-2;i++){print i}}' | sort -u) #### GCTA REML for each model of current trait From a6566ea6ce3fdc05f208415083688960368cd355 Mon Sep 17 00:00:00 2001 From: Ben Rangihuna Date: Tue, 1 Jun 2021 17:20:46 +1200 Subject: [PATCH 06/23] calc pca for individual pops --- .../scripts/model_residuals_whole_pop.R | 140 ++++++++++++++++++ tanya_data/scripts/run_gcta_whole_pop.sh | 64 ++++++++ tanya_data/scripts/run_ldak_whole_pop.sh | 45 ++++++ tanya_data/scripts/whole_pop_pipeline.sh | 116 +++++++++++++++ 4 files changed, 365 insertions(+) create mode 100644 tanya_data/scripts/model_residuals_whole_pop.R create mode 100644 tanya_data/scripts/run_gcta_whole_pop.sh create mode 100644 tanya_data/scripts/run_ldak_whole_pop.sh create mode 100644 tanya_data/scripts/whole_pop_pipeline.sh diff --git a/tanya_data/scripts/model_residuals_whole_pop.R b/tanya_data/scripts/model_residuals_whole_pop.R new file mode 100644 index 0000000..d186d45 --- /dev/null +++ b/tanya_data/scripts/model_residuals_whole_pop.R @@ -0,0 +1,140 @@ +if(!require(here)){ + install.packages("here", repos = "https://cloud.r-project.org") + library("here") +} +if(!require(fs)){ + install.packages("fs", repos = "https://cloud.r-project.org") + library(fs) +} +if(!require(tidyverse)){ + install.packages("tidyverse", repos = "https://cloud.r-project.org") + library(tidyverse) +} +if(!require(optparse)){ + install.packages("optparse", repos = "https://cloud.r-project.org") + library("optparse") +} + + +option_list = list( + make_option(c("--trait"), type="character", default=NULL, + help="Name of trait column that matches exactly from the phenotype file", metavar="character"), + make_option(c("-r", "--regression"), type="character", default="linear", + help="linear|logistic", metavar="character"), + make_option(c("-p","--pop"), type = "character", default=NULL, + help="Name of the population (no special characters except underscore).", metavar="character"), + make_option(c("--out_dir"), type = "character", default = "results", + help = "Name of the directory to store the results (no trailing slash needed).") + ); + +opt_parser = OptionParser(option_list=option_list); +opt = parse_args(opt_parser); + + +# name of trait column +#trait <- "HEIGHT" +if (is.null(opt$trait)){ + print_help(opt_parser) + stop("Trait argument must be supplied.", call.=FALSE) +} else { + trait <- opt$trait +} + + +# define model type: linear or logistic regression +#model_type <- "linear" +if(!(opt$regression == "linear" | opt$regression == "logistic")){ + print_help(opt_parser) + stop("Regression argument must be 'linear' or 'logistic'.", call.=FALSE) +} else { + model_type <- opt$regression +} + +# population name +if(is.null(opt$pop)){ + print_help(opt_parser) + stop("Population (pop) argument must be supplied.", call.=FALSE) +} else { + pop <- opt$pop +} + + +## Used for testing inside of an interactive session +#pop <- 'nph' +#model_type <- "linear" +#trait <- "HEIGHT" +#opt <-list(out_dir = "nph_results/") + +all_dat <- read_csv(file = here("data/tanya_data.csv"), col_names = TRUE)# %>% mutate(GOUT01recode = GOUT -1, T2D01recode = T2D -1, logBMI= log(BMI)) +newpca <- read_delim(file = here("results",paste0(pop,"_","pcafile.eigenvec")), delim = " ", col_names = c("FID","SUBJECT",paste0("PC",1:10)), col_types = paste0(c("c","c",rep("d", 10)),collapse = "")) + +if (!trait %in% names(all_dat)){ + stop("Trait argument supplied does not match a column in the phenotypes.", call.=FALSE) +} + +### add the trait information on to the samples in the PCA +new_dat <- newpca %>% left_join(., all_dat, by = "SUBJECT") + + + +# Define list of variables for the models #### +preds <- c("AGECOL", "SEX", paste0("PC",1:10)) + +# generate models stepwise adding variables +models <- list() +for(pred_idx in seq_along(preds)){ + if(pred_idx ==1){ + # create the base model + models <- paste(trait,"~",preds[[pred_idx]]) + } else{ + # add the next covariate to the base model + models[[pred_idx]] <- paste(models[[pred_idx-1]], preds[[pred_idx]], sep = " + ") + } +} + +# function to use for a given formula run the model as either linear or logistic regression +# runs the supplied model on the supplied dataset and returns the model results as a list(model, formula) +run_model <- function(model_formula, dat){ + model_results <- list() + if(model_type == "linear"){ + model_results[["model"]] <- lm(model_formula, data = dat, na.action = na.exclude) + } else if(model_type == "logistic"){ + model_results[["model"]] <- glm(model_formula, family = binomial(link='logit'), data = dat, na.action = na.exclude) + } + + model_results[["formula"]] <- model_formula + return(model_results) +} + +# test example +#run_model(model_formula = models[[1]], new_dat) + + + +# run the set of models on a the data +model_results <- purrr::map(models, run_model, new_dat) %>% set_names(., nm = paste0("model",seq_along(models))) +names(model_results) <- paste0("model",seq_along(models)) + + + +# put the residuals from all models into a dataframe for each cv +model_residuals <- map(model_results, list("model", "residuals")) +model_remove <- map(model_results, list("model","na.action")) + + +# write out residuals, columns: SUBJECT TRAIT model1 ... modelx +new_dat %>% + select(SUBJECT, !!trait) %>% # pull out ids and trait column + slice(-unlist(model_remove)) %>% # remove rows that didn't have residuals + bind_cols(model_residuals) %>% + write_delim(file = here("tmp",paste0(pop,"_",trait,".residuals.txt")), + delim = " ", + col_names = FALSE) + + +message(paste("residuals file:", here("tmp",paste0(pop,"_",trait,".residuals.txt")))) + + + +saveRDS(list(model_residuals, model_remove, trait, model_type, models, preds), file = here(opt$out_dir,paste0(pop,"_",trait,".RDS"))) + diff --git a/tanya_data/scripts/run_gcta_whole_pop.sh b/tanya_data/scripts/run_gcta_whole_pop.sh new file mode 100644 index 0000000..c40fbc6 --- /dev/null +++ b/tanya_data/scripts/run_gcta_whole_pop.sh @@ -0,0 +1,64 @@ +### run_gcta_whole_pop.sh + +## run: bash run_gcta_whole_pop.sh BFILE MODEL + +BFILE=$(basename $1 .fam) + + +POP=$(echo $BFILE | cut -d'_' -f1) + +TRAIT=$(echo $BFILE | cut -d'_' -f2) + + + +MODEL=$2 + +PHENOCOL=$(echo $MODEL + 4 | bc) + + + + + + + + + +echo "****** $REGRESSION ******" + + + + +######################## GCTA + +#grm only run this line once per population to save time + +# transfered to the phenoprep script + + + + +# software/gcta64 --bfile tmp/ --autosome --make-grm --out $DIR/GCTA/grm + + + +#reml + +# $(seq 1 to how many trait/residual columns + +software/gcta64 --reml --reml-pred-rand --grm ${POP}_results/${POP}_gcta_grm --pheno tmp/${BFILE}.pheno --mpheno ${PHENOCOL} --out ${POP}_results/GCTA/${TRAIT}_${MODEL} --threads 4 + + + + + +# BLUP solutions for the SNP effects + +#software/plink1.9b6.10 --bfile data/data --keep tmp/$BFILE --maf 0.01 --geno 0.05 --make-bed --out tmp/$BFILE + +software/gcta64 --bfile tmp/$BFILE --blup-snp ${POP}_results/GCTA/${TRAIT}_${MODEL}.indi.blp --autosome --out ${POP}_results/GCTA/${TRAIT}_${MODEL} --threads 4 + + + +cat ${POP}_results/GCTA/${TRAIT}_${MODEL}.hsq |tr -s " " | tr " " "\t" > ${POP}_results/GCTA/${TRAIT}_${MODEL}.hsq.tsv + + diff --git a/tanya_data/scripts/run_ldak_whole_pop.sh b/tanya_data/scripts/run_ldak_whole_pop.sh new file mode 100644 index 0000000..77d0442 --- /dev/null +++ b/tanya_data/scripts/run_ldak_whole_pop.sh @@ -0,0 +1,45 @@ +### run_ldak_whole_pop.sh + + + +BFILE=$(basename $1 .fam) + + +POP=$(echo $BFILE | cut -d'_' -f1) + +TRAIT=$(echo $BFILE | cut -d'_' -f2) + + + +MODEL=$2 + +PHENOCOL=$(echo $MODEL + 4 | bc) # ignore the first 4 columns after FID and IID + + + +####################### LDAK + +### reml + +software/LDAK/ldak5.linux --reml ${POP}_results/LDAK/${TRAIT}_${MODEL} --pheno tmp/${BFILE}.pheno --mpheno ${PHENOCOL} --grm ${POP}_results/${POP}_ldak_kinships + + + + +### blups + +software/LDAK/ldak5.linux --calc-blups ${POP}_results/LDAK/${TRAIT}_${MODEL} --remlfile ${POP}_results/LDAK/${TRAIT}_${MODEL}.reml --grm results/${POP}_ldak_kinships --bfile tmp/${BFILE} --check-root NO + + + + + + + +## make better results files + +grep "^Her\|^Com" ${POP}_results/LDAK/${TRAIT}_${MODEL}.reml > ${POP}_results/LDAK/${TRAIT}_${MODEL}.h2 + +grep "^LRT_P" ${POP}_results/LDAK/${TRAIT}_${MODEL}.reml > ${POP}_results/LDAK/${TRAIT}_${MODEL}.p + + diff --git a/tanya_data/scripts/whole_pop_pipeline.sh b/tanya_data/scripts/whole_pop_pipeline.sh new file mode 100644 index 0000000..3db87a8 --- /dev/null +++ b/tanya_data/scripts/whole_pop_pipeline.sh @@ -0,0 +1,116 @@ +### whole_pop_pipeline.sh + +## Run: + + +## bash scripts/whole_pop_pipeline.sh data/data {POP_NAME} + +## + + + + +ORIG_DATA=$1 +POP=$2 +RESULTS=${POP}_results/ + + + + +echo "START" + +date + + +#### Initial setup ##### +mkdir -p tmp ${RESULTS}/ + + software/plink1.9b6.10 --bfile ${ORIG_DATA} --make-bed --out tmp/data_sorted + +# create lise of independent SNPs from data + + software/plink1.9b6.10 --bfile tmp/data_sorted --indep-pairwise 50 5 0.2 --maf 0.1 --out tmp/data_pca_markers + +## sort and keep our population +software/plink1.9b6.10 --bfile tmp/data_sorted --keep data/${POP}.keep --maf 0.01 --geno 0.05 --make-bed --out tmp/${POP}_sorteddata + + +######################## GCTA + +#grm + +# only run this line once per population to save time +software/gcta64 --bfile tmp/${POP}_sorteddata --autosome --make-grm --out ${RESULTS}/${POP}_gcta_grm --threads 4 + + + +######################## PCAs + +### calculate principal components for the population using the independent markers + software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --pca 10 --out ${RESULTS}/${POP}_pcafile --extract data/data_pca_markers.prune.in + + + + +####################### LDAK + +echo "Start LDAK $(date)" +### calculate the weights and kinships for LDAK prior to GRM + software/LDAK/ldak5.linux --cut-weights ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata + + + software/LDAK/ldak5.linux --calc-weights-all ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata + + +software/LDAK/ldak5.linux --calc-kins-direct ${RESULTS}/${POP}_ldak_kinships --bfile tmp/${POP}_sorteddata --weights ${RESULTS}/${POP}_ldak_sections/weights.short --power -0.25 + +echo "Start traits $(date)" +# example line to use for testing one iteration +#line="HEIGHT,linear", +while read line +do + TRAIT=$(echo $line | cut -d',' -f1) + REGRESSION_TYPE=$(echo $line | cut -d',' -f2) + + echo "Start ${TRAIT} $(date)" + mkdir -p ${RESULTS}/{GCTA,LDAK} + ##### Generate regression residuals for each model ##### + # makes tmp/${POP}_${TRAIT}_residuals.txt + + #bash scripts/generate_models_whole_pop.sh ${POP} ${TRAIT} ${REGRESSION_TYPE} + Rscript scripts/model_residuals_whole_pop.R --trait ${TRAIT} --pop ${POP} --regression ${REGRESSION_TYPE} --out_dir ${RESULTS}/${TRAIT} + + ##### Create phone files with the residuals + ### prune and sort the data + # prune to only those with residuals + # residual are population specific with PCAs included in the model outside this script. + # combine residuals to fam file + # this step need to be repeated for each of the groups at bayesR section + # columns 8-12 are the FIVE residuals + # height, egfr, serumurate, diabetes and gout in that order + # need to join based on number fo models 2.2 - 2.8 was for 7 models + software/plink1.9b6.10 --bfile data/data --keep data/${POP}.keep --make-bed --out tmp/${POP}_${TRAIT} + # this should join using IID from fam and IID from the pheno (in pheno first and second cols are duplicated) includes entire fam in output so need to adjust accordingly for model numbers (-6 cols) + join -1 2 -2 1 -o auto -e "NA" <(sort -k1 tmp/${POP}_${TRAIT}.fam) <(sort -k1 tmp/${POP}_${TRAIT}.residuals.txt | cut -f2- ) > tmp/${POP}_${TRAIT}.pheno + + # calculate the column indexes, offset by 2 (to account for FID and IID) from all of the residuals -> should be equal to $(seq 1 number_of_models) + pheno_cols=$( head -1 tmp/${POP}_${TRAIT}.residuals.txt | awk '{ for(i=1;i<=NF-2;i++){print i}}' | sort -u) + + + #### GCTA REML for each model of current trait + # run the following with 16 parallel jobs + parallel -j 16 'nice -n 10 bash scripts/run_gcta_whole_pop.sh {1} {2}' ::: tmp/${POP}_${TRAIT}.fam ::: ${pheno_cols} + + #### LDAK for each model of current trait + parallel -j 16 'nice -n 10 bash scripts/run_ldak_whole_pop.sh {1} {2} ' ::: tmp/${POP}_${TRAIT}.fam ::: ${pheno_cols} + +done < data/pop_trait_models.csv + + + + +echo "FINISH" + +date + + From 39971250c427ede220eb88182464749186818d60 Mon Sep 17 00:00:00 2001 From: Ben Rangihuna Date: Fri, 4 Jun 2021 14:08:44 +1200 Subject: [PATCH 07/23] exclude missing data before making models --- scripts/model_residuals_whole_pop.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/model_residuals_whole_pop.R b/scripts/model_residuals_whole_pop.R index 3c383ab..ba19326 100644 --- a/scripts/model_residuals_whole_pop.R +++ b/scripts/model_residuals_whole_pop.R @@ -65,7 +65,7 @@ if(is.null(opt$pop)){ #trait <- "HEIGHT" #opt <-list(out_dir = "nph_results/") -all_dat <- read_csv(file = here("data/curated_data.csv"), col_names = TRUE) %>% mutate(GOUT01recode = GOUT -1, T2D01recode = T2D -1, logBMI= log(BMI)) +all_dat <- read_csv(file = here("data/tanya_data.csv"), col_names = TRUE) # %>% mutate(GOUT01recode = GOUT -1, T2D01recode = T2D -1, logBMI= log(BMI)) newpca <- read_delim(file = here(paste0(pop,"_results"),paste0(pop,"_pcafile.eigenvec")), delim = " ", col_names = c("FID","SUBJECT",paste0("PC",1:10)), col_types = paste0(c("c","c",rep("d", 10)),collapse = "")) if (!trait %in% names(all_dat)){ @@ -80,6 +80,9 @@ new_dat <- newpca %>% left_join(., all_dat, by = "SUBJECT") # Define list of variables for the models #### preds <- c("AGECOL", "SEX", paste0("PC",1:10)) +# subset the data to only include the variables needed to run the models and make it so that only individuals with complete observations are included +new_dat <- new_dat %>% select(SUBJECT, !!trait, all_of(preds)) %>% drop_na() + # generate models stepwise adding variables models <- list() for(pred_idx in seq_along(preds)){ @@ -125,7 +128,7 @@ model_remove <- map(model_results, list("model","na.action")) # write out residuals, columns: SUBJECT TRAIT model1 ... modelx new_dat %>% select(SUBJECT, !!trait) %>% # pull out ids and trait column - slice(-unlist(model_remove)) %>% # remove rows that didn't have residuals + #slice(-unlist(model_remove)) %>% # remove rows that didn't have residuals (step no longer needed due to step before model creation that only lets complete obs go into the model) bind_cols(model_residuals) %>% write_delim(file = here(opt$out_dir,paste0(pop,"_",trait,".residuals.txt")), delim = " ", From 45d9ac707aa477ac82056ac6531a4a7615446036 Mon Sep 17 00:00:00 2001 From: BenRangihuna <75346750+BenRangihuna@users.noreply.github.com> Date: Wed, 23 Jun 2021 10:09:58 +1200 Subject: [PATCH 08/23] Update whole_pop_pipeline.sh --- scripts/whole_pop_pipeline.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/whole_pop_pipeline.sh b/scripts/whole_pop_pipeline.sh index 8a1f43b..738fcea 100644 --- a/scripts/whole_pop_pipeline.sh +++ b/scripts/whole_pop_pipeline.sh @@ -92,7 +92,7 @@ do # need to join based on number fo models 2.2 - 2.8 was for 7 models software/plink1.9b6.10 --bfile data/data --keep data/${POP}.keep --make-bed --out tmp/${POP}_${TRAIT} # this should join using IID from fam and IID from the pheno (in pheno first and second cols are duplicated) includes entire fam in output so need to adjust accordingly for model numbers (-6 cols) - join -1 2 -2 1 -o auto -e "NA" <(sort -k1 tmp/${POP}_${TRAIT}.fam) <(sort -k1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt | cut -f2- ) > ${RESULTS}/${POP}_${TRAIT}.pheno + join -1 2 -2 1 -o auto -e "NA" <(sort -k2 tmp/${POP}_${TRAIT}.fam) <(sort -k1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt) > ${RESULTS}/${POP}_${TRAIT}.pheno # calculate the column indexes, offset by 2 (to account for FID and IID) from all of the residuals -> should be equal to $(seq 1 number_of_models) pheno_cols=$( head -1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt | awk '{ for(i=1;i<=NF-2;i++){print i}}' | sort -u) From 4a0c47ee2935b2f3f40546819736853a661108c0 Mon Sep 17 00:00:00 2001 From: BenRangihuna <75346750+BenRangihuna@users.noreply.github.com> Date: Wed, 23 Jun 2021 10:28:50 +1200 Subject: [PATCH 09/23] use separate pheno file per model --- scripts/run_gcta_whole_pop.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/run_gcta_whole_pop.sh b/scripts/run_gcta_whole_pop.sh index 44e069e..c32ef9a 100644 --- a/scripts/run_gcta_whole_pop.sh +++ b/scripts/run_gcta_whole_pop.sh @@ -13,7 +13,8 @@ TRAIT=$(echo $BFILE | cut -d'_' -f2) MODEL=$2 -PHENOCOL=$(echo $MODEL + 4 | bc) +# pulls out the from the start of the residuals - first 7 cols are: FID IID PID MID SEX AFF TRUEPHENOVALUE +PHENOCOL=$(echo $MODEL + 7 | bc) @@ -44,8 +45,8 @@ echo "****** $REGRESSION ******" #reml # $(seq 1 to how many trait/residual columns - -software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno ${RESULTS}/${BFILE}.pheno --mpheno ${PHENOCOL} --out ${RESULTS}/GCTA/${TRAIT}_${MODEL} --threads 4 +cut -d' ' -f1,2,${PHENOCOL} < ${RESULTS}/${BFILE}.pheno > ${RESULTS}/${BFILE}.pheno${MODEL} +software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno ${RESULTS}/${BFILE}.pheno${MODEL} --out ${RESULTS}/GCTA/${TRAIT}_${MODEL} --threads 4 From f985064821627897bd57a8e7ec7d638a7ff420c4 Mon Sep 17 00:00:00 2001 From: BenRangihuna <75346750+BenRangihuna@users.noreply.github.com> Date: Wed, 23 Jun 2021 10:32:34 +1200 Subject: [PATCH 10/23] use separate pheno for each model --- scripts/run_ldak_whole_pop.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/run_ldak_whole_pop.sh b/scripts/run_ldak_whole_pop.sh index 799a487..3c4ce50 100644 --- a/scripts/run_ldak_whole_pop.sh +++ b/scripts/run_ldak_whole_pop.sh @@ -14,14 +14,14 @@ TRAIT=$(echo $BFILE | cut -d'_' -f2) MODEL=$2 PHENOCOL=$(echo $MODEL + 4 | bc) # ignore the first 4 columns after FID and IID - +cut -d' ' -f1,2,${PHENOCOL} < ${RESULTS}/${BFILE}.pheno > ${RESULTS}/${BFILE}.pheno${MODEL}_ldak ####################### LDAK ### reml -software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}_${MODEL} --pheno ${RESULTS}/${BFILE}.pheno --mpheno ${PHENOCOL} --grm ${RESULTS}/${POP}_ldak_kinships +software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}_${MODEL} --pheno ${RESULTS}/${BFILE}.pheno${MODEL}_ldak --grm ${RESULTS}/${POP}_ldak_kinships From 3ee50ac03c8a0716199f2af8923e1c45260e1766 Mon Sep 17 00:00:00 2001 From: BenRangihuna <75346750+BenRangihuna@users.noreply.github.com> Date: Wed, 23 Jun 2021 10:33:35 +1200 Subject: [PATCH 11/23] use gcta suffix on pheno file --- scripts/run_gcta_whole_pop.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/run_gcta_whole_pop.sh b/scripts/run_gcta_whole_pop.sh index c32ef9a..e847510 100644 --- a/scripts/run_gcta_whole_pop.sh +++ b/scripts/run_gcta_whole_pop.sh @@ -15,7 +15,7 @@ MODEL=$2 # pulls out the from the start of the residuals - first 7 cols are: FID IID PID MID SEX AFF TRUEPHENOVALUE PHENOCOL=$(echo $MODEL + 7 | bc) - +cut -d' ' -f1,2,${PHENOCOL} < ${RESULTS}/${BFILE}.pheno > ${RESULTS}/${BFILE}.pheno${MODEL}_gcta @@ -45,8 +45,8 @@ echo "****** $REGRESSION ******" #reml # $(seq 1 to how many trait/residual columns -cut -d' ' -f1,2,${PHENOCOL} < ${RESULTS}/${BFILE}.pheno > ${RESULTS}/${BFILE}.pheno${MODEL} -software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno ${RESULTS}/${BFILE}.pheno${MODEL} --out ${RESULTS}/GCTA/${TRAIT}_${MODEL} --threads 4 + +software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno ${RESULTS}/${BFILE}.pheno${MODEL}_gcta --out ${RESULTS}/GCTA/${TRAIT}_${MODEL} --threads 4 From d2c3eb009c50d5a8c2b25726b29d3fbdc53d3158 Mon Sep 17 00:00:00 2001 From: BenRangihuna <75346750+BenRangihuna@users.noreply.github.com> Date: Wed, 23 Jun 2021 10:34:56 +1200 Subject: [PATCH 12/23] Update whole_pop_pipeline.sh --- scripts/whole_pop_pipeline.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/whole_pop_pipeline.sh b/scripts/whole_pop_pipeline.sh index 738fcea..662abfc 100644 --- a/scripts/whole_pop_pipeline.sh +++ b/scripts/whole_pop_pipeline.sh @@ -90,7 +90,7 @@ do # columns 8-12 are the FIVE residuals # height, egfr, serumurate, diabetes and gout in that order # need to join based on number fo models 2.2 - 2.8 was for 7 models - software/plink1.9b6.10 --bfile data/data --keep data/${POP}.keep --make-bed --out tmp/${POP}_${TRAIT} + software/plink1.9b6.10 --bfile data/data --keep data/${POP}.keep --make-bed --maf 0.01 --geno 0.05 --out tmp/${POP}_${TRAIT} # this should join using IID from fam and IID from the pheno (in pheno first and second cols are duplicated) includes entire fam in output so need to adjust accordingly for model numbers (-6 cols) join -1 2 -2 1 -o auto -e "NA" <(sort -k2 tmp/${POP}_${TRAIT}.fam) <(sort -k1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt) > ${RESULTS}/${POP}_${TRAIT}.pheno From ffeb7d8c9c5f2b0112dd1ce1e6c5b00b272069e8 Mon Sep 17 00:00:00 2001 From: Ben Rangihuna Date: Wed, 23 Jun 2021 10:36:40 +1200 Subject: [PATCH 13/23] extract results at the end --- scripts/whole_pop_pipeline.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/whole_pop_pipeline.sh b/scripts/whole_pop_pipeline.sh index 8a1f43b..802324c 100644 --- a/scripts/whole_pop_pipeline.sh +++ b/scripts/whole_pop_pipeline.sh @@ -104,9 +104,14 @@ do #### LDAK for each model of current trait parallel -j 16 'nice -n 10 bash scripts/run_ldak_whole_pop.sh {1} {2} ' ::: tmp/${POP}_${TRAIT}.fam ::: ${pheno_cols} - done < data/pop_trait_models.csv +WD=$(pwd) +cd ${RESULTS}/LDAK +# combine all LDAK h2 for pop into a single file +grep 'K1' *.h2 | cat <(echo FILE $(head -1 ${TRAIT}_1.h2 )) - > ${POP}_all_traits_LDAK_K1_h2.txt +# return to the orignal directory +cd $WD From 9d699101a33398705c317b8c22a93c1684444533 Mon Sep 17 00:00:00 2001 From: BenRangihuna <75346750+BenRangihuna@users.noreply.github.com> Date: Wed, 23 Jun 2021 10:51:07 +1200 Subject: [PATCH 14/23] extract from actual original data --- scripts/whole_pop_pipeline.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/whole_pop_pipeline.sh b/scripts/whole_pop_pipeline.sh index a05ab3e..2ae8e68 100644 --- a/scripts/whole_pop_pipeline.sh +++ b/scripts/whole_pop_pipeline.sh @@ -90,7 +90,7 @@ do # columns 8-12 are the FIVE residuals # height, egfr, serumurate, diabetes and gout in that order # need to join based on number fo models 2.2 - 2.8 was for 7 models - software/plink1.9b6.10 --bfile data/data --keep data/${POP}.keep --make-bed --maf 0.01 --geno 0.05 --out tmp/${POP}_${TRAIT} + software/plink1.9b6.10 --bfile ${ORIG_DATA} --keep data/${POP}.keep --make-bed --maf 0.01 --geno 0.05 --out tmp/${POP}_${TRAIT} # this should join using IID from fam and IID from the pheno (in pheno first and second cols are duplicated) includes entire fam in output so need to adjust accordingly for model numbers (-6 cols) join -1 2 -2 1 -o auto -e "NA" <(sort -k2 tmp/${POP}_${TRAIT}.fam) <(sort -k1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt) > ${RESULTS}/${POP}_${TRAIT}.pheno From 5f6cd93bf1f10b1fec71c0717b34949b641a2da3 Mon Sep 17 00:00:00 2001 From: Ben Rangihuna Date: Wed, 14 Jul 2021 11:06:46 +1200 Subject: [PATCH 15/23] new scripts for running models on pca selected populations --- ...DAK_GCTA_heritability_with_trait_values.sh | 34 ++++++++ scripts/make_pheno_and_covar_files.R | 72 +++++++++++++++++ scripts/pca_select_pops.R | 81 +++++++++++++++++++ 3 files changed, 187 insertions(+) create mode 100644 scripts/LDAK_GCTA_heritability_with_trait_values.sh create mode 100644 scripts/make_pheno_and_covar_files.R create mode 100644 scripts/pca_select_pops.R diff --git a/scripts/LDAK_GCTA_heritability_with_trait_values.sh b/scripts/LDAK_GCTA_heritability_with_trait_values.sh new file mode 100644 index 0000000..dcfb48f --- /dev/null +++ b/scripts/LDAK_GCTA_heritability_with_trait_values.sh @@ -0,0 +1,34 @@ + +POP=nphpca +RESULTS=${POP}_results +mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} +for covar in ls ${RESULTS}/*.covar +do + TRAIT=gout + MODEL=$(basename ${covar} .covar) + PREV=0.049 + mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} + software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar} + software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4 + +done > ${POP}_${TRAIT}.log + +for covar in ls ${RESULTS}/*.covar +do + TRAIT=t2d + MODEL=$(basename ${covar} .covar) + PREV=0.078 + mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} + software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar} + software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4 +done > ${POP}_${TRAIT}.log + + + + +#Code for transferring directories to local computer.. +# scp -r benrangihuna@biochemcompute1.uod.otago.ac.nz:/Volumes/scratch/merrimanlab/ben/genomic_predictions/{nph,euro,west,east}pca_results ~/Documents/genomic_prediction_results/ + + + + diff --git a/scripts/make_pheno_and_covar_files.R b/scripts/make_pheno_and_covar_files.R new file mode 100644 index 0000000..303c398 --- /dev/null +++ b/scripts/make_pheno_and_covar_files.R @@ -0,0 +1,72 @@ +library(tidyverse) +library(here) + +#CREBRF MASTER FILE (FROM TANYA) +tanya <- read_csv(file = here("data/tanya_data.csv"), col_names = TRUE) %>% + filter(SUBJECT != "Blank") + + + +nph <- read_delim(here("nphpca_results/nphpca_pcafile.eigenvec"), col_names = c("FID","IID",paste0("PCA",1:10)), delim = ' ') +east <- read_delim(here("eastpca_results/eastpca_pcafile.eigenvec"), col_names = c("FID","IID",paste0("PCA",1:10)), delim = ' ') +west <- read_delim(here("westpca_results/westpca_pcafile.eigenvec"), col_names = c("FID","IID",paste0("PCA",1:10)), delim = ' ') +euro <- read_delim(here("europca_results/europca_pcafile.eigenvec"), col_names = c("FID","IID",paste0("PCA",1:10)), delim = ' ') + +# pheno files + +all_IIDS <- c(nph$IID, east$IID, west$IID, euro$IID) + +tanya <- tanya %>% filter(SUBECT %in% all_IIDS) + +tanya %>% select(IID = SUBJECT, GOUT) %>% + mutate(GOUT_recode = case_when(str_detect(GOUT,"Control") ~ 1, + GOUT %in% c("ACR Gout","GP Gout") ~ 2, + TRUE ~ NA_real_ + ), + FID = IID) %>% select(FID, IID, GOUT = GOUT_recode) %>% + write_tsv(here("data/tanya_gout.pheno")) + + +tanya %>% select(IID = SUBJECT, TYPE2D) %>% + mutate(TYPE2D_recode = case_when(TYPE2D == "No" ~ 1, + TYPE2D == "Yes" ~ 2, + TRUE ~ NA_real_ + ), + FID = IID) %>% select(FID, IID, TYPE2D = TYPE2D_recode) %>% write_tsv(here("data/tanya_t2d.pheno")) + +# covar files +nph_covar <- tanya %>% mutate(FID = SUBJECT, IID = SUBJECT) %>% + select(FID,IID, AGECOL, SEX) %>% + right_join(nph) + +east_covar <- tanya %>% mutate(FID = SUBJECT, IID = SUBJECT) %>% + select(FID,IID, AGECOL, SEX) %>% + right_join(east) + +west_covar <- tanya %>% mutate(FID = SUBJECT, IID = SUBJECT) %>% + select(FID,IID, AGECOL, SEX) %>% + right_join(west) + +euro_covar <- tanya %>% mutate(FID = SUBJECT, IID = SUBJECT) %>% + select(FID,IID, AGECOL, SEX) %>% + right_join(euro) + +cols <- names(nph_covar)[-1:-2] +for(n in cols){ + p <- nph_covar %>% select(FID, IID:!!n) + fn <- paste(names(p)[-1:-2], collapse = "_") + print(fn) + write_tsv(p, here("nphpca_results/",paste0("nphpca_", fn,".covar"))) + + east_covar %>% select(FID, IID:!!n) %>% + write_tsv( here("eastpca_results/",paste0("eastpca_", fn,".covar"))) + + west_covar %>% select(FID, IID:!!n) %>% + write_tsv( here("westpca_results/",paste0("westpca_", fn,".covar"))) + + euro_covar %>% select(FID, IID:!!n) %>% + write_tsv( here("europca_results/",paste0("europca_", fn,".covar"))) + +} + + diff --git a/scripts/pca_select_pops.R b/scripts/pca_select_pops.R new file mode 100644 index 0000000..0964374 --- /dev/null +++ b/scripts/pca_select_pops.R @@ -0,0 +1,81 @@ +## 12 July 2021. + +## PCA based Pop selection. +library(tidyverse) +library(here) +library(patchwork) +#CREBRF MASTER FILE (FROM TANYA) +tanya <- read_csv(file = here("data/tanya_data.csv"), col_names = TRUE) # %>% mutate(GOUT01recode = GOUT -1, T2D01recode = T2D -1, logBMI= log(BMI)) + +orig <- read_csv(here("data/curated_data.csv")) + +east <- read_tsv(here("data/east597btanya.keep"), col_names = FALSE) %>% mutate(pop = "east") + +west <- read_tsv(here("data/west597btanya.keep"), col_names = FALSE) %>% mutate(pop = "west") + +euro <- read_tsv(here("data/euro597btanya.keep"), col_names = FALSE) %>% mutate(pop = "euro") + +nph <- read_tsv(here("data/nphtanya.keep"), col_names = FALSE) %>% mutate(pop ="nph") + +pops <- bind_rows(east, west, euro, nph) + +pops <- pops %>% left_join(orig %>% + select(SUBJECT, starts_with("PC")), + by = c("X2" = "SUBJECT")) %>% + drop_na() + + +# make plots using the original global coreExome PCs +theme_set(theme_bw()) +p1 <- pops %>% ggplot(aes(x = PCA2, y = PCA3, colour = pop)) + geom_point() + + ggtitle("Original") + + geom_hline(yintercept = 0.0025, colour = "red", linetype = "dashed") + + geom_vline(xintercept = 0, colour = 'red', linetype = "dashed") + + +new_nph <- pops %>% filter(pop == "nph" & PCA2 > 0 & PCA3 > 0.0025) %>% + select(X1,X2) %>% write_delim(file = here("data/nphpca.keep"), + delim = "\t", + col_names = FALSE) %>% mutate(pop = "nph") + +new_east <- pops %>% filter(pop == "east" & PCA2 > 0 & PCA3 > 0.0025) %>% + select(X1,X2) %>% write_delim(file = here("data/eastpca.keep"), + delim = "\t", + col_names = FALSE) %>% mutate(pop ="east") + +new_west <- pops %>% filter(pop == "west" & PCA2 > 0 & PCA3 < 0.0025) %>% + select(X1,X2) %>% write_delim(file = here("data/westpca.keep"), + delim = "\t", + col_names = FALSE) %>% mutate(pop = "west") + +new_euro <- pops %>% filter(pop == "euro" & PCA2 < 0) %>% + select(X1,X2) %>% write_delim(file = here("data/europca.keep"), + delim = "\t", + col_names = FALSE) %>% mutate(pop = "euro") + + + +new_pops <- bind_rows(new_east, new_west, new_euro, new_nph) + +new_pops <- new_pops %>% left_join(orig %>% + select(SUBJECT, starts_with("PC")), + by = c("X2" = "SUBJECT")) %>% + drop_na() + + +# make plots using the original global coreExome PCs + +p2 <- new_pops %>% ggplot(aes(x = PCA2, y = PCA3, colour = pop)) + + geom_point() + + ggtitle("After PCA Thresholds") + + geom_hline(yintercept = 0.0025, colour = "red", linetype = "dashed") + + geom_vline(xintercept = 0, colour = 'red', linetype = "dashed") + +p1 + p2 + patchwork::plot_layout(guides = "collect") + + +# make covar files +new_pops %>% left_join(tanya %>% select(X1 = SUBJECT, AGECOL, SEX), by = "X1") %>% + relocate(AGECOL, SEX, .after = X2) %>% rename(FID = X1, IID = X2) %>% + select(-pop) %>% write_tsv("data/tanya_covar.csv") + From cb360a5ca705b190099ff527fbb1bc29b7df1791 Mon Sep 17 00:00:00 2001 From: Ben Rangihuna Date: Wed, 14 Jul 2021 11:08:41 +1200 Subject: [PATCH 16/23] all pops --- ...DAK_GCTA_heritability_with_trait_values.sh | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/scripts/LDAK_GCTA_heritability_with_trait_values.sh b/scripts/LDAK_GCTA_heritability_with_trait_values.sh index dcfb48f..5fdde54 100644 --- a/scripts/LDAK_GCTA_heritability_with_trait_values.sh +++ b/scripts/LDAK_GCTA_heritability_with_trait_values.sh @@ -24,6 +24,85 @@ do done > ${POP}_${TRAIT}.log +## East + +POP=eastpca +RESULTS=${POP}_results +mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} +for covar in ls ${RESULTS}/*.covar +do + TRAIT=gout + MODEL=$(basename ${covar} .covar) + PREV=0.049 + mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} + software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar} + software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4 + +done > ${POP}_${TRAIT}.log + +for covar in ls ${RESULTS}/*.covar +do + TRAIT=t2d + MODEL=$(basename ${covar} .covar) + PREV=0.078 + mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} + software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar} + software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4 +done > ${POP}_${TRAIT}.log + +## West + +POP=westpca +RESULTS=${POP}_results +mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} +for covar in ls ${RESULTS}/*.covar +do + TRAIT=gout + MODEL=$(basename ${covar} .covar) + PREV=0.049 + mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} + software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar} + software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4 + +done > ${POP}_${TRAIT}.log + +for covar in ls ${RESULTS}/*.covar +do + TRAIT=t2d + MODEL=$(basename ${covar} .covar) + PREV=0.078 + mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} + software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar} + software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4 +done > ${POP}_${TRAIT}.log + + +## Euro + + +POP=europca +RESULTS=${POP}_results +mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} +for covar in ls ${RESULTS}/*.covar +do + TRAIT=gout + MODEL=$(basename ${covar} .covar) + PREV=0.049 + mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} + software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar} + software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4 + +done > ${POP}_${TRAIT}.log + +for covar in ls ${RESULTS}/*.covar +do + TRAIT=t2d + MODEL=$(basename ${covar} .covar) + PREV=0.078 + mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} + software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar} + software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4 +done > ${POP}_${TRAIT}.log #Code for transferring directories to local computer.. From f5981dab13b538403a7bc5ddee9edcca6c1e65de Mon Sep 17 00:00:00 2001 From: Ben Rangihuna Date: Wed, 14 Jul 2021 11:18:29 +1200 Subject: [PATCH 17/23] update prevalences --- ...DAK_GCTA_heritability_with_trait_values.sh | 34 ++++++++++--------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/scripts/LDAK_GCTA_heritability_with_trait_values.sh b/scripts/LDAK_GCTA_heritability_with_trait_values.sh index 5fdde54..d476f61 100644 --- a/scripts/LDAK_GCTA_heritability_with_trait_values.sh +++ b/scripts/LDAK_GCTA_heritability_with_trait_values.sh @@ -1,10 +1,11 @@ - +echo NPH POP=nphpca RESULTS=${POP}_results +TRAIT=gout mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} for covar in ls ${RESULTS}/*.covar do - TRAIT=gout + MODEL=$(basename ${covar} .covar) PREV=0.049 mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} @@ -13,9 +14,9 @@ do done > ${POP}_${TRAIT}.log +TRAIT=t2d for covar in ls ${RESULTS}/*.covar do - TRAIT=t2d MODEL=$(basename ${covar} .covar) PREV=0.078 mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} @@ -25,26 +26,26 @@ done > ${POP}_${TRAIT}.log ## East - +echo East POP=eastpca RESULTS=${POP}_results +TRAIT=gout mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} for covar in ls ${RESULTS}/*.covar do - TRAIT=gout MODEL=$(basename ${covar} .covar) - PREV=0.049 + PREV=0.043 mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar} software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4 done > ${POP}_${TRAIT}.log +TRAIT=t2d for covar in ls ${RESULTS}/*.covar do - TRAIT=t2d MODEL=$(basename ${covar} .covar) - PREV=0.078 + PREV=0.084 mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar} software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4 @@ -54,23 +55,24 @@ done > ${POP}_${TRAIT}.log POP=westpca RESULTS=${POP}_results +TRAIT=gout mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} for covar in ls ${RESULTS}/*.covar do - TRAIT=gout MODEL=$(basename ${covar} .covar) - PREV=0.049 + PREV=0.051 mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar} software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4 done > ${POP}_${TRAIT}.log +TRAIT=t2d for covar in ls ${RESULTS}/*.covar do - TRAIT=t2d + MODEL=$(basename ${covar} .covar) - PREV=0.078 + PREV=0.146 mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar} software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4 @@ -82,23 +84,23 @@ done > ${POP}_${TRAIT}.log POP=europca RESULTS=${POP}_results +TRAIT=gout mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} for covar in ls ${RESULTS}/*.covar do - TRAIT=gout MODEL=$(basename ${covar} .covar) - PREV=0.049 + PREV=0.024 mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar} software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4 done > ${POP}_${TRAIT}.log +TRAIT=t2d for covar in ls ${RESULTS}/*.covar do - TRAIT=t2d MODEL=$(basename ${covar} .covar) - PREV=0.078 + PREV=0.049 mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar} software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4 From 2c5a01ba9bcd053b232b84a466bf19b6f990a39a Mon Sep 17 00:00:00 2001 From: Ben Rangihuna Date: Thu, 22 Jul 2021 11:32:02 +1200 Subject: [PATCH 18/23] adding subshells ({}) into "for covar in.." lines of code --- ...DAK_GCTA_heritability_with_trait_values.sh | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/scripts/LDAK_GCTA_heritability_with_trait_values.sh b/scripts/LDAK_GCTA_heritability_with_trait_values.sh index d476f61..1614fb2 100644 --- a/scripts/LDAK_GCTA_heritability_with_trait_values.sh +++ b/scripts/LDAK_GCTA_heritability_with_trait_values.sh @@ -3,7 +3,7 @@ POP=nphpca RESULTS=${POP}_results TRAIT=gout mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} -for covar in ls ${RESULTS}/*.covar +for covar in {ls ${RESULTS}/*.covar} do MODEL=$(basename ${covar} .covar) @@ -15,7 +15,7 @@ do done > ${POP}_${TRAIT}.log TRAIT=t2d -for covar in ls ${RESULTS}/*.covar +for covar in {ls ${RESULTS}/*.covar} do MODEL=$(basename ${covar} .covar) PREV=0.078 @@ -31,7 +31,7 @@ POP=eastpca RESULTS=${POP}_results TRAIT=gout mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} -for covar in ls ${RESULTS}/*.covar +for covar in {ls ${RESULTS}/*.covar} do MODEL=$(basename ${covar} .covar) PREV=0.043 @@ -42,7 +42,7 @@ do done > ${POP}_${TRAIT}.log TRAIT=t2d -for covar in ls ${RESULTS}/*.covar +for covar in {ls ${RESULTS}/*.covar} do MODEL=$(basename ${covar} .covar) PREV=0.084 @@ -52,12 +52,12 @@ do done > ${POP}_${TRAIT}.log ## West - +echo WEST POP=westpca RESULTS=${POP}_results TRAIT=gout mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} -for covar in ls ${RESULTS}/*.covar +for covar in {ls ${RESULTS}/*.covar} do MODEL=$(basename ${covar} .covar) PREV=0.051 @@ -68,7 +68,7 @@ do done > ${POP}_${TRAIT}.log TRAIT=t2d -for covar in ls ${RESULTS}/*.covar +for covar in {ls ${RESULTS}/*.covar} do MODEL=$(basename ${covar} .covar) @@ -81,12 +81,12 @@ done > ${POP}_${TRAIT}.log ## Euro - +echo EURO POP=europca RESULTS=${POP}_results TRAIT=gout mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} -for covar in ls ${RESULTS}/*.covar +for covar in {ls ${RESULTS}/*.covar} do MODEL=$(basename ${covar} .covar) PREV=0.024 @@ -97,7 +97,7 @@ do done > ${POP}_${TRAIT}.log TRAIT=t2d -for covar in ls ${RESULTS}/*.covar +for covar in {ls ${RESULTS}/*.covar} do MODEL=$(basename ${covar} .covar) PREV=0.049 From 2735592ad71c0e0d2ad272df227021fa7ee50a69 Mon Sep 17 00:00:00 2001 From: Ben Rangihuna Date: Thu, 22 Jul 2021 16:42:29 +1200 Subject: [PATCH 19/23] remove {} from "for covar in.." lines --- .../LDAK_GCTA_heritability_with_trait_values.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/LDAK_GCTA_heritability_with_trait_values.sh b/scripts/LDAK_GCTA_heritability_with_trait_values.sh index 1614fb2..daeaa9c 100644 --- a/scripts/LDAK_GCTA_heritability_with_trait_values.sh +++ b/scripts/LDAK_GCTA_heritability_with_trait_values.sh @@ -3,7 +3,7 @@ POP=nphpca RESULTS=${POP}_results TRAIT=gout mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} -for covar in {ls ${RESULTS}/*.covar} +for covar in ls ${RESULTS}/*.covar do MODEL=$(basename ${covar} .covar) @@ -15,7 +15,7 @@ do done > ${POP}_${TRAIT}.log TRAIT=t2d -for covar in {ls ${RESULTS}/*.covar} +for covar in ls ${RESULTS}/*.covar do MODEL=$(basename ${covar} .covar) PREV=0.078 @@ -31,7 +31,7 @@ POP=eastpca RESULTS=${POP}_results TRAIT=gout mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} -for covar in {ls ${RESULTS}/*.covar} +for covar in ls ${RESULTS}/*.covar do MODEL=$(basename ${covar} .covar) PREV=0.043 @@ -42,7 +42,7 @@ do done > ${POP}_${TRAIT}.log TRAIT=t2d -for covar in {ls ${RESULTS}/*.covar} +for covar in ls ${RESULTS}/*.covar do MODEL=$(basename ${covar} .covar) PREV=0.084 @@ -57,7 +57,7 @@ POP=westpca RESULTS=${POP}_results TRAIT=gout mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} -for covar in {ls ${RESULTS}/*.covar} +for covar in ls ${RESULTS}/*.covar do MODEL=$(basename ${covar} .covar) PREV=0.051 @@ -68,7 +68,7 @@ do done > ${POP}_${TRAIT}.log TRAIT=t2d -for covar in {ls ${RESULTS}/*.covar} +for covar in ls ${RESULTS}/*.covar do MODEL=$(basename ${covar} .covar) @@ -86,7 +86,7 @@ POP=europca RESULTS=${POP}_results TRAIT=gout mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} -for covar in {ls ${RESULTS}/*.covar} +for covar in ls ${RESULTS}/*.covar do MODEL=$(basename ${covar} .covar) PREV=0.024 @@ -97,7 +97,7 @@ do done > ${POP}_${TRAIT}.log TRAIT=t2d -for covar in {ls ${RESULTS}/*.covar} +for covar in ls ${RESULTS}/*.covar do MODEL=$(basename ${covar} .covar) PREV=0.049 From 75dfe6b3fd392e92d27b7a29322e6dd0c9cd6451 Mon Sep 17 00:00:00 2001 From: Ben Rangihuna Date: Wed, 22 Sep 2021 12:12:56 +1200 Subject: [PATCH 20/23] updates to pipeline and supplimental scripts --- scripts/cv_pipeline.sh | 2 +- scripts/make_pheno_and_covar_files.R | 2 +- scripts/make_pop_summary_stats.R | 49 +++++++++ scripts/make_sex_specific_pop_files.R | 60 ++++++++++ scripts/model_residuals_whole_data.R | 0 scripts/pca_select_pops.R | 2 +- scripts/run_ldak_whole_pop.sh | 2 +- scripts/whole_pop_pipeline.sh | 6 +- scripts/whole_pop_pipeline_CETP_CREBRF.sh | 127 +++++++++++++++++++++ scripts/whole_pop_pipeline_no_polysnps.sh | 128 ++++++++++++++++++++++ 10 files changed, 371 insertions(+), 7 deletions(-) create mode 100644 scripts/make_pop_summary_stats.R create mode 100644 scripts/make_sex_specific_pop_files.R delete mode 100644 scripts/model_residuals_whole_data.R create mode 100644 scripts/whole_pop_pipeline_CETP_CREBRF.sh create mode 100644 scripts/whole_pop_pipeline_no_polysnps.sh diff --git a/scripts/cv_pipeline.sh b/scripts/cv_pipeline.sh index 253936f..d7b82ef 100644 --- a/scripts/cv_pipeline.sh +++ b/scripts/cv_pipeline.sh @@ -19,7 +19,7 @@ parallel 'bash scripts/generate_pop_pca.sh {}' ::: nph east west euro # create the cv splits and residuals for all pops and trait combos # **** traits must not contain underscores in their names **** -CV=5 # number fo folds for cross validation +CV=2 # number fo folds for cross validation for POP in nph east west euro do diff --git a/scripts/make_pheno_and_covar_files.R b/scripts/make_pheno_and_covar_files.R index 303c398..8768f03 100644 --- a/scripts/make_pheno_and_covar_files.R +++ b/scripts/make_pheno_and_covar_files.R @@ -16,7 +16,7 @@ euro <- read_delim(here("europca_results/europca_pcafile.eigenvec"), col_names = all_IIDS <- c(nph$IID, east$IID, west$IID, euro$IID) -tanya <- tanya %>% filter(SUBECT %in% all_IIDS) +tanya <- tanya %>% filter(SUBJECT %in% all_IIDS) tanya %>% select(IID = SUBJECT, GOUT) %>% mutate(GOUT_recode = case_when(str_detect(GOUT,"Control") ~ 1, diff --git a/scripts/make_pop_summary_stats.R b/scripts/make_pop_summary_stats.R new file mode 100644 index 0000000..0f7073a --- /dev/null +++ b/scripts/make_pop_summary_stats.R @@ -0,0 +1,49 @@ +## Remove Pukapuka from West Polynesian Pops + + +tanya <- read_csv(file = here("data/tanya_data.csv"), col_names = TRUE) %>% + filter(SUBJECT != "Blank") %>% + mutate(GOUT_orig_code = GOUT, GOUT = case_when(str_detect(GOUT,"Control") ~ 1, + GOUT %in% c("ACR Gout","GP Gout") ~ 2, + TRUE ~ NA_real_ + ), + T2D = case_when(TYPE2D == "No" ~ 1, + TYPE2D == "Yes" ~ 2, + TRUE ~ NA_real_ + )) + +## west_pca_tanya.keep is the same as the current westpca.keep file (same PATIENT IDs) +keep_files <- list.files("/Volumes/scratch/merrimanlab/ben/genomic_predictions/data/", pattern = "pca.keep", full.names = TRUE) + +pops <- map_dfr(keep_files, read_tsv, col_names = FALSE) +tanya_filtered <- tanya %>% filter(SUBJECT %in% pops$X1) + +tanya_no_puka <- tanya_filtered %>% mutate(Westkeep= ifelse(ETH_SPECIFIC == "Pukapukan",0,1)) + +tanya_no_puka2 <- filter(tanya_no_puka, Westkeep == 1) + +height <- tanya_no_puka2 %>% drop_na(HEIGHT) +BMI <- tanya_no_puka2 %>% drop_na(BMI) +HDL <- tanya_no_puka2 %>% drop_na(HDL) +T2D<- tanya_no_puka2 %>% drop_na(T2D) +GOUT <- tanya_no_puka2 %>% drop_na(GOUT) + +library(dplyr) + +# There are 2 Mixed Polys, these are NPH participants (NPH group wasnt run) +data.table::setDT(height)[,list(Mean=mean(HEIGHT), Max=max(HEIGHT), Min=min(HEIGHT), Mean=as.numeric(mean(HEIGHT)), Std=sd(HEIGHT)), by=ANALYSISGROUP_EASTWEST] +data.table::setDT(BMI)[,list(Mean=mean(BMI), Max=max(BMI), Min=min(BMI), Mean=as.numeric(mean(BMI)), Std=sd(BMI)), by=ANALYSISGROUP_EASTWEST] +data.table::setDT(HDL)[,list(Mean=mean(HDL), Max=max(HDL), Min=min(HDL), Median=as.numeric(median(HDL)), Std=sd(HDL)), by=ANALYSISGROUP_EASTWEST] + +table(height$ANALYSISGROUP_EASTWEST) +table(BMI$ANALYSISGROUP_EASTWEST) +table(HDL$ANALYSISGROUP_EASTWEST) +table(T2D$ANALYSISGROUP_EASTWEST) +table(height$ANALYSISGROUP_EASTWEST) +table(height$ANALYSISGROUP_EASTWEST) +table(tanya$ANALYSISGROUP_EASTWEST) + +# Gout and T2D tables +tanya_no_puka2 %>% group_by(T2D, ANALYSISGROUP_EASTWEST) %>% tally(sort = F) +tanya_no_puka2 %>% group_by(GOUT, ANALYSISGROUP_EASTWEST) %>% tally(sort = F) +tanya_no_puka2 %>% group_by(SEX, ANALYSISGROUP_EASTWEST) %>% tally(sort = F) diff --git a/scripts/make_sex_specific_pop_files.R b/scripts/make_sex_specific_pop_files.R new file mode 100644 index 0000000..ee21a9a --- /dev/null +++ b/scripts/make_sex_specific_pop_files.R @@ -0,0 +1,60 @@ +# Make sex specific Euro and West Poly keepfiles.. + +tanya <- read_csv(file = here("data/tanya_data.csv"), col_names = TRUE) %>% + filter(SUBJECT != "Blank") %>% + mutate(GOUT_orig_code = GOUT, GOUT = case_when(str_detect(GOUT,"Control") ~ 1, + GOUT %in% c("ACR Gout","GP Gout") ~ 2, + TRUE ~ NA_real_ + ), + T2D = case_when(TYPE2D == "No" ~ 1, + TYPE2D == "Yes" ~ 2, + TRUE ~ NA_real_ + )) + + +## Read in Euro and West (no Puka) keep files +westpca <- read.delim(file = "data/westnopukapca.keep", sep = "\t", header = F) +euro <- read.delim(file = "data/europca.keep", sep = "\t", header = F) + + + +## List of Sex specific IDs +tanya_men <- filter(tanya, SEX == 1) %>% select(PATIENT, ANALYSISGROUP_EASTWEST) +tanya_women <- filter(tanya, SEX == 2) %>% select(PATIENT, ANALYSISGROUP_EASTWEST) + +## Renaming column so that "euro" or "westpca" have matching columns. +names(tanya_men)[1] <- 'V1' +names(tanya_women)[1] <- 'V1' + + + +euro_men <- filter(tanya_men, ANALYSISGROUP_EASTWEST == "European") %>% select(V1) +euro_women <- filter(tanya_women, ANALYSISGROUP_EASTWEST == "European") %>% select(V1) + +west_men <- filter(tanya_men, ANALYSISGROUP_EASTWEST == "West Polynesian") %>% select(V1) +west_women <- filter(tanya_women, ANALYSISGROUP_EASTWEST == "West Polynesian") %>% select(V1) + + +## Make sex specific population files +euromale <- merge(euro,euro_men,all = F) +eurofemale <- merge(euro,euro_women, all = F) + +westmale <- merge(westpca,west_men, all = F) +westfemale <- merge(westpca, west_women, all = F) +sum(is.na(westfemale$V1)) +sum(is.na(westfemale$V2)) +sum(is.na(westmale$V1)) +sum(is.na(westmale$V2)) + +ifelse(westfemale$V1==westfemale$V2,"Yes","No") + +## make keep files. +write_delim(euromale, file = "/Volumes/scratch/merrimanlab/ben/genomic_predictions/data/euromalepca.keep", col_names = F, delim = "\t") + +write_delim(eurofemale, file = "/Volumes/scratch/merrimanlab/ben/genomic_predictions/data/eurofemalepca.keep", col_names = F, delim = "\t") + +write_delim(westmale, file = "/Volumes/scratch/merrimanlab/ben/genomic_predictions/data/westmalepca.keep", col_names = F, delim = "\t") + +write_delim(westfemale, file = "/Volumes/scratch/merrimanlab/ben/genomic_predictions/data/westfemalepca.keep", col_names = F, delim = "\t") + + diff --git a/scripts/model_residuals_whole_data.R b/scripts/model_residuals_whole_data.R deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/pca_select_pops.R b/scripts/pca_select_pops.R index 0964374..2ba5f73 100644 --- a/scripts/pca_select_pops.R +++ b/scripts/pca_select_pops.R @@ -32,7 +32,7 @@ p1 <- pops %>% ggplot(aes(x = PCA2, y = PCA3, colour = pop)) + geom_point() + geom_hline(yintercept = 0.0025, colour = "red", linetype = "dashed") + geom_vline(xintercept = 0, colour = 'red', linetype = "dashed") - +## 19 July 2021 Re-run NPH again new_nph <- pops %>% filter(pop == "nph" & PCA2 > 0 & PCA3 > 0.0025) %>% select(X1,X2) %>% write_delim(file = here("data/nphpca.keep"), delim = "\t", diff --git a/scripts/run_ldak_whole_pop.sh b/scripts/run_ldak_whole_pop.sh index 3c4ce50..96e881f 100644 --- a/scripts/run_ldak_whole_pop.sh +++ b/scripts/run_ldak_whole_pop.sh @@ -13,7 +13,7 @@ TRAIT=$(echo $BFILE | cut -d'_' -f2) MODEL=$2 -PHENOCOL=$(echo $MODEL + 4 | bc) # ignore the first 4 columns after FID and IID +PHENOCOL=$(echo $MODEL + 7 | bc) # ignore the first 7 columns: FID IID MID PID SEX AFF TRUEPHENOVALUE cut -d' ' -f1,2,${PHENOCOL} < ${RESULTS}/${BFILE}.pheno > ${RESULTS}/${BFILE}.pheno${MODEL}_ldak diff --git a/scripts/whole_pop_pipeline.sh b/scripts/whole_pop_pipeline.sh index 2ae8e68..8ef0c69 100644 --- a/scripts/whole_pop_pipeline.sh +++ b/scripts/whole_pop_pipeline.sh @@ -3,12 +3,12 @@ ## Run: -## bash scripts/whole_pop_pipeline.sh data/data {POP_NAME} +## bash scripts/whole_pop_pipeline.sh data/data_filtered {POP_NAME} ## - - +# remove duplicate markers and non-snps +# plink --bfile data --exclude remove_dup_markers.txt --snps-only just-acgt --make-bed --out data_filtered ORIG_DATA=$1 POP=$2 diff --git a/scripts/whole_pop_pipeline_CETP_CREBRF.sh b/scripts/whole_pop_pipeline_CETP_CREBRF.sh new file mode 100644 index 0000000..b04ac7c --- /dev/null +++ b/scripts/whole_pop_pipeline_CETP_CREBRF.sh @@ -0,0 +1,127 @@ +### whole_pop_pipeline.sh + +## Run: + + +## bash scripts/whole_pop_pipeline.sh data/data_filtered {POP_NAME} + +## + +# remove duplicate markers and non-snps +# plink --bfile data --exclude remove_dup_markers.txt --snps-only just-acgt --make-bed --out data_filtered + +ORIG_DATA=$1 +POP=$2 +RESULTS=${POP}_results/ + + + + +echo "START" + +date + + +#### Initial setup ##### +mkdir -p tmp ${RESULTS}/ + + +software/plink1.9b6.10 --bfile ${ORIG_DATA} --geno 0.05 --make-bed --out tmp/data_sorted + +software/plink1.9b6.10 --bfile tmp/data_sorted --missing-genotype . --merge data/PolySNPS --make-bed --out tmp/dataPolySNPS --allow-no-sex + +## sort and keep our population +software/plink1.9b6.10 --bfile tmp/dataPolySNPS --keep data/${POP}.keep --maf 0.01 --make-bed --out tmp/${POP}_sorteddata + + +# create list of population specific independent SNPs from data +software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --indep-pairwise 50 5 0.2 --maf 0.1 --out ${RESULTS}/${POP}_pca_markers + +#(dont' need to do) :make sure the snps of interest don't get filtered out later on +#echo RS373863828 >>${RESULTS}/${POP}_pca_markers.prune.in +#echo CETP_57004947 >> ${RESULTS}/${POP}_pca_markers.prune.in + +######################## GCTA +#grm + +# only run this line once per population to save time +software/gcta64 --bfile tmp/${POP}_sorteddata --autosome --make-grm --out ${RESULTS}/${POP}_gcta_grm --threads 4 + + + +######################## PCAs + +### calculate principal components for the population using the independent markers +software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --pca 10 --out ${RESULTS}/${POP}_pcafile --extract ${RESULTS}/${POP}_pca_markers.prune.in --allow-no-sex + + + + + +####################### LDAK + +echo "Start LDAK $(date)" +### calculate the weights and kinships for LDAK prior to GRM +software/LDAK/ldak5.linux --cut-weights ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata + + +software/LDAK/ldak5.linux --calc-weights-all ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata + + +software/LDAK/ldak5.linux --calc-kins-direct ${RESULTS}/${POP}_ldak_kinships --bfile tmp/${POP}_sorteddata --weights ${RESULTS}/${POP}_ldak_sections/weights.short --power -0.25 + +echo "Start traits $(date)" +# example line to use for testing one iteration +#line="HEIGHT,linear", +while read line +do + TRAIT=$(echo $line | cut -d',' -f1) + REGRESSION_TYPE=$(echo $line | cut -d',' -f2) + + echo "Start ${TRAIT} $(date)" + mkdir -p ${RESULTS}/{GCTA,LDAK} + ##### Generate regression residuals for each model ##### + # makes tmp/${POP}_${TRAIT}_residuals.txt + + #bash scripts/generate_models_whole_pop.sh ${POP} ${TRAIT} ${REGRESSION_TYPE} + Rscript scripts/model_residuals_whole_pop.R --trait ${TRAIT} --pop ${POP} --regression ${REGRESSION_TYPE} --out_dir ${RESULTS}/ + + ##### Create phone files with the residuals + ### prune and sort the data + # prune to only those with residuals + # residual are population specific with PCAs included in the model outside this script. + # combine residuals to fam file + # this step need to be repeated for each of the groups at bayesR section + # columns 8-12 are the FIVE residuals + # height, egfr, serumurate, diabetes and gout in that order + # need to join based on number fo models 2.2 - 2.8 was for 7 models + software/plink1.9b6.10 --bfile tmp/dataPolySNPS --keep data/${POP}.keep --make-bed --maf 0.01 --out tmp/${POP}_${TRAIT} + # this should join using IID from fam and IID from the pheno (in pheno first and second cols are duplicated) includes entire fam in output so need to adjust accordingly for model numbers (-6 cols) + join -1 2 -2 1 -o auto -e "NA" <(sort -k2 tmp/${POP}_${TRAIT}.fam) <(sort -k1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt) > ${RESULTS}/${POP}_${TRAIT}.pheno + + # calculate the column indexes, offset by 2 (to account for FID and IID) from all of the residuals -> should be equal to $(seq 1 number_of_models) + pheno_cols=$( head -1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt | awk '{ for(i=1;i<=NF-2;i++){print i}}' | sort -u) + + + #### GCTA REML for each model of current trait + # run the following with 16 parallel jobs + parallel -j 16 'nice -n 10 bash scripts/run_gcta_whole_pop.sh {1} {2}' ::: tmp/${POP}_${TRAIT}.fam ::: ${pheno_cols} + + #### LDAK for each model of current trait + parallel -j 16 'nice -n 10 bash scripts/run_ldak_whole_pop.sh {1} {2} ' ::: tmp/${POP}_${TRAIT}.fam ::: ${pheno_cols} +done < data/pop_trait_models.csv + +WD=$(pwd) +cd ${RESULTS}/LDAK +# combine all LDAK h2 for pop into a single file +grep 'K1' *.h2 | cat <(echo FILE $(head -1 ${TRAIT}_1.h2 )) - > ${POP}_all_traits_LDAK_K1_h2.txt +# return to the orignal directory +cd $WD + + + +echo "FINISH" + +date + + diff --git a/scripts/whole_pop_pipeline_no_polysnps.sh b/scripts/whole_pop_pipeline_no_polysnps.sh new file mode 100644 index 0000000..6cc87a7 --- /dev/null +++ b/scripts/whole_pop_pipeline_no_polysnps.sh @@ -0,0 +1,128 @@ +### whole_pop_pipeline.sh + +## Run: + + +## bash scripts/whole_pop_pipeline.sh data/data_filtered {POP_NAME} + +## + +# remove duplicate markers and non-snps +# plink --bfile data --exclude remove_dup_markers.txt --snps-only just-acgt --make-bed --out data_filtered + +ORIG_DATA=$1 +POP=$2 +RESULTS=${POP}_results/ + + + + +echo "START" + +date + + +#### Initial setup ##### +mkdir -p tmp ${RESULTS}/ + + +software/plink1.9b6.10 --bfile ${ORIG_DATA} --geno 0.05 --make-bed --out tmp/data_sorted + +# No need to add in Poly Snps right now. +# software/plink1.9b6.10 --bfile tmp/data_sorted --missing-genotype . --merge data/PolySNPS --make-bed --out tmp/dataPolySNPS --allow-no-sex + +## sort and keep our population +software/plink1.9b6.10 --bfile tmp/data_sorted --keep data/${POP}.keep --maf 0.01 --make-bed --out tmp/${POP}_sorteddata + + +# create list of population specific independent SNPs from data +software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --indep-pairwise 50 5 0.2 --maf 0.1 --out ${RESULTS}/${POP}_pca_markers + +#(dont' need to do) :make sure the snps of interest don't get filtered out later on +#echo RS373863828 >>${RESULTS}/${POP}_pca_markers.prune.in +#echo CETP_57004947 >> ${RESULTS}/${POP}_pca_markers.prune.in + +######################## GCTA +#grm + +# only run this line once per population to save time +software/gcta64 --bfile tmp/${POP}_sorteddata --autosome --make-grm --out ${RESULTS}/${POP}_gcta_grm --threads 4 + + + +######################## PCAs + +### calculate principal components for the population using the independent markers +software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --pca 10 --out ${RESULTS}/${POP}_pcafile --extract ${RESULTS}/${POP}_pca_markers.prune.in --allow-no-sex + + + + + +####################### LDAK + +echo "Start LDAK $(date)" +### calculate the weights and kinships for LDAK prior to GRM +software/LDAK/ldak5.linux --cut-weights ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata + + +software/LDAK/ldak5.linux --calc-weights-all ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata + + +software/LDAK/ldak5.linux --calc-kins-direct ${RESULTS}/${POP}_ldak_kinships --bfile tmp/${POP}_sorteddata --weights ${RESULTS}/${POP}_ldak_sections/weights.short --power -0.25 + +echo "Start traits $(date)" +# example line to use for testing one iteration +#line="HEIGHT,linear", +while read line +do + TRAIT=$(echo $line | cut -d',' -f1) + REGRESSION_TYPE=$(echo $line | cut -d',' -f2) + + echo "Start ${TRAIT} $(date)" + mkdir -p ${RESULTS}/{GCTA,LDAK} + ##### Generate regression residuals for each model ##### + # makes tmp/${POP}_${TRAIT}_residuals.txt + + #bash scripts/generate_models_whole_pop.sh ${POP} ${TRAIT} ${REGRESSION_TYPE} + Rscript scripts/model_residuals_whole_pop.R --trait ${TRAIT} --pop ${POP} --regression ${REGRESSION_TYPE} --out_dir ${RESULTS}/ + + ##### Create phone files with the residuals + ### prune and sort the data + # prune to only those with residuals + # residual are population specific with PCAs included in the model outside this script. + # combine residuals to fam file + # this step need to be repeated for each of the groups at bayesR section + # columns 8-12 are the FIVE residuals + # height, egfr, serumurate, diabetes and gout in that order + # need to join based on number fo models 2.2 - 2.8 was for 7 models + software/plink1.9b6.10 --bfile tmp/data_sorted --keep data/${POP}.keep --make-bed --maf 0.01 --out tmp/${POP}_${TRAIT} + # this should join using IID from fam and IID from the pheno (in pheno first and second cols are duplicated) includes entire fam in output so need to adjust accordingly for model numbers (-6 cols) + join -1 2 -2 1 -o auto -e "NA" <(sort -k2 tmp/${POP}_${TRAIT}.fam) <(sort -k1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt) > ${RESULTS}/${POP}_${TRAIT}.pheno + + # calculate the column indexes, offset by 2 (to account for FID and IID) from all of the residuals -> should be equal to $(seq 1 number_of_models) + pheno_cols=$( head -1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt | awk '{ for(i=1;i<=NF-2;i++){print i}}' | sort -u) + + + #### GCTA REML for each model of current trait + # run the following with 16 parallel jobs + parallel -j 16 'nice -n 10 bash scripts/run_gcta_whole_pop.sh {1} {2}' ::: tmp/${POP}_${TRAIT}.fam ::: ${pheno_cols} + + #### LDAK for each model of current trait + parallel -j 16 'nice -n 10 bash scripts/run_ldak_whole_pop.sh {1} {2} ' ::: tmp/${POP}_${TRAIT}.fam ::: ${pheno_cols} +done < data/pop_trait_models.csv + +WD=$(pwd) +cd ${RESULTS}/LDAK +# combine all LDAK h2 for pop into a single file +grep 'K1' *.h2 | cat <(echo FILE $(head -1 ${TRAIT}_1.h2 )) - > ${POP}_all_traits_LDAK_K1_h2.txt +# return to the orignal directory +cd $WD + + + +echo "FINISH" + +date + + From 764c564ce445755836ec56d281c44606964e90a9 Mon Sep 17 00:00:00 2001 From: BenRangihuna <75346750+BenRangihuna@users.noreply.github.com> Date: Mon, 11 Apr 2022 13:36:34 +1200 Subject: [PATCH 21/23] Update README.md Clarifying the use of the 'cv_pipeline.sh' script. --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 5fd5680..c89853c 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,7 @@ `scripts/` contains the scripts that are used for the calculation of genomic profile scores -The main workflow script is `cv_pipeline.sh` which will run each of the required steps. +The main workflow script is `cv_pipeline.sh` which will run each of the required steps. (This pipeline is used to estimate trait heritabilities for each Polynesian Population of interest (i.e not for the CV-trait profile score stages)). It starts with a plink formatted dataset containing all populations of interest, subsets out each population, and does cross-validation for each trait of interest in each population using both GCTA and LDAK. - `scripts/create_model_reports.R` will create reports for each pop/trait combo as listed in `data/pop_trait_models.csv` and apply a template RMarkdown to pull in and summarise the results from LDAK and GCTA. The template document is `scripts/model_selection_doc.Rmd`. From 56a1ab40707f071e2edf42519b99a402c9aae552 Mon Sep 17 00:00:00 2001 From: BenRangihuna <75346750+BenRangihuna@users.noreply.github.com> Date: Mon, 31 Jul 2023 12:18:46 +1200 Subject: [PATCH 22/23] Update model_residuals_whole_pop.R added additional package (to (hopefully) overcome issues in BASH). --- scripts/model_residuals_whole_pop.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/model_residuals_whole_pop.R b/scripts/model_residuals_whole_pop.R index ba19326..418925a 100644 --- a/scripts/model_residuals_whole_pop.R +++ b/scripts/model_residuals_whole_pop.R @@ -1,3 +1,7 @@ +if(!require(textshaping)){ + install.packages("here", repos = "https://cloud.r-project.org") + library("textshaping") +} if(!require(here)){ install.packages("here", repos = "https://cloud.r-project.org") library("here") From cccc9a77bb5dd2761c36e115cbd8c89975639383 Mon Sep 17 00:00:00 2001 From: BenRangihuna <75346750+BenRangihuna@users.noreply.github.com> Date: Mon, 31 Jul 2023 12:36:36 +1200 Subject: [PATCH 23/23] Update model_residuals_whole_pop.R Initial plan to install package "text shaping" to alleviate BASH issues did not work. So I have changed the code back to its original output. --- scripts/model_residuals_whole_pop.R | 4 ---- 1 file changed, 4 deletions(-) diff --git a/scripts/model_residuals_whole_pop.R b/scripts/model_residuals_whole_pop.R index 418925a..ba19326 100644 --- a/scripts/model_residuals_whole_pop.R +++ b/scripts/model_residuals_whole_pop.R @@ -1,7 +1,3 @@ -if(!require(textshaping)){ - install.packages("here", repos = "https://cloud.r-project.org") - library("textshaping") -} if(!require(here)){ install.packages("here", repos = "https://cloud.r-project.org") library("here")